diff -Naurp xen/acm/acm_simple_type_enforcement_hooks.c xen-redhat/acm/acm_simple_type_enforcement_hooks.c --- xen/acm/acm_simple_type_enforcement_hooks.c +++ xen-redhat/acm/acm_simple_type_enforcement_hooks.c @@ -203,10 +203,10 @@ ste_init_state(struct acm_sized_buffer * __func__, d->domain_id, ste_ssidref); /* a) check for event channel conflicts */ for (bucket = 0; bucket < NR_EVTCHN_BUCKETS; bucket++) { - spin_lock(&d->evtchn_lock); + spin_lock(&d->event_lock); ports = d->evtchn[bucket]; if (ports == NULL) { - spin_unlock(&d->evtchn_lock); + spin_unlock(&d->event_lock); break; } @@ -231,7 +231,7 @@ ste_init_state(struct acm_sized_buffer * printkd("%s: Policy violation in event channel domain " "%x -> domain %x.\n", __func__, d->domain_id, rdomid); - spin_unlock(&d->evtchn_lock); + spin_unlock(&d->event_lock); acm_array_append_tuple(errors, ACM_EVTCHN_SHARING_VIOLATION, @@ -239,7 +239,7 @@ ste_init_state(struct acm_sized_buffer * goto out; } } - spin_unlock(&d->evtchn_lock); + spin_unlock(&d->event_lock); } diff -Naurp xen/arch/ia64/asm-offsets.c xen-redhat/arch/ia64/asm-offsets.c --- xen/arch/ia64/asm-offsets.c +++ xen-redhat/arch/ia64/asm-offsets.c @@ -76,6 +76,7 @@ void foo(void) BLANK(); DEFINE(IA64_DOMAIN_SHADOW_BITMAP_OFFSET, offsetof (struct domain, arch.shadow_bitmap)); + DEFINE(IA64_DOMAIN_RID_BITS_OFFSET, offsetof (struct domain, arch.rid_bits)); BLANK(); diff -Naurp xen/arch/ia64/linux-xen/entry.S xen-redhat/arch/ia64/linux-xen/entry.S --- xen/arch/ia64/linux-xen/entry.S +++ xen-redhat/arch/ia64/linux-xen/entry.S @@ -905,7 +905,7 @@ GLOBAL_ENTRY(ia64_leave_kernel) ;; (pUStk) ssm psr.i (pUStk) br.call.sptk.many b0=do_softirq -(pUStk) ssm psr.i +(pUStk) rsm psr.i ;; (pUStk) br.call.sptk.many b0=reflect_event ;; diff -Naurp xen/arch/ia64/linux-xen/pal.S xen-redhat/arch/ia64/linux-xen/pal.S --- xen/arch/ia64/linux-xen/pal.S +++ xen-redhat/arch/ia64/linux-xen/pal.S @@ -21,11 +21,12 @@ pal_entry_point: .text /* - * Set the PAL entry point address. This could be written in C code, but we do it here - * to keep it all in one module (besides, it's so trivial that it's + * Set the PAL entry point address. This could be written in C code, but we + * do it here to keep it all in one module (besides, it's so trivial that it's * not a big deal). * - * in0 Address of the PAL entry point (text address, NOT a function descriptor). + * in0 Address of the PAL entry point (text address, NOT a function + * descriptor). */ GLOBAL_ENTRY(ia64_pal_handler_init) alloc r3=ar.pfs,1,0,0,0 @@ -36,9 +37,9 @@ GLOBAL_ENTRY(ia64_pal_handler_init) END(ia64_pal_handler_init) /* - * Default PAL call handler. This needs to be coded in assembly because it uses - * the static calling convention, i.e., the RSE may not be used and calls are - * done via "br.cond" (not "br.call"). + * Default PAL call handler. This needs to be coded in assembly because it + * uses the static calling convention, i.e., the RSE may not be used and + * calls are done via "br.cond" (not "br.call"). 
*/ GLOBAL_ENTRY(ia64_pal_default_handler) mov r8=-1 @@ -50,12 +51,10 @@ END(ia64_pal_default_handler) * * in0 Index of PAL service * in1 - in3 Remaining PAL arguments - * in4 1 ==> clear psr.ic, 0 ==> don't clear psr.ic - * */ GLOBAL_ENTRY(ia64_pal_call_static) - .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5) - alloc loc1 = ar.pfs,5,5,0,0 + .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(4) + alloc loc1 = ar.pfs,4,5,0,0 movl loc2 = pal_entry_point 1: { mov r28 = in0 @@ -64,7 +63,6 @@ GLOBAL_ENTRY(ia64_pal_call_static) } ;; ld8 loc2 = [loc2] // loc2 <- entry point - tbit.nz p6,p7 = in4, 0 adds r8 = 1f-1b,r8 mov loc4=ar.rsc // save RSE configuration ;; @@ -74,13 +72,11 @@ GLOBAL_ENTRY(ia64_pal_call_static) .body mov r30 = in2 -(p6) rsm psr.i | psr.ic mov r31 = in3 mov b7 = loc2 -(p7) rsm psr.i + rsm psr.i ;; -(p6) srlz.i mov rp = r8 br.cond.sptk.many b7 1: mov psr.l = loc3 @@ -96,8 +92,8 @@ END(ia64_pal_call_static) * Make a PAL call using the stacked registers calling convention. * * Inputs: - * in0 Index of PAL service - * in2 - in3 Remaning PAL arguments + * in0 Index of PAL service + * in2 - in3 Remaining PAL arguments */ GLOBAL_ENTRY(ia64_pal_call_stacked) .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(4) @@ -131,18 +127,18 @@ END(ia64_pal_call_stacked) * Make a physical mode PAL call using the static registers calling convention. * * Inputs: - * in0 Index of PAL service - * in2 - in3 Remaning PAL arguments + * in0 Index of PAL service + * in2 - in3 Remaining PAL arguments * * PSR_LP, PSR_TB, PSR_ID, PSR_DA are never set by the kernel. * So we don't need to clear them. */ -#define PAL_PSR_BITS_TO_CLEAR \ - (IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT | IA64_PSR_DB | IA64_PSR_RT | \ - IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED | \ +#define PAL_PSR_BITS_TO_CLEAR \ + (IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT | IA64_PSR_DB | IA64_PSR_RT |\ + IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED | \ IA64_PSR_DFL | IA64_PSR_DFH) -#define PAL_PSR_BITS_TO_SET \ +#define PAL_PSR_BITS_TO_SET \ (IA64_PSR_BN) @@ -182,7 +178,7 @@ GLOBAL_ENTRY(ia64_pal_call_phys_static) ;; andcm r16=loc3,r16 // removes bits to clear from psr br.call.sptk.many rp=ia64_switch_mode_phys -.ret1: mov rp = r8 // install return address (physical) + mov rp = r8 // install return address (physical) mov loc5 = r19 mov loc6 = r20 br.cond.sptk.many b7 @@ -192,7 +188,6 @@ GLOBAL_ENTRY(ia64_pal_call_phys_static) mov r19=loc5 mov r20=loc6 br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode -.ret2: mov psr.l = loc3 // restore init PSR mov ar.pfs = loc1 @@ -207,8 +202,8 @@ END(ia64_pal_call_phys_static) * Make a PAL call using the stacked registers in physical mode. 
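Note on the ia64_pal_call_static() hunk above: dropping the old in4 "clear psr.ic" flag means the stub now takes four arguments and always masks interrupts itself (rsm psr.i) before branching to PAL. A minimal C-side sketch of the matching declaration, assuming the usual Linux ia64_pal_retval convention (status and values come back in r8-r11); this is illustrative and not part of the patch:

    /* Illustrative declaration only -- not taken from this patch. */
    struct ia64_pal_retval {
        s64 status;        /* r8: 0 on success, negative PAL status on error */
        u64 v0, v1, v2;    /* r9-r11: call-specific return values */
    };

    extern struct ia64_pal_retval ia64_pal_call_static(u64 index, u64 arg1,
                                                       u64 arg2, u64 arg3);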
* * Inputs: - * in0 Index of PAL service - * in2 - in3 Remaning PAL arguments + * in0 Index of PAL service + * in2 - in3 Remaining PAL arguments */ GLOBAL_ENTRY(ia64_pal_call_phys_stacked) .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5) @@ -216,17 +211,12 @@ GLOBAL_ENTRY(ia64_pal_call_phys_stacked) movl loc2 = pal_entry_point 1: { mov r28 = in0 // copy procedure index - mov loc0 = rp // save rp + mov loc0 = rp // save rp } .body ;; ld8 loc2 = [loc2] // loc2 <- entry point - mov out0 = in0 // first argument - mov out1 = in1 // copy arg2 - mov out2 = in2 // copy arg3 - mov out3 = in3 // copy arg3 - ;; - mov loc3 = psr // save psr + mov loc3 = psr // save psr ;; mov loc4=ar.rsc // save RSE configuration #ifdef XEN @@ -244,18 +234,23 @@ GLOBAL_ENTRY(ia64_pal_call_phys_stacked) ;; andcm r16=loc3,r16 // removes bits to clear from psr br.call.sptk.many rp=ia64_switch_mode_phys -.ret6: + + mov out0 = in0 // first argument + mov out1 = in1 // copy arg2 + mov out2 = in2 // copy arg3 + mov out3 = in3 // copy arg3 mov loc5 = r19 mov loc6 = r20 + br.call.sptk.many rp=b7 // now make the call -.ret7: + mov ar.rsc=0 // put RSE in enforced lazy, LE mode mov r16=loc3 // r16= original psr mov r19=loc5 mov r20=loc6 - br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode + br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode -.ret8: mov psr.l = loc3 // restore init PSR + mov psr.l = loc3 // restore init PSR mov ar.pfs = loc1 mov rp = loc0 ;; @@ -265,10 +260,11 @@ GLOBAL_ENTRY(ia64_pal_call_phys_stacked) END(ia64_pal_call_phys_stacked) /* - * Save scratch fp scratch regs which aren't saved in pt_regs already (fp10-fp15). + * Save scratch fp scratch regs which aren't saved in pt_regs already + * (fp10-fp15). * - * NOTE: We need to do this since firmware (SAL and PAL) may use any of the scratch - * regs fp-low partition. + * NOTE: We need to do this since firmware (SAL and PAL) may use any of the + * scratch regs fp-low partition. 
* * Inputs: * in0 Address of stack storage for fp regs diff -Naurp xen/arch/ia64/linux-xen/README.origin xen-redhat/arch/ia64/linux-xen/README.origin --- xen/arch/ia64/linux-xen/README.origin +++ xen-redhat/arch/ia64/linux-xen/README.origin @@ -18,7 +18,6 @@ minstate.h -> linux/arch/ia64/kernel/mi mm_contig.c -> linux/arch/ia64/mm/contig.c mm_numa.c -> linux/arch/ia64/mm/numa.c numa.c -> linux/arch/ia64/kernel/numa.c -pal.S -> linux/arch/ia64/kernel/pal.S process-linux-xen.c -> linux/arch/ia64/kernel/process.c sal.c -> linux/arch/ia64/kernel/sal.c setup.c -> linux/arch/ia64/kernel/setup.c @@ -42,3 +41,5 @@ perfmon_generic.h -> linux/arch/kernel/p perfmon_itanium.h -> linux/arch/kernel/perfmon_itanium.h perfmon_mckinley.h -> linux/arch/kernel/perfmon_mckinley.h perfmon_montecito.h -> linux/arch/kernel/perfmon_montecito.h +# The files below are from Linux-2.6.21 +pal.S -> linux/arch/ia64/kernel/pal.S diff -Naurp xen/arch/ia64/linux-xen/setup.c xen-redhat/arch/ia64/linux-xen/setup.c --- xen/arch/ia64/linux-xen/setup.c +++ xen-redhat/arch/ia64/linux-xen/setup.c @@ -368,16 +368,21 @@ acpi_oem_console_setup(void) * Tiger 2: SR870BH2 * Tiger 4: SR870BN4 */ - if (strncmp(hdr->oem_id, "INTEL", 5) || - (!strncmp(hdr->oem_table_id, "SR870BH2", 8) && - !strncmp(hdr->oem_table_id, "SR870BN4", 8))) - return -ENODEV; - - ns16550_com1.baud = BAUD_AUTO; - ns16550_com1.io_base = 0x2f8; - ns16550_com1.irq = 3; - - return 0; + if (!strncmp(hdr->oem_id, "INTEL", 5)) { + if (!strncmp(hdr->oem_table_id, "SR870BH2", 8) || + !strncmp(hdr->oem_table_id, "SR870BN4", 8)) { + ns16550_com1.baud = BAUD_AUTO; + ns16550_com1.io_base = 0x2f8; + ns16550_com1.irq = 3; + return 0; + } else { + ns16550_com1.baud = BAUD_AUTO; + ns16550_com1.io_base = 0x3f8; + ns16550_com1.irq = ns16550_com1_gsi = 4; + return 0; + } + } + return -ENODEV; } #endif @@ -873,7 +878,7 @@ cpu_init (void) cpu_data = per_cpu_init(); #ifdef XEN - printk("cpu_init: current=%p\n", current); + printk(XENLOG_DEBUG "cpu_init: current=%p\n", current); #endif /* diff -Naurp xen/arch/ia64/linux-xen/smp.c xen-redhat/arch/ia64/linux-xen/smp.c --- xen/arch/ia64/linux-xen/smp.c +++ xen-redhat/arch/ia64/linux-xen/smp.c @@ -122,9 +122,7 @@ stop_this_cpu (void) cpu_clear(smp_processor_id(), cpu_online_map); max_xtp(); local_irq_disable(); -#ifndef XEN cpu_halt(); -#endif } void @@ -132,9 +130,7 @@ cpu_die(void) { max_xtp(); local_irq_disable(); -#ifndef XEN cpu_halt(); -#endif /* Should never be here */ BUG(); for (;;); diff -Naurp xen/arch/ia64/linux-xen/sn/kernel/irq.c xen-redhat/arch/ia64/linux-xen/sn/kernel/irq.c --- xen/arch/ia64/linux-xen/sn/kernel/irq.c +++ xen-redhat/arch/ia64/linux-xen/sn/kernel/irq.c @@ -12,7 +12,7 @@ #include <linux/spinlock.h> #include <linux/init.h> #ifdef XEN -#include <linux/pci.h> +#include <linux/linux-pci.h> #include <asm/hw_irq.h> #endif #include <asm/sn/addrs.h> diff -Naurp xen/arch/ia64/vmx/Makefile xen-redhat/arch/ia64/vmx/Makefile --- xen/arch/ia64/vmx/Makefile +++ xen-redhat/arch/ia64/vmx/Makefile @@ -18,3 +18,4 @@ obj-y += vmx_virt.o obj-y += vmx_vsa.o obj-y += vtlb.o obj-y += optvfault.o +obj-y += vacpi.o diff -Naurp xen/arch/ia64/vmx/mmio.c xen-redhat/arch/ia64/vmx/mmio.c --- xen/arch/ia64/vmx/mmio.c +++ xen-redhat/arch/ia64/vmx/mmio.c @@ -37,6 +37,7 @@ #include <xen/domain.h> #include <asm/viosapic.h> #include <asm/vlsapic.h> +#include <asm/hvm/vacpi.h> #define HVM_BUFFERED_IO_RANGE_NR 1 @@ -214,6 +215,9 @@ static void legacy_io_access(VCPU *vcpu, if (vmx_ide_pio_intercept(p, val)) return; + if (IS_ACPI_ADDR(p->addr) && 
vacpi_intercept(p, val)) + return; + vmx_send_assist_req(v); if(dir==IOREQ_READ){ //read *val=p->data; diff -Naurp xen/arch/ia64/vmx/optvfault.S xen-redhat/arch/ia64/vmx/optvfault.S --- xen/arch/ia64/vmx/optvfault.S +++ xen-redhat/arch/ia64/vmx/optvfault.S @@ -7,6 +7,8 @@ */ #include <linux/config.h> +#include <asm/config.h> +#include <asm/pgtable.h> #include <asm/asmmacro.h> #include <asm/kregs.h> #include <asm/offsets.h> @@ -16,6 +18,7 @@ #include <asm/vmx_pal_vsa.h> #include <asm/asm-offsets.h> #include <asm-ia64/vmx_mm_def.h> +#include <asm/virt_event.h> #define ACCE_MOV_FROM_AR #define ACCE_MOV_FROM_RR @@ -25,6 +28,94 @@ #define ACCE_MOV_TO_PSR #define ACCE_THASH +// Inputs are: r21 (= current), r24 (= cause), r25 (= insn), r31 (=saved pr) + +ENTRY(vmx_dummy_function) + br.sptk.many vmx_dummy_function +END(vmx_dummy_function) + +/* + * Inputs: + * r24 : return address + * r25 : vpd + * r29 : scratch + * + */ +GLOBAL_ENTRY(vmx_vps_sync_read) + movl r29 = vmx_dummy_function + ;; + mov b0=r29 + br.sptk.many b0 +END(vmx_vps_sync_read) + +/* + * Inputs: + * r24 : return address + * r25 : vpd + * r29 : scratch + * + */ +GLOBAL_ENTRY(vmx_vps_sync_write) + movl r29 = vmx_dummy_function + ;; + mov b0=r29 + br.sptk.many b0 +END(vmx_vps_sync_write) + +/* + * Inputs: + * r23 : pr + * r24 : guest b0 + * r25 : vpd + * + */ +GLOBAL_ENTRY(vmx_vps_resume_normal) + movl r29 = vmx_dummy_function + ;; + mov b0=r29 + mov pr=r23,-2 + br.sptk.many b0 +END(vmx_vps_resume_normal) + +#define VMX_VPS_SYNC_READ \ + add r16=IA64_VPD_BASE_OFFSET,r21; \ + mov r17 = b0; \ + mov r18 = r24; \ + mov r19 = r25; \ + mov r20 = r31; \ + ;; \ + movl r24 = 1f; \ + ld8 r16 = [r16]; \ + ;; \ + mov r25 =r16; \ + br.sptk.many vmx_vps_sync_read; \ +1: \ + mov b0 = r17; \ + mov r24 = r18; \ + mov r25 = r19; \ + mov r31 = r20 + + +/* + * Inputs: + * r23 : pr + * r24 : guest b0 + * r25 : vpd + * r17 : isr + */ +GLOBAL_ENTRY(vmx_vps_resume_handler) + movl r29 = vmx_dummy_function + ;; + ld8 r26=[r25] + shr r17=r17,IA64_ISR_IR_BIT + ;; + dep r26=r17,r26,63,1 // bit 63 of r26 indicate whether enable CFLE + mov b0=r29 + mov pr=r23,-2 + br.sptk.many b0 +END(vmx_vps_resume_handler) + + //mov r1=ar3 GLOBAL_ENTRY(vmx_asm_mov_from_ar) #ifndef ACCE_MOV_FROM_AR @@ -42,7 +133,7 @@ GLOBAL_ENTRY(vmx_asm_mov_from_ar) add r19=r19,r18 movl r20=asm_mov_to_reg ;; - adds r30=vmx_resume_to_guest-asm_mov_to_reg,r20 + adds r30=vmx_resume_to_guest2-asm_mov_to_reg,r20 shladd r17=r17,4,r20 cmp.gtu p6,p0=r16,r19 ;; @@ -71,7 +162,7 @@ GLOBAL_ENTRY(vmx_asm_mov_from_rr) br.many b0 ;; vmx_asm_mov_from_rr_back_1: - adds r30=vmx_resume_to_guest-asm_mov_from_reg,r20 + adds r30=vmx_resume_to_guest2-asm_mov_from_reg,r20 adds r22=asm_mov_to_reg-asm_mov_from_reg,r20 shr.u r26=r19,61 ;; @@ -89,13 +180,16 @@ GLOBAL_ENTRY(vmx_asm_mov_to_rr) #ifndef ACCE_MOV_TO_RR br.many vmx_virtualization_fault_back #endif - extr.u r16=r25,20,7 - extr.u r17=r25,13,7 + add r22=IA64_VCPU_DOMAIN_OFFSET,r21 + extr.u r16=r25,20,7 // r3 + extr.u r17=r25,13,7 // r2 + ;; + ld8 r22=[r22] // Get domain movl r20=asm_mov_from_reg ;; adds r30=vmx_asm_mov_to_rr_back_1-asm_mov_from_reg,r20 - shladd r16=r16,4,r20 - mov r22=b0 + shladd r16=r16,4,r20 // get r3 + mov r18=b0 // save b0 ;; add r27=VCPU_VRR0_OFS,r21 mov b0=r16 @@ -103,47 +197,56 @@ GLOBAL_ENTRY(vmx_asm_mov_to_rr) ;; vmx_asm_mov_to_rr_back_1: adds r30=vmx_asm_mov_to_rr_back_2-asm_mov_from_reg,r20 - shr.u r23=r19,61 - shladd r17=r17,4,r20 + shr.u r23=r19,61 // get RR # + shladd r17=r17,4,r20 // get r2 ;; //if rr7, go back cmp.eq p6,p0=7,r23 - 
mov b0=r22 + mov b0=r18 // restore b0 (p6) br.cond.dpnt.many vmx_virtualization_fault_back ;; - mov r28=r19 + mov r28=r19 // save r3 mov b0=r17 br.many b0 vmx_asm_mov_to_rr_back_2: - adds r30=vmx_resume_to_guest-asm_mov_from_reg,r20 - shladd r27=r23,3,r27 - ;; // +starting_rid - st8 [r27]=r19 - mov b0=r30 + adds r30=vmx_resume_to_guest2-asm_mov_from_reg,r20 + shladd r27=r23,3,r27 // address of VRR + add r22=IA64_DOMAIN_RID_BITS_OFFSET,r22 ;; + ld1 r22=[r22] // Load rid_bits from domain + mov b0=r18 // restore b0 adds r16=IA64_VCPU_STARTING_RID_OFFSET,r21 ;; - ld4 r16=[r16] + ld4 r16=[r16] // load starting_rid + extr.u r17=r19,8,24 // Extract RID ;; + shr r17=r17,r22 // Shift out used bits shl r16=r16,8 ;; - add r19=r19,r16 + add r20=r19,r16 + cmp.ne p6,p0=0,r17 // If reserved RID bits are set, use C fall back. + (p6) br.cond.dpnt.many vmx_virtualization_fault_back ;; //mangling rid 1 and 3 - extr.u r16=r19,8,8 - extr.u r17=r19,24,8 - extr.u r18=r19,2,6 + extr.u r16=r20,8,8 + extr.u r17=r20,24,8 + mov r24=r18 // saved b0 for resume ;; - dep r19=r16,r19,24,8 + extr.u r18=r20,2,6 // page size + dep r20=r16,r20,24,8 + mov b0=r30 ;; - dep r19=r17,r19,8,8 + dep r20=r17,r20,8,8 ;; //set ve 1 - dep r19=-1,r19,0,1 - cmp.lt p6,p0=14,r18 + dep r20=-1,r20,0,1 + // If ps > PAGE_SHIFT, use PAGE_SHIFT + cmp.lt p6,p0=PAGE_SHIFT,r18 ;; - (p6) mov r18=14 + (p6) mov r18=PAGE_SHIFT ;; - (p6) dep r19=r18,r19,2,6 + (p6) dep r20=r18,r20,2,6 ;; + st8 [r27]=r19 // Write to vrr. + // Write to sav_rr if rr=0 or rr=4. cmp.eq p6,p0=0,r23 ;; cmp.eq.or p6,p0=4,r23 @@ -155,11 +258,10 @@ vmx_asm_mov_to_rr_back_2: cmp.eq p7,p0=r0,r0 (p6) shladd r17=r23,1,r17 ;; - (p6) st8 [r17]=r19 + (p6) st8 [r17]=r20 (p6) tbit.nz p6,p7=r16,0 ;; - (p7) mov rr[r28]=r19 - mov r24=r22 + (p7) mov rr[r28]=r20 br.many b0 END(vmx_asm_mov_to_rr) @@ -169,11 +271,11 @@ GLOBAL_ENTRY(vmx_asm_rsm) #ifndef ACCE_RSM br.many vmx_virtualization_fault_back #endif - add r16=IA64_VPD_BASE_OFFSET,r21 + VMX_VPS_SYNC_READ + ;; extr.u r26=r25,6,21 extr.u r27=r25,31,2 ;; - ld8 r16=[r16] extr.u r28=r25,36,1 dep r26=r27,r26,21,2 ;; @@ -231,11 +333,11 @@ GLOBAL_ENTRY(vmx_asm_ssm) #ifndef ACCE_SSM br.many vmx_virtualization_fault_back #endif - add r16=IA64_VPD_BASE_OFFSET,r21 + VMX_VPS_SYNC_READ + ;; extr.u r26=r25,6,21 extr.u r27=r25,31,2 ;; - ld8 r16=[r16] extr.u r28=r25,36,1 dep r26=r27,r26,21,2 ;; //r26 is imm24 @@ -305,10 +407,9 @@ GLOBAL_ENTRY(vmx_asm_mov_to_psr) #ifndef ACCE_MOV_TO_PSR br.many vmx_virtualization_fault_back #endif - add r16=IA64_VPD_BASE_OFFSET,r21 - extr.u r26=r25,13,7 //r2 + VMX_VPS_SYNC_READ ;; - ld8 r16=[r16] + extr.u r26=r25,13,7 //r2 movl r20=asm_mov_from_reg ;; adds r30=vmx_asm_mov_to_psr_back-asm_mov_from_reg,r20 @@ -403,7 +504,18 @@ END(vmx_asm_mov_to_psr) ENTRY(vmx_asm_dispatch_vexirq) //increment iip + mov r17 = b0 + mov r18 = r31 + add r25=IA64_VPD_BASE_OFFSET,r21; + movl r24 =1f + ;; + ld8 r25 = [r25] + br.sptk.many vmx_vps_sync_write +1: + mov b0 =r17 mov r16=cr.ipsr + mov r31 = r18 + mov r19 = 37 ;; extr.u r17=r16,IA64_PSR_RI_BIT,2 tbit.nz p6,p7=r16,IA64_PSR_RI_BIT+1 @@ -420,7 +532,7 @@ ENTRY(vmx_asm_dispatch_vexirq) br.many vmx_dispatch_vexirq END(vmx_asm_dispatch_vexirq) -// thash +// thash r1=r3 // TODO: add support when pta.vf = 1 GLOBAL_ENTRY(vmx_asm_thash) #ifndef ACCE_THASH @@ -433,8 +545,7 @@ GLOBAL_ENTRY(vmx_asm_thash) adds r30=vmx_asm_thash_back1-asm_mov_from_reg,r20 shladd r17=r17,4,r20 // get addr of MOVE_FROM_REG(r17) adds r16=IA64_VPD_BASE_OFFSET,r21 // get vcpu.arch.priveregs - ;; - mov r24=b0 + mov r24=b0 // save 
b0 ;; ld8 r16=[r16] // get VPD addr mov b0=r17 @@ -442,20 +553,25 @@ GLOBAL_ENTRY(vmx_asm_thash) ;; vmx_asm_thash_back1: shr.u r23=r19,61 // get RR number - adds r25=VCPU_VRR0_OFS,r21 // get vcpu->arch.arch_vmx.vrr[0]'s addr + adds r28=VCPU_VRR0_OFS,r21 // get vcpu->arch.arch_vmx.vrr[0]'s addr adds r16=IA64_VPD_VPTA_OFFSET,r16 // get vpta ;; - shladd r27=r23,3,r25 // get vcpu->arch.arch_vmx.vrr[r23]'s addr + shladd r27=r23,3,r28 // get vcpu->arch.arch_vmx.vrr[r23]'s addr ld8 r17=[r16] // get PTA mov r26=1 ;; extr.u r29=r17,2,6 // get pta.size - ld8 r25=[r27] // get vcpu->arch.arch_vmx.vrr[r23]'s value + ld8 r28=[r27] // get vcpu->arch.arch_vmx.vrr[r23]'s value + mov b0=r24 ;; - extr.u r25=r25,2,6 // get rr.ps + // Fall-back to C if VF (long format) is set + tbit.nz p6,p0=r17,8 + (p6) mov r24 =EVENT_THASH + (p6) br.cond.dpnt.many vmx_virtualization_fault_back + extr.u r28=r28,2,6 // get rr.ps shl r22=r26,r29 // 1UL << pta.size ;; - shr.u r23=r19,r25 // vaddr >> rr.ps + shr.u r23=r19,r28 // vaddr >> rr.ps adds r26=3,r29 // pta.size + 3 shl r27=r17,3 // pta << 3 ;; @@ -473,7 +589,7 @@ vmx_asm_thash_back1: ;; or r19=r19,r22 // calc pval shladd r17=r18,4,r26 - adds r30=vmx_resume_to_guest-asm_mov_from_reg,r20 + adds r30=vmx_resume_to_guest2-asm_mov_from_reg,r20 ;; mov b0=r17 br.many b0 @@ -594,6 +710,8 @@ MOV_FROM_BANK0_REG(31) // mov from reg table +// r19: value, r30: return address +// r26 may be destroyed ENTRY(asm_mov_from_reg) MOV_FROM_REG(0) MOV_FROM_REG(1) @@ -732,6 +850,18 @@ END(asm_mov_from_reg) * r24: b0 */ ENTRY(vmx_resume_to_guest) + adds r19=IA64_VPD_BASE_OFFSET,r21 + mov r16 = r31 + mov r17 = r24 + ;; + ld8 r25 =[r19] + movl r24 = 1f + br.sptk.many vmx_vps_sync_write +1: + mov r31 = r16 + mov r24 =r17 + ;; +vmx_resume_to_guest2: mov r16=cr.ipsr movl r20=__vsa_base ;; diff -Naurp xen/arch/ia64/vmx/vacpi.c xen-redhat/arch/ia64/vmx/vacpi.c --- xen/arch/ia64/vmx/vacpi.c +++ xen-redhat/arch/ia64/vmx/vacpi.c @@ -0,0 +1,179 @@ +/* + * vacpi.c: emulation of the ACPI + * based on x86 hvm/pmtimer.c + * + * Copyright (c) 2007, FUJITSU LIMITED + * Kouya Shimura <kouya at jp fujitsu com> + * + * Copyright (c) 2007, XenSource inc. + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + */ + +#include <asm/vmx_vcpu.h> +#include <asm/vmx.h> +#include <asm/hvm/vacpi.h> + +/* The interesting bits of the PM1a_STS register */ +#define TMR_STS (1 << 0) +#define PWRBTN_STS (1 << 5) +#define GBL_STS (1 << 8) + +/* The same in PM1a_EN */ +#define TMR_EN (1 << 0) +#define PWRBTN_EN (1 << 5) +#define GBL_EN (1 << 8) + +/* Mask of bits in PM1a_STS that can generate an SCI. Although the ACPI + * spec lists other bits, the PIIX4, which we are emulating, only + * supports these three. 
For now, we only use TMR_STS; in future we + * will let qemu set the other bits */ +#define SCI_MASK (TMR_STS|PWRBTN_STS|GBL_STS) + +/* SCI IRQ number (must match SCI_INT number in ACPI FADT in hvmloader) */ +#define SCI_IRQ 9 + +/* We provide a 32-bit counter (must match the TMR_VAL_EXT bit in the FADT) */ +#define TMR_VAL_MASK (0xffffffff) +#define TMR_VAL_MSB (0x80000000) + +/* Dispatch SCIs based on the PM1a_STS and PM1a_EN registers */ +static void pmt_update_sci(struct domain *d, struct vacpi *s) +{ + if (s->regs.pm1a_en & s->regs.pm1a_sts & SCI_MASK) + viosapic_set_irq(d, SCI_IRQ, 1); /* Assert */ + else + viosapic_set_irq(d, SCI_IRQ, 0); +} + +/* Set the correct value in the timer, accounting for time elapsed + * since the last time we did that. */ +static void pmt_update_time(struct domain *d) +{ + struct vacpi *s = &d->arch.hvm_domain.vacpi; + s_time_t curr_gtime; + unsigned long delta; + uint32_t msb = s->regs.tmr_val & TMR_VAL_MSB; + + /* Update the timer */ + curr_gtime = NOW(); + delta = curr_gtime - s->last_gtime; + delta = ((delta >> 8) * ((FREQUENCE_PMTIMER << 32) / SECONDS(1))) >> 24; + s->regs.tmr_val += delta; + s->regs.tmr_val &= TMR_VAL_MASK; + s->last_gtime = curr_gtime; + + /* If the counter's MSB has changed, set the status bit */ + if ((s->regs.tmr_val & TMR_VAL_MSB) != msb) { + s->regs.pm1a_sts |= TMR_STS; + pmt_update_sci(d, s); + } +} + +/* This function should be called soon after each time the MSB of the + * pmtimer register rolls over, to make sure we update the status + * registers and SCI at least once per rollover */ +static void pmt_timer_callback(void *opaque) +{ + struct domain *d = opaque; + struct vacpi *s = &d->arch.hvm_domain.vacpi; + uint64_t cycles, time_flip; + + /* Recalculate the timer and make sure we get an SCI if we need one */ + pmt_update_time(d); + + /* How close are we to the next MSB flip? 
*/ + cycles = TMR_VAL_MSB - (s->regs.tmr_val & (TMR_VAL_MSB - 1)); + + /* Overall time between MSB flips */ + time_flip = (((SECONDS(1) << 23) / FREQUENCE_PMTIMER) * cycles) >> 23; + + /* Wake up again near the next bit-flip */ + set_timer(&s->timer, NOW() + time_flip + MILLISECS(1)); +} + +int vacpi_intercept(ioreq_t * iop, u64 * val) +{ + struct domain *d = current->domain; + struct vacpi *s = &d->arch.hvm_domain.vacpi; + uint64_t addr_off = iop->addr - ACPI_PM1A_EVT_BLK_ADDRESS; + + if (addr_off < 4) { /* Access to PM1a_STS and PM1a_EN registers */ + void *p = (void *)&s->regs.evt_blk + addr_off; + + if (iop->dir == 1) { /* Read */ + if (iop->size == 1) + *val = *(uint8_t *) p; + else if (iop->size == 2) + *val = *(uint16_t *) p; + else if (iop->size == 4) + *val = *(uint32_t *) p; + else + panic_domain(NULL, "wrong ACPI " + "PM1A_EVT_BLK access\n"); + } else { /* Write */ + uint8_t *sp = (uint8_t *) & iop->data; + int i; + + for (i = 0; i < iop->size; i++, addr_off++, p++, sp++) { + if (addr_off < 2) /* PM1a_STS */ + /* write-to-clear */ + *(uint8_t *) p &= ~*sp; + else /* PM1a_EN */ + *(uint8_t *) p = *sp; + } + /* Fix the SCI state to match the new register state */ + pmt_update_sci(d, s); + } + + iop->state = STATE_IORESP_READY; + vmx_io_assist(current); + return 1; + } + + if (iop->addr == ACPI_PM_TMR_BLK_ADDRESS) { + if (iop->size != 4) + panic_domain(NULL, "wrong ACPI PM timer access\n"); + if (iop->dir == 1) { /* Read */ + pmt_update_time(d); + *val = s->regs.tmr_val; + } + /* PM_TMR_BLK is read-only */ + iop->state = STATE_IORESP_READY; + vmx_io_assist(current); + return 1; + } + + return 0; +} + +void vacpi_init(struct domain *d) +{ + struct vacpi *s = &d->arch.hvm_domain.vacpi; + + s->regs.tmr_val = 0; + s->regs.evt_blk = 0; + s->last_gtime = NOW(); + + /* Set up callback to fire SCIs when the MSB of TMR_VAL changes */ + init_timer(&s->timer, pmt_timer_callback, d, first_cpu(cpu_online_map)); + pmt_timer_callback(d); +} + +void vacpi_relinquish_resources(struct domain *d) +{ + struct vacpi *s = &d->arch.hvm_domain.vacpi; + kill_timer(&s->timer); +} diff -Naurp xen/arch/ia64/vmx/viosapic.c xen-redhat/arch/ia64/vmx/viosapic.c --- xen/arch/ia64/vmx/viosapic.c +++ xen-redhat/arch/ia64/vmx/viosapic.c @@ -23,6 +23,10 @@ * * Yunhong Jiang <yunhong.jiang@intel.com> * Ported to xen by using virtual IRQ line. + * + * Copyright (C) 2007 VA Linux Systems Japan K.K. 
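A brief gloss on the fixed-point step in pmt_update_time() in the vacpi.c hunk above: the guest-visible counter must advance at the ACPI PM-timer rate (FREQUENCE_PMTIMER, nominally 3.579545 MHz) while Xen time is in nanoseconds, and the shift/multiply/shift form keeps the intermediate product within 64 bits. A hedged, self-contained sketch of the same conversion (constants and macros as used in the hunk; the helper itself is illustrative):

    /* ticks = ns * freq / 1e9, computed as ((ns >> 8) * ((freq << 32) / 1e9)) >> 24,
     * which avoids a 128-bit multiply for any realistic delta. */
    static inline unsigned long ns_to_pmt_ticks(unsigned long ns)
    {
        return ((ns >> 8) * ((FREQUENCE_PMTIMER << 32) / SECONDS(1))) >> 24;
    }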
+ * Isaku Yamahata <yamahata at valinux co jp> + * SMP support */ #include <xen/config.h> @@ -44,6 +48,7 @@ static void viosapic_deliver(struct vios uint8_t vector = viosapic->redirtbl[irq].vector; struct vcpu *v; + ASSERT(spin_is_locked(&viosapic->lock)); switch ( delivery_mode ) { case SAPIC_FIXED: @@ -90,6 +95,7 @@ static int get_redir_num(struct viosapic { int i; + ASSERT(spin_is_locked(&viosapic->lock)); for ( i = 0; i < VIOSAPIC_NUM_PINS; i++ ) if ( viosapic->redirtbl[i].vector == vector ) return i; @@ -118,19 +124,24 @@ static void viosapic_update_EOI(struct v { int redir_num; + spin_lock(&viosapic->lock); if ( (redir_num = get_redir_num(viosapic, vector)) == -1 ) { + spin_unlock(&viosapic->lock); gdprintk(XENLOG_WARNING, "Can't find redir item for %d EOI\n", vector); return; } if ( !test_and_clear_bit(redir_num, &viosapic->isr) ) { - gdprintk(XENLOG_WARNING, "redir %d not set for %d EOI\n", - redir_num, vector); + spin_unlock(&viosapic->lock); + if ( viosapic->redirtbl[redir_num].trig_mode == SAPIC_LEVEL ) + gdprintk(XENLOG_WARNING, "redir %d not set for %d EOI\n", + redir_num, vector); return; } service_iosapic(viosapic); + spin_unlock(&viosapic->lock); } @@ -149,18 +160,21 @@ static unsigned long viosapic_read_indir default: { - uint32_t redir_index = (viosapic->ioregsel - 0x10) >> 1; + /* ioregsel might be written at the same time. copy it before use. */ + uint32_t ioregsel = viosapic->ioregsel; + uint32_t redir_index; uint64_t redir_content; + redir_index = (ioregsel - 0x10) >> 1; if ( redir_index >= VIOSAPIC_NUM_PINS ) { gdprintk(XENLOG_WARNING, "viosapic_read_indirect:undefined " - "ioregsel %x\n", viosapic->ioregsel); + "ioregsel %x\n", ioregsel); break; } redir_content = viosapic->redirtbl[redir_index].bits; - result = (viosapic->ioregsel & 0x1) ? + result = (ioregsel & 0x1) ? (redir_content >> 32) & 0xffffffff : redir_content & 0xffffffff; break; @@ -212,9 +226,12 @@ static void viosapic_write_indirect(stru default: { - uint32_t redir_index = (viosapic->ioregsel - 0x10) >> 1; + /* ioregsel might be written at the same time. copy it before use. 
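The viosapic read hunk above and the write hunk that continues below both snapshot viosapic->ioregsel into a local before validating it: with SMP guest support another vcpu may rewrite ioregsel between the bounds check and the redirection-table access. The pattern in a nutshell, condensed from viosapic_read_indirect() (illustrative, not a drop-in replacement):

    /* Condensed from viosapic_read_indirect() above; illustrative only. */
    static uint32_t viosapic_read_redir(struct viosapic *viosapic)
    {
        uint32_t ioregsel = viosapic->ioregsel;     /* copy once, race-free */
        uint32_t redir_index = (ioregsel - 0x10) >> 1;
        uint64_t redir_content;

        if (redir_index >= VIOSAPIC_NUM_PINS)
            return 0;                               /* stale or bogus selector */
        redir_content = viosapic->redirtbl[redir_index].bits;
        return (ioregsel & 0x1) ? (redir_content >> 32) & 0xffffffff
                                : redir_content & 0xffffffff;
    }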
*/ + uint32_t ioregsel = viosapic->ioregsel; + uint32_t redir_index; uint64_t redir_content; + redir_index = (ioregsel - 0x10) >> 1; if ( redir_index >= VIOSAPIC_NUM_PINS ) { gdprintk(XENLOG_WARNING, "viosapic_write_indirect " @@ -222,9 +239,10 @@ static void viosapic_write_indirect(stru break; } + spin_lock(&viosapic->lock); redir_content = viosapic->redirtbl[redir_index].bits; - if ( viosapic->ioregsel & 0x1 ) + if ( ioregsel & 0x1 ) { redir_content = (((uint64_t)val & 0xffffffff) << 32) | (redir_content & 0xffffffff); @@ -235,6 +253,7 @@ static void viosapic_write_indirect(stru (val & 0xffffffff); } viosapic->redirtbl[redir_index].bits = redir_content; + spin_unlock(&viosapic->lock); break; } } /* switch */ diff -Naurp xen/arch/ia64/vmx/vlsapic.c xen-redhat/arch/ia64/vmx/vlsapic.c --- xen/arch/ia64/vmx/vlsapic.c +++ xen-redhat/arch/ia64/vmx/vlsapic.c @@ -38,6 +38,7 @@ #include <asm/vmx_platform.h> #include <asm/viosapic.h> #include <asm/vlsapic.h> +#include <asm/vmx_phy_mode.h> #include <asm/linux/jiffies.h> #include <xen/domain.h> @@ -517,8 +518,11 @@ void guest_write_eoi(VCPU *vcpu) int vec; vec = highest_inservice_irq(vcpu); - if ( vec == NULL_VECTOR ) - panic_domain(vcpu_regs(vcpu), "Wrong vector to EOI\n"); + if (vec == NULL_VECTOR) { + gdprintk(XENLOG_WARNING, "vcpu(%d): Wrong vector to EOI\n", + vcpu->vcpu_id); + return; + } VLSAPIC_INSVC(vcpu,vec>>6) &= ~(1UL <<(vec&63)); VCPU(vcpu, eoi)=0; // overwrite the data vcpu->arch.irq_new_pending=1; @@ -607,9 +611,8 @@ struct vcpu * vlsapic_lid_to_vcpu(struct * To inject INIT to guest, we must set the PAL_INIT entry * and set psr to switch to physical mode */ -#define PAL_INIT_ENTRY 0x80000000ffffffa0 #define PSR_SET_BITS (IA64_PSR_DT | IA64_PSR_IT | IA64_PSR_RT | \ - IA64_PSR_IC | IA64_PSR_RI) + IA64_PSR_IC | IA64_PSR_RI | IA64_PSR_I | IA64_PSR_CPL) static void vmx_inject_guest_pal_init(VCPU *vcpu) { @@ -771,6 +774,8 @@ static void vlsapic_write_xtp(struct vcp struct viosapic * viosapic; struct vcpu *lvcpu, *vcpu; viosapic = vcpu_viosapic(v); + + spin_lock(&viosapic->lock); lvcpu = viosapic->lowest_vcpu; VLSAPIC_XTP(v) = val; @@ -783,6 +788,7 @@ static void vlsapic_write_xtp(struct vcp lvcpu = NULL; viosapic->lowest_vcpu = lvcpu; + spin_unlock(&viosapic->lock); } void vlsapic_write(struct vcpu *v, diff -Naurp xen/arch/ia64/vmx/vmmu.c xen-redhat/arch/ia64/vmx/vmmu.c --- xen/arch/ia64/vmx/vmmu.c +++ xen-redhat/arch/ia64/vmx/vmmu.c @@ -19,23 +19,48 @@ * Xuefei Xu (Anthony Xu) (Anthony.xu@intel.com) * Yaozu Dong (Eddie Dong) (Eddie.dong@intel.com) */ -#include <linux/sched.h> -#include <linux/mm.h> -#include <asm/tlb.h> -#include <asm/gcc_intrin.h> -#include <asm/vcpu.h> -#include <linux/interrupt.h> #include <asm/vmx_vcpu.h> -#include <asm/vmx_mm_def.h> -#include <asm/vmx.h> -#include <asm/hw_irq.h> #include <asm/vmx_pal_vsa.h> -#include <asm/kregs.h> -#include <asm/vcpu.h> -#include <xen/irq.h> -#include <xen/errno.h> #include <xen/sched-if.h> +static int default_vtlb_sz = DEFAULT_VTLB_SZ; +static int default_vhpt_sz = DEFAULT_VHPT_SZ; + +static void __init parse_vtlb_size(char *s) +{ + int sz = parse_size_and_unit(s, NULL); + + if (sz > 0) { + default_vtlb_sz = fls(sz - 1); + /* minimum 16KB (for tag uniqueness) */ + if (default_vtlb_sz < 14) + default_vtlb_sz = 14; + } +} + +static int canonicalize_vhpt_size(int sz) +{ + /* minimum 32KB */ + if (sz < 15) + return 15; + /* maximum 8MB (since purging TR is hard coded) */ + if (sz > IA64_GRANULE_SHIFT - 1) + return IA64_GRANULE_SHIFT - 1; + return sz; +} + +static void __init 
parse_vhpt_size(char *s) +{ + int sz = parse_size_and_unit(s, NULL); + if (sz > 0) { + default_vhpt_sz = fls(sz - 1); + default_vhpt_sz = canonicalize_vhpt_size(default_vhpt_sz); + } +} + +custom_param("vti_vtlb_size", parse_vtlb_size); +custom_param("vti_vhpt_size", parse_vhpt_size); + /* * Get the machine page frame number in 16KB unit * Input: @@ -89,6 +114,7 @@ void recycle_message(thash_cb_t *hcb, u6 } */ +#if 0 /* * Purge all guest TCs in logical processor. * Instead of purging all LP TCs, we should only purge @@ -129,69 +155,37 @@ purge_machine_tc_by_domid(domid_t domid) // purge all TCs belong to this guest. #endif } +#endif static int init_domain_vhpt(struct vcpu *v) { - struct page_info *page; - void * vbase; - page = alloc_domheap_pages (NULL, VCPU_VHPT_ORDER, 0); - if ( page == NULL ) { - printk("No enough contiguous memory for init_domain_vhpt\n"); - return -ENOMEM; - } - vbase = page_to_virt(page); - memset(vbase, 0, VCPU_VHPT_SIZE); - printk(XENLOG_DEBUG "Allocate domain vhpt at 0x%p\n", vbase); - - VHPT(v,hash) = vbase; - VHPT(v,hash_sz) = VCPU_VHPT_SIZE/2; - VHPT(v,cch_buf) = (void *)((u64)vbase + VHPT(v,hash_sz)); - VHPT(v,cch_sz) = VCPU_VHPT_SIZE - VHPT(v,hash_sz); - thash_init(&(v->arch.vhpt),VCPU_VHPT_SHIFT-1); - v->arch.arch_vmx.mpta = v->arch.vhpt.pta.val; + int rc; - return 0; + rc = thash_alloc(&(v->arch.vhpt), default_vhpt_sz, "vhpt"); + v->arch.arch_vmx.mpta = v->arch.vhpt.pta.val; + return rc; } static void free_domain_vhpt(struct vcpu *v) { - struct page_info *page; - - if (v->arch.vhpt.hash) { - page = virt_to_page(v->arch.vhpt.hash); - free_domheap_pages(page, VCPU_VHPT_ORDER); - v->arch.vhpt.hash = 0; - } - - return; + if (v->arch.vhpt.hash) + thash_free(&(v->arch.vhpt)); } int init_domain_tlb(struct vcpu *v) { - struct page_info *page; - void * vbase; int rc; rc = init_domain_vhpt(v); if (rc) return rc; - page = alloc_domheap_pages (NULL, VCPU_VTLB_ORDER, 0); - if ( page == NULL ) { - printk("No enough contiguous memory for init_domain_tlb\n"); + rc = thash_alloc(&(v->arch.vtlb), default_vtlb_sz, "vtlb"); + if (rc) { free_domain_vhpt(v); - return -ENOMEM; + return rc; } - vbase = page_to_virt(page); - memset(vbase, 0, VCPU_VTLB_SIZE); - printk(XENLOG_DEBUG "Allocate domain vtlb at 0x%p\n", vbase); - - VTLB(v,hash) = vbase; - VTLB(v,hash_sz) = VCPU_VTLB_SIZE/2; - VTLB(v,cch_buf) = (void *)((u64)vbase + VTLB(v,hash_sz)); - VTLB(v,cch_sz) = VCPU_VTLB_SIZE - VTLB(v,hash_sz); - thash_init(&(v->arch.vtlb),VCPU_VTLB_SHIFT-1); return 0; } @@ -199,12 +193,8 @@ int init_domain_tlb(struct vcpu *v) void free_domain_tlb(struct vcpu *v) { - struct page_info *page; - - if ( v->arch.vtlb.hash) { - page = virt_to_page(v->arch.vtlb.hash); - free_domheap_pages(page, VCPU_VTLB_ORDER); - } + if (v->arch.vtlb.hash) + thash_free(&(v->arch.vtlb)); free_domain_vhpt(v); } @@ -252,41 +242,9 @@ void machine_tlb_insert(struct vcpu *v, */ void machine_tlb_purge(u64 va, u64 ps) { -// u64 psr; -// psr = ia64_clear_ic(); ia64_ptcl(va, ps << 2); -// ia64_set_psr(psr); -// ia64_srlz_i(); -// return; -} -/* -u64 machine_thash(u64 va) -{ - return ia64_thash(va); -} - -u64 machine_ttag(u64 va) -{ - return ia64_ttag(va); -} -*/ -thash_data_t * vsa_thash(PTA vpta, u64 va, u64 vrr, u64 *tag) -{ - u64 index,pfn,rid,pfn_bits; - pfn_bits = vpta.size-5-8; - pfn = REGION_OFFSET(va)>>_REGION_PAGE_SIZE(vrr); - rid = _REGION_ID(vrr); - index = ((rid&0xff)<<pfn_bits)|(pfn&((1UL<<pfn_bits)-1)); - *tag = ((rid>>8)&0xffff) | ((pfn >>pfn_bits)<<16); - return (thash_data_t 
*)((vpta.base<<PTA_BASE_SHIFT)+(index<<5)); -// return ia64_call_vsa(PAL_VPS_THASH,va,vrr,vpta,0,0,0,0); } -//u64 vsa_ttag(u64 va, u64 vrr) -//{ -// return ia64_call_vsa(PAL_VPS_TTAG,va,vrr,0,0,0,0,0); -//} - int vhpt_enabled(VCPU *vcpu, uint64_t vadr, vhpt_ref_t ref) { ia64_rr vrr; @@ -544,8 +502,7 @@ IA64FAULT vmx_vcpu_ptc_e(VCPU *vcpu, u64 IA64FAULT vmx_vcpu_ptc_g(VCPU *vcpu, u64 va, u64 ps) { - vmx_vcpu_ptc_ga(vcpu, va, ps); - return IA64_ILLOP_FAULT; + return vmx_vcpu_ptc_ga(vcpu, va, ps); } /* IA64FAULT vmx_vcpu_ptc_ga(VCPU *vcpu, u64 va, u64 ps) diff -Naurp xen/arch/ia64/vmx/vmx_entry.S xen-redhat/arch/ia64/vmx/vmx_entry.S --- xen/arch/ia64/vmx/vmx_entry.S +++ xen-redhat/arch/ia64/vmx/vmx_entry.S @@ -20,21 +20,9 @@ * Kun Tian (Kevin Tian) (kevin.tian@intel.com) */ -#ifndef VCPU_TLB_SHIFT -#define VCPU_TLB_SHIFT 22 -#endif #include <linux/config.h> #include <asm/asmmacro.h> -#include <asm/cache.h> -#include <asm/kregs.h> #include <asm/offsets.h> -#include <asm/pgtable.h> -#include <asm/percpu.h> -#include <asm/processor.h> -#include <asm/thread_info.h> -#include <asm/unistd.h> -#include <asm/vhpt.h> -#include <asm/vmmu.h> #include "vmx_minstate.h" GLOBAL_ENTRY(ia64_leave_nested) @@ -373,20 +361,16 @@ vmx_rse_clear_invalid: adds r19=VPD(VPSR),r18 ;; ld8 r19=[r19] //vpsr - movl r20=__vsa_base ;; //vsa_sync_write_start - ld8 r20=[r20] // read entry point - mov r25=r18 - ;; movl r24=ia64_vmm_entry // calculate return address - add r16=PAL_VPS_SYNC_WRITE,r20 - ;; - mov b0=r16 - br.cond.sptk b0 // call the service + mov r25=r18 + br.sptk.many vmx_vps_sync_write // call the service ;; END(ia64_leave_hypervisor) // fall through + + GLOBAL_ENTRY(ia64_vmm_entry) /* * must be at bank 0 @@ -394,32 +378,18 @@ GLOBAL_ENTRY(ia64_vmm_entry) * r17:cr.isr * r18:vpd * r19:vpsr - * r20:__vsa_base * r22:b0 * r23:predicate */ mov r24=r22 mov r25=r18 tbit.nz p1,p2 = r19,IA64_PSR_IC_BIT // p1=vpsr.ic + (p1) br.cond.sptk.few vmx_vps_resume_normal + (p2) br.cond.sptk.many vmx_vps_resume_handler ;; - (p1) add r29=PAL_VPS_RESUME_NORMAL,r20 - (p1) br.sptk.many ia64_vmm_entry_out - ;; - tbit.nz p1,p2 = r17,IA64_ISR_IR_BIT //p1=cr.isr.ir - ;; - (p1) add r29=PAL_VPS_RESUME_NORMAL,r20 - (p2) add r29=PAL_VPS_RESUME_HANDLER,r20 - (p2) ld8 r26=[r25] - ;; -ia64_vmm_entry_out: - mov pr=r23,-2 - mov b0=r29 - ;; - br.cond.sptk b0 // call pal service END(ia64_vmm_entry) - /* * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't * need to switch to bank 0 and doesn't restore the scratch registers. @@ -719,7 +689,7 @@ GLOBAL_ENTRY(vmx_switch_rr7) movl r25=PAGE_KERNEL ;; or loc5 = r25,loc5 // construct PA | page properties - mov r23 = VCPU_VHPT_SHIFT <<2 + mov r23 = IA64_GRANULE_SHIFT <<2 ;; ptr.d in3,r23 ;; diff -Naurp xen/arch/ia64/vmx/vmx_init.c xen-redhat/arch/ia64/vmx/vmx_init.c --- xen/arch/ia64/vmx/vmx_init.c +++ xen-redhat/arch/ia64/vmx/vmx_init.c @@ -51,6 +51,8 @@ #include <asm/viosapic.h> #include <xen/event.h> #include <asm/vlsapic.h> +#include <asm/vmx_pal_vsa.h> +#include <asm/patch.h> #include "entry.h" /* Global flag to identify whether Intel vmx feature is on */ @@ -62,6 +64,28 @@ static u64 vm_buffer = 0; /* Buffer requ u64 __vsa_base = 0; /* Run-time service base of VMX */ /* Check whether vt feature is enabled or not. 
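One usage note for the vmmu.c hunk earlier in this patch: parse_vtlb_size() and parse_vhpt_size() turn the new vti_vtlb_size= and vti_vhpt_size= Xen boot options into a power-of-two order via fls(). A small worked sketch of that rounding, mirroring the vTLB case (the helper name is made up for illustration):

    /* "vti_vtlb_size=256K": parse_size_and_unit() returns 0x40000,
     * fls(0x40000 - 1) == 18, i.e. a 2^18 = 256KB per-vcpu vTLB;
     * anything below 16KB is raised to order 14 for tag uniqueness. */
    static int size_arg_to_order(const char *arg)
    {
        int sz = parse_size_and_unit(arg, NULL);
        int order = fls(sz - 1);
        return order < 14 ? 14 : order;
    }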
*/ + +void vmx_vps_patch(void) +{ + u64 addr; + + addr = (u64)&vmx_vps_sync_read; + ia64_patch_imm64(addr, __vsa_base+PAL_VPS_SYNC_READ); + ia64_fc((void *)addr); + addr = (u64)&vmx_vps_sync_write; + ia64_patch_imm64(addr, __vsa_base+PAL_VPS_SYNC_WRITE); + ia64_fc((void *)addr); + addr = (u64)&vmx_vps_resume_normal; + ia64_patch_imm64(addr, __vsa_base+PAL_VPS_RESUME_NORMAL); + ia64_fc((void *)addr); + addr = (u64)&vmx_vps_resume_handler; + ia64_patch_imm64(addr, __vsa_base+PAL_VPS_RESUME_HANDLER); + ia64_fc((void *)addr); + ia64_sync_i(); + ia64_srlz_i(); +} + + void identify_vmx_feature(void) { @@ -130,8 +154,10 @@ vmx_init_env(void) return ; } - if (!__vsa_base) + if (!__vsa_base){ __vsa_base = tmp_base; + vmx_vps_patch(); + } else ASSERT(tmp_base != __vsa_base); @@ -220,14 +246,7 @@ vmx_create_vp(struct vcpu *v) void vmx_save_state(struct vcpu *v) { - u64 status; - - /* FIXME: about setting of pal_proc_vector... time consuming */ - status = ia64_pal_vp_save((u64 *)v->arch.privregs, 0); - if (status != PAL_STATUS_SUCCESS){ - panic_domain(vcpu_regs(v),"Save vp status failed\n"); - } - + ia64_call_vsa(PAL_VPS_SAVE, (u64)v->arch.privregs, 1, 0, 0, 0, 0, 0); /* Need to save KR when domain switch, though HV itself doesn;t * use them. @@ -246,12 +265,7 @@ vmx_save_state(struct vcpu *v) void vmx_load_state(struct vcpu *v) { - u64 status; - - status = ia64_pal_vp_restore((u64 *)v->arch.privregs, 0); - if (status != PAL_STATUS_SUCCESS){ - panic_domain(vcpu_regs(v),"Restore vp status failed\n"); - } + ia64_call_vsa(PAL_VPS_RESTORE, (u64)v->arch.privregs, 1, 0, 0, 0, 0, 0); ia64_set_kr(0, v->arch.arch_vmx.vkr[0]); ia64_set_kr(1, v->arch.arch_vmx.vkr[1]); @@ -350,6 +364,8 @@ vmx_relinquish_guest_resources(struct do for_each_vcpu(d, v) vmx_release_assist_channel(v); + + vacpi_relinquish_resources(d); } void @@ -418,6 +434,8 @@ void vmx_setup_platform(struct domain *d /* Initialize iosapic model within hypervisor */ viosapic_init(d); + + vacpi_init(d); } void vmx_do_launch(struct vcpu *v) diff -Naurp xen/arch/ia64/vmx/vmx_init.c.orig xen-redhat/arch/ia64/vmx/vmx_init.c.orig --- xen/arch/ia64/vmx/vmx_init.c.orig +++ xen-redhat/arch/ia64/vmx/vmx_init.c.orig @@ -0,0 +1,426 @@ +/* -*- Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */ +/* + * vmx_init.c: initialization work for vt specific domain + * Copyright (c) 2005, Intel Corporation. + * Kun Tian (Kevin Tian) <kevin.tian@intel.com> + * Xuefei Xu (Anthony Xu) <anthony.xu@intel.com> + * Fred Yang <fred.yang@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * + */ + +/* + * 05/08/16 Kun tian (Kevin Tian) <kevin.tian@intel.com>: + * Disable doubling mapping + * + * 05/03/23 Kun Tian (Kevin Tian) <kevin.tian@intel.com>: + * Simplied design in first step: + * - One virtual environment + * - Domain is bound to one LP + * Later to support guest SMP: + * - Need interface to handle VP scheduled to different LP + */ +#include <xen/config.h> +#include <xen/types.h> +#include <xen/sched.h> +#include <asm/pal.h> +#include <asm/page.h> +#include <asm/processor.h> +#include <asm/vmx_vcpu.h> +#include <xen/lib.h> +#include <asm/vmmu.h> +#include <public/xen.h> +#include <public/hvm/ioreq.h> +#include <public/event_channel.h> +#include <asm/vmx_phy_mode.h> +#include <asm/processor.h> +#include <asm/vmx.h> +#include <xen/mm.h> +#include <asm/viosapic.h> +#include <xen/event.h> +#include <asm/vlsapic.h> +#include "entry.h" + +/* Global flag to identify whether Intel vmx feature is on */ +u32 vmx_enabled = 0; +static u32 vm_order; +static u64 buffer_size; +static u64 vp_env_info; +static u64 vm_buffer = 0; /* Buffer required to bring up VMX feature */ +u64 __vsa_base = 0; /* Run-time service base of VMX */ + +/* Check whether vt feature is enabled or not. */ +void +identify_vmx_feature(void) +{ + pal_status_t ret; + u64 avail = 1, status = 1, control = 1; + + vmx_enabled = 0; + /* Check VT-i feature */ + ret = ia64_pal_proc_get_features(&avail, &status, &control); + if (ret != PAL_STATUS_SUCCESS) { + printk("Get proc features failed.\n"); + goto no_vti; + } + + /* FIXME: do we need to check status field, to see whether + * PSR.vm is actually enabled? If yes, aonther call to + * ia64_pal_proc_set_features may be reuqired then. + */ + printk("avail:0x%lx, status:0x%lx,control:0x%lx, vm?0x%lx\n", + avail, status, control, avail & PAL_PROC_VM_BIT); + if (!(avail & PAL_PROC_VM_BIT)) { + printk("No VT feature supported.\n"); + goto no_vti; + } + + ret = ia64_pal_vp_env_info(&buffer_size, &vp_env_info); + if (ret != PAL_STATUS_SUCCESS) { + printk("Get vp environment info failed.\n"); + goto no_vti; + } + + /* Does xen has ability to decode itself? */ + if (!(vp_env_info & VP_OPCODE)) + printk("WARNING: no opcode provided from hardware(%lx)!!!\n", vp_env_info); + vm_order = get_order(buffer_size); + printk("vm buffer size: %ld, order: %d\n", buffer_size, vm_order); + + vmx_enabled = 1; +no_vti: + return; +} + +/* + * Init virtual environment on current LP + * vsa_base is the indicator whether it's first LP to be initialized + * for current domain. + */ +void +vmx_init_env(void) +{ + u64 status, tmp_base; + + if (!vm_buffer) { + vm_buffer = (unsigned long)alloc_xenheap_pages(vm_order); + ASSERT(vm_buffer); + printk("vm_buffer: 0x%lx\n", vm_buffer); + } + + status=ia64_pal_vp_init_env(__vsa_base ? 
VP_INIT_ENV : VP_INIT_ENV_INITALIZE, + __pa(vm_buffer), + vm_buffer, + &tmp_base); + + if (status != PAL_STATUS_SUCCESS) { + printk("ia64_pal_vp_init_env failed.\n"); + return ; + } + + if (!__vsa_base) + __vsa_base = tmp_base; + else + ASSERT(tmp_base != __vsa_base); + +} + +typedef union { + u64 value; + struct { + u64 number : 8; + u64 revision : 8; + u64 model : 8; + u64 family : 8; + u64 archrev : 8; + u64 rv : 24; + }; +} cpuid3_t; + +/* Allocate vpd from xenheap */ +static vpd_t *alloc_vpd(void) +{ + int i; + cpuid3_t cpuid3; + vpd_t *vpd; + mapped_regs_t *mregs; + + vpd = alloc_xenheap_pages(get_order(VPD_SIZE)); + if (!vpd) { + printk("VPD allocation failed.\n"); + return NULL; + } + + printk(XENLOG_DEBUG "vpd base: 0x%p, vpd size:%ld\n", + vpd, sizeof(vpd_t)); + memset(vpd, 0, VPD_SIZE); + mregs = &vpd->vpd_low; + + /* CPUID init */ + for (i = 0; i < 5; i++) + mregs->vcpuid[i] = ia64_get_cpuid(i); + + /* Limit the CPUID number to 5 */ + cpuid3.value = mregs->vcpuid[3]; + cpuid3.number = 4; /* 5 - 1 */ + mregs->vcpuid[3] = cpuid3.value; + + mregs->vac.a_from_int_cr = 1; + mregs->vac.a_to_int_cr = 1; + mregs->vac.a_from_psr = 1; + mregs->vac.a_from_cpuid = 1; + mregs->vac.a_cover = 1; + mregs->vac.a_bsw = 1; + mregs->vac.a_int = 1; + mregs->vdc.d_vmsw = 1; + + return vpd; +} + +/* Free vpd to xenheap */ +static void +free_vpd(struct vcpu *v) +{ + if ( v->arch.privregs ) + free_xenheap_pages(v->arch.privregs, get_order(VPD_SIZE)); +} + +/* + * Create a VP on intialized VMX environment. + */ +static void +vmx_create_vp(struct vcpu *v) +{ + u64 ret; + vpd_t *vpd = (vpd_t *)v->arch.privregs; + u64 ivt_base; + extern char vmx_ia64_ivt; + /* ia64_ivt is function pointer, so need this tranlation */ + ivt_base = (u64) &vmx_ia64_ivt; + printk(XENLOG_DEBUG "ivt_base: 0x%lx\n", ivt_base); + ret = ia64_pal_vp_create((u64 *)vpd, (u64 *)ivt_base, 0); + if (ret != PAL_STATUS_SUCCESS){ + panic_domain(vcpu_regs(v),"ia64_pal_vp_create failed. \n"); + } +} + +/* Other non-context related tasks can be done in context switch */ +void +vmx_save_state(struct vcpu *v) +{ + u64 status; + + /* FIXME: about setting of pal_proc_vector... time consuming */ + status = ia64_pal_vp_save((u64 *)v->arch.privregs, 0); + if (status != PAL_STATUS_SUCCESS){ + panic_domain(vcpu_regs(v),"Save vp status failed\n"); + } + + + /* Need to save KR when domain switch, though HV itself doesn;t + * use them. 
+ */ + v->arch.arch_vmx.vkr[0] = ia64_get_kr(0); + v->arch.arch_vmx.vkr[1] = ia64_get_kr(1); + v->arch.arch_vmx.vkr[2] = ia64_get_kr(2); + v->arch.arch_vmx.vkr[3] = ia64_get_kr(3); + v->arch.arch_vmx.vkr[4] = ia64_get_kr(4); + v->arch.arch_vmx.vkr[5] = ia64_get_kr(5); + v->arch.arch_vmx.vkr[6] = ia64_get_kr(6); + v->arch.arch_vmx.vkr[7] = ia64_get_kr(7); +} + +/* Even guest is in physical mode, we still need such double mapping */ +void +vmx_load_state(struct vcpu *v) +{ + u64 status; + + status = ia64_pal_vp_restore((u64 *)v->arch.privregs, 0); + if (status != PAL_STATUS_SUCCESS){ + panic_domain(vcpu_regs(v),"Restore vp status failed\n"); + } + + ia64_set_kr(0, v->arch.arch_vmx.vkr[0]); + ia64_set_kr(1, v->arch.arch_vmx.vkr[1]); + ia64_set_kr(2, v->arch.arch_vmx.vkr[2]); + ia64_set_kr(3, v->arch.arch_vmx.vkr[3]); + ia64_set_kr(4, v->arch.arch_vmx.vkr[4]); + ia64_set_kr(5, v->arch.arch_vmx.vkr[5]); + ia64_set_kr(6, v->arch.arch_vmx.vkr[6]); + ia64_set_kr(7, v->arch.arch_vmx.vkr[7]); + /* Guest vTLB is not required to be switched explicitly, since + * anchored in vcpu */ +} + +static void vmx_create_event_channels(struct vcpu *v) +{ + vcpu_iodata_t *p; + struct vcpu *o; + + if (v->vcpu_id == 0) { + /* Ugly: create event channels for every vcpu when vcpu 0 + starts, so that they're available for ioemu to bind to. */ + for_each_vcpu(v->domain, o) { + p = get_vio(v->domain, o->vcpu_id); + o->arch.arch_vmx.xen_port = p->vp_eport = + alloc_unbound_xen_event_channel(o, 0); + gdprintk(XENLOG_INFO, "Allocated port %ld for hvm.\n", + o->arch.arch_vmx.xen_port); + } + } +} + +/* + * Event channel has destoryed in domain_kill(), so we needn't + * do anything here + */ +static void vmx_release_assist_channel(struct vcpu *v) +{ + return; +} + +/* + * Initialize VMX envirenment for guest. Only the 1st vp/vcpu + * is registered here. + */ +int +vmx_final_setup_guest(struct vcpu *v) +{ + vpd_t *vpd; + int rc; + struct switch_stack *sw; + + vpd = alloc_vpd(); + ASSERT(vpd); + if (!vpd) + return -ENOMEM; + + v->arch.privregs = (mapped_regs_t *)vpd; + vcpu_share_privregs_with_guest(v); + vpd->vpd_low.virt_env_vaddr = vm_buffer; + + /* Per-domain vTLB and vhpt implementation. Now vmx domain will stick + * to this solution. 
Maybe it can be deferred until we know created + * one as vmx domain */ +#ifndef HASH_VHPT + rc = init_domain_tlb(v); + if (rc) + return rc; +#endif + vmx_create_event_channels(v); + + /* v->arch.schedule_tail = arch_vmx_do_launch; */ + vmx_create_vp(v); + + /* Physical mode emulation initialization, including + * emulation ID allcation and related memory request + */ + physical_mode_init(v); + + vlsapic_reset(v); + vtm_init(v); + + /* Set up guest 's indicator for VTi domain*/ + set_bit(ARCH_VMX_DOMAIN, &v->arch.arch_vmx.flags); + + /* Initialize pNonSys=1 for the first context switching */ + sw = (struct switch_stack *)vcpu_regs(v) - 1; + sw->pr = (1UL << PRED_NON_SYSCALL); + + return 0; +} + +void +vmx_relinquish_guest_resources(struct domain *d) +{ + struct vcpu *v; + + for_each_vcpu(d, v) + vmx_release_assist_channel(v); +} + +void +vmx_relinquish_vcpu_resources(struct vcpu *v) +{ + vtime_t *vtm = &(v->arch.arch_vmx.vtm); + + kill_timer(&vtm->vtm_timer); + + free_domain_tlb(v); + free_vpd(v); +} + +typedef struct io_range { + unsigned long start; + unsigned long size; + unsigned long type; +} io_range_t; + +static const io_range_t io_ranges[] = { + {VGA_IO_START, VGA_IO_SIZE, GPFN_FRAME_BUFFER}, + {MMIO_START, MMIO_SIZE, GPFN_LOW_MMIO}, + {LEGACY_IO_START, LEGACY_IO_SIZE, GPFN_LEGACY_IO}, + {IO_SAPIC_START, IO_SAPIC_SIZE, GPFN_IOSAPIC}, + {PIB_START, PIB_SIZE, GPFN_PIB}, +}; + +// The P2M table is built in libxc/ia64/xc_ia64_hvm_build.c @ setup_guest() +// so only mark IO memory space here +static void vmx_build_io_physmap_table(struct domain *d) +{ + unsigned long i, j; + + /* Mark I/O ranges */ + for (i = 0; i < (sizeof(io_ranges) / sizeof(io_range_t)); i++) { + for (j = io_ranges[i].start; + j < io_ranges[i].start + io_ranges[i].size; j += PAGE_SIZE) + (void)__assign_domain_page(d, j, io_ranges[i].type, + ASSIGN_writable); + } + +} + +void vmx_setup_platform(struct domain *d) +{ + ASSERT(d != dom0); /* only for non-privileged vti domain */ + + vmx_build_io_physmap_table(d); + + d->arch.vmx_platform.shared_page_va = + (unsigned long)__va(__gpa_to_mpa(d, IO_PAGE_START)); + /* For buffered IO requests. 
*/ + spin_lock_init(&d->arch.hvm_domain.buffered_io_lock); + d->arch.hvm_domain.buffered_io_va = + (unsigned long)__va(__gpa_to_mpa(d, BUFFER_IO_PAGE_START)); + d->arch.hvm_domain.buffered_pio_va = + (unsigned long)__va(__gpa_to_mpa(d, BUFFER_PIO_PAGE_START)); + /* TEMP */ + d->arch.vmx_platform.pib_base = 0xfee00000UL; + + d->arch.sal_data = xmalloc(struct xen_sal_data); + + /* Only open one port for I/O and interrupt emulation */ + memset(&d->shared_info->evtchn_mask[0], 0xff, + sizeof(d->shared_info->evtchn_mask)); + + /* Initialize iosapic model within hypervisor */ + viosapic_init(d); +} + +void vmx_do_launch(struct vcpu *v) +{ + vmx_load_all_rr(v); +} diff -Naurp xen/arch/ia64/vmx/vmx_ivt.S xen-redhat/arch/ia64/vmx/vmx_ivt.S --- xen/arch/ia64/vmx/vmx_ivt.S +++ xen-redhat/arch/ia64/vmx/vmx_ivt.S @@ -208,11 +208,8 @@ vmx_itlb_loop: ld8 r18=[r16] ;; adds r19=VPD(VPSR),r18 - movl r20=__vsa_base ;; ld8 r19=[r19] - ld8 r20=[r20] - ;; br.sptk ia64_vmm_entry ;; vmx_itlb_out: @@ -289,11 +286,8 @@ vmx_dtlb_loop: ld8 r18=[r16] ;; adds r19=VPD(VPSR),r18 - movl r20=__vsa_base ;; ld8 r19=[r19] - ld8 r20=[r20] - ;; br.sptk ia64_vmm_entry ;; vmx_dtlb_out: @@ -1011,7 +1005,7 @@ END(vmx_speculation_vector) // 0x5900 Entry 29 (size 16 bundles) Debug (16,28,56) ENTRY(vmx_debug_vector) VMX_DBG_FAULT(29) - VMX_FAULT(29) + VMX_REFLECT(29) END(vmx_debug_vector) .org vmx_ia64_ivt+0x5a00 diff -Naurp xen/arch/ia64/vmx/vmx_minstate.h xen-redhat/arch/ia64/vmx/vmx_minstate.h --- xen/arch/ia64/vmx/vmx_minstate.h +++ xen-redhat/arch/ia64/vmx/vmx_minstate.h @@ -59,24 +59,16 @@ #define PAL_VSA_SYNC_READ \ /* begin to call pal vps sync_read */ \ - add r25=IA64_VPD_BASE_OFFSET, r21; \ - movl r20=__vsa_base; \ - ;; \ - ld8 r25=[r25]; /* read vpd base */ \ - ld8 r20=[r20]; /* read entry point */ \ - ;; \ - add r20=PAL_VPS_SYNC_READ,r20; \ - ;; \ { .mii; \ + add r25=IA64_VPD_BASE_OFFSET, r21; \ nop 0x0; \ mov r24=ip; \ - mov b0=r20; \ ;; \ }; \ { .mmb; \ add r24 = 0x20, r24; \ - nop 0x0; \ - br.cond.sptk b0; /* call the service */ \ + ld8 r25 = [r25]; \ + br.cond.sptk vmx_vps_sync_read; /* call the service */ \ ;; \ }; \ diff -Naurp xen/arch/ia64/vmx/vmx_phy_mode.c xen-redhat/arch/ia64/vmx/vmx_phy_mode.c --- xen/arch/ia64/vmx/vmx_phy_mode.c +++ xen-redhat/arch/ia64/vmx/vmx_phy_mode.c @@ -237,7 +237,12 @@ void switch_mm_mode(VCPU *vcpu, IA64_PSR old_psr, IA64_PSR new_psr) { int act; - act = mm_switch_action(old_psr, new_psr); + /* Switch to physical mode when injecting PAL_INIT */ + if (unlikely(MODE_IND(new_psr) == 0 && + vcpu_regs(vcpu)->cr_iip == PAL_INIT_ENTRY)) + act = SW_V2P; + else + act = mm_switch_action(old_psr, new_psr); perfc_incra(vmx_switch_mm_mode, act); switch (act) { case SW_V2P: diff -Naurp xen/arch/ia64/vmx/vmx_process.c xen-redhat/arch/ia64/vmx/vmx_process.c --- xen/arch/ia64/vmx/vmx_process.c +++ xen-redhat/arch/ia64/vmx/vmx_process.c @@ -76,7 +76,14 @@ static u64 vec2off[68] = {0x0,0x400,0x80 0x7f00 }; - +void vmx_lazy_load_fpu(struct vcpu *vcpu) +{ + if (FP_PSR(vcpu) & IA64_PSR_DFH) { + FP_PSR(vcpu) = IA64_PSR_MFH; + if (__ia64_per_cpu_var(fp_owner) != vcpu) + __ia64_load_fpu(vcpu->arch._thread.fph); + } +} void vmx_reflect_interruption(u64 ifa, u64 isr, u64 iim, u64 vec, REGS *regs) @@ -86,53 +93,65 @@ void vmx_reflect_interruption(u64 ifa, u u64 vpsr = VCPU(vcpu, vpsr); vector = vec2off[vec]; - if(!(vpsr&IA64_PSR_IC)&&(vector!=IA64_DATA_NESTED_TLB_VECTOR)){ - panic_domain(regs, "Guest nested fault vector=%lx!\n", vector); - } switch (vec) { - + case 5: // IA64_DATA_NESTED_TLB_VECTOR + break; case 
22: // IA64_INST_ACCESS_RIGHTS_VECTOR + if (!(vpsr & IA64_PSR_IC)) + goto nested_fault; if (vhpt_access_rights_fixup(vcpu, ifa, 0)) return; break; case 25: // IA64_DISABLED_FPREG_VECTOR - - if (FP_PSR(vcpu) & IA64_PSR_DFH) { - FP_PSR(vcpu) = IA64_PSR_MFH; - if (__ia64_per_cpu_var(fp_owner) != vcpu) - __ia64_load_fpu(vcpu->arch._thread.fph); - } + if (!(vpsr & IA64_PSR_IC)) + goto nested_fault; + vmx_lazy_load_fpu(vcpu); if (!(VCPU(vcpu, vpsr) & IA64_PSR_DFH)) { regs->cr_ipsr &= ~IA64_PSR_DFH; return; } break; - + case 32: // IA64_FP_FAULT_VECTOR + if (!(vpsr & IA64_PSR_IC)) + goto nested_fault; // handle fpswa emulation // fp fault status = handle_fpu_swa(1, regs, isr); if (!status) { vcpu_increment_iip(vcpu); return; - } else if (IA64_RETRY == status) - return; + } break; case 33: // IA64_FP_TRAP_VECTOR + if (!(vpsr & IA64_PSR_IC)) + goto nested_fault; //fp trap status = handle_fpu_swa(0, regs, isr); if (!status) return; - else if (IA64_RETRY == status) { - vcpu_decrement_iip(vcpu); + break; + + case 29: // IA64_DEBUG_VECTOR + case 35: // IA64_TAKEN_BRANCH_TRAP_VECTOR + case 36: // IA64_SINGLE_STEP_TRAP_VECTOR + if (vmx_guest_kernel_mode(regs) + && current->domain->debugger_attached) { + domain_pause_for_debugger(); return; } + if (!(vpsr & IA64_PSR_IC)) + goto nested_fault; + break; + + default: + if (!(vpsr & IA64_PSR_IC)) + goto nested_fault; break; - } VCPU(vcpu,isr)=isr; VCPU(vcpu,iipa) = regs->cr_iip; @@ -142,6 +161,10 @@ void vmx_reflect_interruption(u64 ifa, u set_ifa_itir_iha(vcpu,ifa,1,1,1); } inject_guest_interruption(vcpu, vector); + return; + + nested_fault: + panic_domain(regs, "Guest nested fault vector=%lx!\n", vector); } diff -Naurp xen/arch/ia64/vmx/vmx_utility.c xen-redhat/arch/ia64/vmx/vmx_utility.c --- xen/arch/ia64/vmx/vmx_utility.c +++ xen-redhat/arch/ia64/vmx/vmx_utility.c @@ -26,7 +26,7 @@ #include <asm/processor.h> #include <asm/vmx_mm_def.h> - +#ifdef CHECK_FAULT /* * Return: * 0: Not reserved indirect registers @@ -71,6 +71,7 @@ is_reserved_indirect_register ( return 0; } +#endif /* * Return: @@ -207,7 +208,7 @@ check_psr_rsv_fields (u64 value) } - +#ifdef CHECK_FAULT /* * Return: * 1: CR reserved fields are not zero @@ -310,9 +311,9 @@ check_cr_rsv_fields (int index, u64 valu panic ("Unsupported CR"); return 0; } +#endif - - +#if 0 /* * Return: * 0: Indirect Reg reserved fields are not zero @@ -361,7 +362,7 @@ check_indirect_reg_rsv_fields ( int type return 1; } - +#endif diff -Naurp xen/arch/ia64/vmx/vmx_vcpu.c xen-redhat/arch/ia64/vmx/vmx_vcpu.c --- xen/arch/ia64/vmx/vmx_vcpu.c +++ xen-redhat/arch/ia64/vmx/vmx_vcpu.c @@ -96,8 +96,7 @@ vmx_vcpu_set_psr(VCPU *vcpu, unsigned lo */ VCPU(vcpu,vpsr) = value & (~ (IA64_PSR_ID |IA64_PSR_DA | IA64_PSR_DD | - IA64_PSR_SS | IA64_PSR_ED | IA64_PSR_IA - )); + IA64_PSR_ED | IA64_PSR_IA)); if ( !old_psr.i && (value & IA64_PSR_I) ) { // vpsr.i 0->1 diff -Naurp xen/arch/ia64/vmx/vmx_virt.c xen-redhat/arch/ia64/vmx/vmx_virt.c --- xen/arch/ia64/vmx/vmx_virt.c +++ xen-redhat/arch/ia64/vmx/vmx_virt.c @@ -178,8 +178,8 @@ static IA64FAULT vmx_emul_mov_to_psr(VCP { u64 val; - if(vcpu_get_gr_nat(vcpu, inst.M35.r2, &val) != IA64_NO_FAULT) - panic_domain(vcpu_regs(vcpu),"get_psr nat bit fault\n"); + if (vcpu_get_gr_nat(vcpu, inst.M35.r2, &val) != IA64_NO_FAULT) + panic_domain(vcpu_regs(vcpu),"get_psr nat bit fault\n"); return vmx_vcpu_set_psr_l(vcpu, val); } @@ -892,7 +892,6 @@ static IA64FAULT vmx_emul_mov_to_rr(VCPU static IA64FAULT vmx_emul_mov_to_dbr(VCPU *vcpu, INST64 inst) { u64 r3,r2; - return IA64_NO_FAULT; #ifdef CHECK_FAULT 
IA64_PSR vpsr; vpsr.val=vmx_vcpu_get_psr(vcpu); @@ -916,7 +915,6 @@ static IA64FAULT vmx_emul_mov_to_dbr(VCP static IA64FAULT vmx_emul_mov_to_ibr(VCPU *vcpu, INST64 inst) { u64 r3,r2; - return IA64_NO_FAULT; #ifdef CHECK_FAULT IA64_PSR vpsr; vpsr.val=vmx_vcpu_get_psr(vcpu); @@ -934,7 +932,7 @@ static IA64FAULT vmx_emul_mov_to_ibr(VCP return IA64_FAULT; #endif //CHECK_FAULT } - return (vmx_vcpu_set_ibr(vcpu,r3,r2)); + return vmx_vcpu_set_ibr(vcpu,r3,r2); } static IA64FAULT vmx_emul_mov_to_pmc(VCPU *vcpu, INST64 inst) @@ -1064,6 +1062,7 @@ static IA64FAULT vmx_emul_mov_from_pkr(V static IA64FAULT vmx_emul_mov_from_dbr(VCPU *vcpu, INST64 inst) { u64 r3,r1; + IA64FAULT res; #ifdef CHECK_FAULT if(check_target_register(vcpu, inst.M43.r1)){ set_illegal_op_isr(vcpu); @@ -1094,13 +1093,16 @@ static IA64FAULT vmx_emul_mov_from_dbr(V return IA64_FAULT; } #endif //CHECK_FAULT - vmx_vcpu_get_dbr(vcpu,r3,&r1); + res = vmx_vcpu_get_dbr(vcpu, r3, &r1); + if (res != IA64_NO_FAULT) + return res; return vcpu_set_gr(vcpu, inst.M43.r1, r1,0); } static IA64FAULT vmx_emul_mov_from_ibr(VCPU *vcpu, INST64 inst) { u64 r3,r1; + IA64FAULT res; #ifdef CHECK_FAULT if(check_target_register(vcpu, inst.M43.r1)){ set_illegal_op_isr(vcpu); @@ -1131,7 +1133,9 @@ static IA64FAULT vmx_emul_mov_from_ibr(V return IA64_FAULT; } #endif //CHECK_FAULT - vmx_vcpu_get_ibr(vcpu,r3,&r1); + res = vmx_vcpu_get_ibr(vcpu, r3, &r1); + if (res != IA64_NO_FAULT) + return res; return vcpu_set_gr(vcpu, inst.M43.r1, r1,0); } @@ -1558,22 +1562,38 @@ if ( (cause == 0xff && opcode == 0x1e000 break; case EVENT_VMSW: printk ("Unimplemented instruction %ld\n", cause); - status=IA64_FAULT; + status=IA64_FAULT; break; default: - panic_domain(regs,"unknown cause %ld, iip: %lx, ipsr: %lx\n", cause,regs->cr_iip,regs->cr_ipsr); + panic_domain(regs,"unknown cause %ld, iip: %lx, ipsr: %lx\n", + cause,regs->cr_iip,regs->cr_ipsr); break; }; #if 0 - if (status == IA64_FAULT) + if (status != IA64_NO_FAULT) panic("Emulation failed with cause %d:\n", cause); #endif - if ( status == IA64_NO_FAULT && cause !=EVENT_RFI ) { - vcpu_increment_iip(vcpu); + switch (status) { + case IA64_RSVDREG_FAULT: + set_rsv_reg_field_isr(vcpu); + rsv_reg_field(vcpu); + break; + case IA64_ILLOP_FAULT: + set_illegal_op_isr(vcpu); + illegal_op(vcpu); + break; + case IA64_FAULT: + /* Registers aleady set. 
*/ + break; + case IA64_NO_FAULT: + if ( cause != EVENT_RFI ) + vcpu_increment_iip(vcpu); + break; } + recover_if_physical_mode(vcpu); return; diff -Naurp xen/arch/ia64/vmx/vtlb.c xen-redhat/arch/ia64/vmx/vtlb.c --- xen/arch/ia64/vmx/vtlb.c +++ xen-redhat/arch/ia64/vmx/vtlb.c @@ -21,34 +21,14 @@ * XiaoYan Feng (Fleming Feng) (Fleming.feng@intel.com) */ -#include <linux/sched.h> -#include <asm/tlb.h> -#include <xen/mm.h> -#include <asm/vmx_mm_def.h> -#include <asm/gcc_intrin.h> -#include <linux/interrupt.h> #include <asm/vmx_vcpu.h> -#include <asm/vmx_phy_mode.h> -#include <asm/vmmu.h> -#include <asm/tlbflush.h> -#include <asm/regionreg.h> -#define MAX_CCH_LENGTH 40 thash_data_t *__alloc_chain(thash_cb_t *); -static void cch_mem_init(thash_cb_t *hcb) +static inline void cch_mem_init(thash_cb_t *hcb) { - int num; - thash_data_t *p; - - hcb->cch_freelist = p = hcb->cch_buf; - num = (hcb->cch_sz/sizeof(thash_data_t))-1; - do{ - p->next =p+1; - p++; - num--; - }while(num); - p->next = NULL; + hcb->cch_free_idx = 0; + hcb->cch_freelist = NULL; } static thash_data_t *cch_alloc(thash_cb_t *hcb) @@ -56,8 +36,16 @@ static thash_data_t *cch_alloc(thash_cb_ thash_data_t *p; if ( (p = hcb->cch_freelist) != NULL ) { hcb->cch_freelist = p->next; + return p; } - return p; + if (hcb->cch_free_idx < hcb->cch_sz/sizeof(thash_data_t)) { + p = &((thash_data_t *)hcb->cch_buf)[hcb->cch_free_idx++]; + p->page_flags = 0; + p->itir = 0; + p->next = NULL; + return p; + } + return NULL; } /* @@ -298,6 +286,17 @@ u64 guest_vhpt_lookup(u64 iha, u64 *pte) return ret; } +static thash_data_t * vtlb_thash(PTA vpta, u64 va, u64 vrr, u64 *tag) +{ + u64 index, pfn, rid; + + pfn = REGION_OFFSET(va) >> _REGION_PAGE_SIZE(vrr); + rid = _REGION_ID(vrr); + index = (pfn ^ rid) & ((1UL << (vpta.size - 5)) - 1); + *tag = pfn ^ (rid << 39); + return (thash_data_t *)((vpta.base << PTA_BASE_SHIFT) + (index << 5)); +} + /* * purge software guest tlb */ @@ -320,7 +319,7 @@ static void vtlb_purge(VCPU *v, u64 va, size = PSIZE(rr_ps); vrr.ps = rr_ps; while (num) { - cur = vsa_thash(hcb->pta, curadr, vrr.rrval, &tag); + cur = vtlb_thash(hcb->pta, curadr, vrr.rrval, &tag); while (cur) { if (cur->etag == tag && cur->ps == rr_ps) cur->etag = 1UL << 63; @@ -413,7 +412,7 @@ void vtlb_insert(VCPU *v, u64 pte, u64 i vcpu_get_rr(v, va, &vrr.rrval); vrr.ps = itir_ps(itir); VMX(v, psbits[va >> 61]) |= (1UL << vrr.ps); - hash_table = vsa_thash(hcb->pta, va, vrr.rrval, &tag); + hash_table = vtlb_thash(hcb->pta, va, vrr.rrval, &tag); cch = hash_table; while (cch) { if (INVALID_TLB(cch)) { @@ -556,13 +555,15 @@ void thash_purge_and_insert(VCPU *v, u64 } else { u64 psr; - phy_pte &= ~PAGE_FLAGS_RV_MASK; - psr = ia64_clear_ic(); - ia64_itc(type + 1, ifa, phy_pte, ps); - ia64_set_psr(psr); - ia64_srlz_i(); - // ps < mrr.ps, this is not supported - // panic_domain(NULL, "%s: ps (%lx) < mrr.ps \n", __func__, ps); + vtlb_insert(v, pte, itir, ifa); + vcpu_quick_region_set(PSCBX(v,tc_regions),ifa); + if (!(pte & VTLB_PTE_IO)) { + phy_pte &= ~PAGE_FLAGS_RV_MASK; + psr = ia64_clear_ic(); + ia64_itc(type + 1, ifa, phy_pte, ps); + ia64_set_psr(psr); + ia64_srlz_i(); + } } } else{ @@ -618,6 +619,30 @@ void thash_purge_all(VCPU *v) local_flush_tlb_all(); } +static void __thash_purge_all(void *arg) +{ + struct vcpu *v = arg; + + BUG_ON(vcpu_runnable(v) || v->is_running); + thash_purge_all(v); +} + +void vmx_vcpu_flush_vtlb_all(VCPU *v) +{ + if (v == current) { + thash_purge_all(v); + return; + } + + /* SMP safe */ + vcpu_pause(v); + if (v->processor == 
smp_processor_id())
+        __thash_purge_all(v);
+    else
+        smp_call_function_single(v->processor, __thash_purge_all, v, 1, 1);
+    vcpu_unpause(v);
+}
+
 /*
  * Lookup the hash table and its collision chain to find an entry
@@ -645,30 +670,38 @@ thash_data_t *vtlb_lookup(VCPU *v, u64 v
         ps = __ffs(psbits);
         psbits &= ~(1UL << ps);
         vrr.ps = ps;
-        cch = vsa_thash(hcb->pta, va, vrr.rrval, &tag);
+        cch = vtlb_thash(hcb->pta, va, vrr.rrval, &tag);
         do {
             if (cch->etag == tag && cch->ps == ps)
-                return cch;
+                goto found;
             cch = cch->next;
         } while(cch);
     }
     return NULL;
+found:
+    if (unlikely(!cch->ed && is_data == ISIDE_TLB)) {
+        /*The case is very rare, and it may lead to incorrect setting
+          for itlb's ed bit! Purge it from hash vTLB and let guest os
+          determine the ed bit of the itlb entry.*/
+        vtlb_purge(v, va, ps);
+        cch = NULL;
+    }
+    return cch;
 }
 /*
  * Initialize internal control data before service.
  */
-void thash_init(thash_cb_t *hcb, u64 sz)
+static void thash_init(thash_cb_t *hcb, u64 sz)
 {
     int num;
-    thash_data_t *head, *p;
+    thash_data_t *head;
     hcb->pta.val = (unsigned long)hcb->hash;
     hcb->pta.vf = 1;
     hcb->pta.ve = 1;
     hcb->pta.size = sz;
-    hcb->cch_rec_head = hcb->hash;
     head=hcb->hash;
     num = (hcb->hash_sz/sizeof(thash_data_t));
@@ -680,16 +713,47 @@ void thash_init(thash_cb_t *hcb, u64 sz)
         head++;
         num--;
     }while(num);
+
+    hcb->cch_free_idx = 0;
+    hcb->cch_freelist = NULL;
+}
+
+int thash_alloc(thash_cb_t *hcb, u64 sz_log2, char *what)
+{
+    struct page_info *page;
+    void * vbase;
+    u64 sz = 1UL << sz_log2;
+
+    page = alloc_domheap_pages(NULL, (sz_log2 + 1 - PAGE_SHIFT), 0);
+    if (page == NULL) {
+        printk("Not enough contiguous memory(%ldKB) for init_domain_%s\n",
+               sz >> (10 - 1), what);
+        return -ENOMEM;
+    }
+    vbase = page_to_virt(page);
+    memset(vbase, 0, sz + sz); // hash + collisions chain
+    if (sz_log2 >= 20 - 1)
+        printk(XENLOG_DEBUG "Allocate domain %s at 0x%p(%ldMB)\n",
+               what, vbase, sz >> (20 - 1));
+    else
+        printk(XENLOG_DEBUG "Allocate domain %s at 0x%p(%ldKB)\n",
+               what, vbase, sz >> (10 - 1));
-    hcb->cch_freelist = p = hcb->cch_buf;
-    num = hcb->cch_sz / sizeof(thash_data_t);
-    do{
-        p->page_flags = 0;
-        p->itir = 0;
-        p->next =p+1;
-        p++;
-        num--;
-    }while(num);
+    hcb->hash = vbase;
+    hcb->hash_sz = sz;
+    hcb->cch_buf = (void *)((u64)vbase + hcb->hash_sz);
+    hcb->cch_sz = sz;
+    thash_init(hcb, sz_log2);
+    return 0;
+}
-    (p - 1)->next = NULL;
+void thash_free(thash_cb_t *hcb)
+{
+    struct page_info *page;
+
+    if (hcb->hash) {
+        page = virt_to_page(hcb->hash);
+        free_domheap_pages(page, hcb->pta.size + 1 - PAGE_SHIFT);
+        hcb->hash = 0;
+    }
 }
diff -Naurp xen/arch/ia64/xen/dom0_ops.c xen-redhat/arch/ia64/xen/dom0_ops.c
--- xen/arch/ia64/xen/dom0_ops.c
+++ xen-redhat/arch/ia64/xen/dom0_ops.c
@@ -214,6 +214,39 @@ long arch_do_domctl(xen_domctl_t *op, XE
     }
     break;
+    case XEN_DOMCTL_set_address_size:
+    {
+        struct domain *d = rcu_lock_domain_by_id(op->domain);
+
+        ret = -ESRCH;
+        if (d == NULL)
+            break;
+
+        ret = -EINVAL;
+        if (op->u.address_size.size == BITS_PER_LONG)
+            ret = 0;
+
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_get_address_size:
+    {
+        struct domain *d = rcu_lock_domain_by_id(op->domain);
+
+        ret = -ESRCH;
+        if (d == NULL)
+            break;
+
+        ret = 0;
+        op->u.address_size.size = BITS_PER_LONG;
+        rcu_unlock_domain(d);
+
+        if (copy_to_guest(u_domctl, op, 1))
+            ret = -EFAULT;
+    }
+    break;
+
    default:
        printk("arch_do_domctl: unrecognized domctl: %d!!!\n",op->cmd);
        ret = -ENOSYS;
@@ -223,12 +256,6 @@ long arch_do_domctl(xen_domctl_t *op, XE
    return ret;
 }
-/*
- * Temporarily disable the 
NUMA PHYSINFO code until the rest of the - * changes are upstream. - */ -#undef IA64_NUMA_PHYSINFO - long arch_do_sysctl(xen_sysctl_t *op, XEN_GUEST_HANDLE(xen_sysctl_t) u_sysctl) { long ret = 0; @@ -237,84 +264,68 @@ long arch_do_sysctl(xen_sysctl_t *op, XE { case XEN_SYSCTL_physinfo: { -#ifdef IA64_NUMA_PHYSINFO - int i; - node_data_t *chunks; - u64 *map, cpu_to_node_map[MAX_NUMNODES]; -#endif + int i, node_cpus = 0; + uint32_t max_array_ent; + XEN_GUEST_HANDLE_64(uint32_t) cpu_to_node_arr; xen_sysctl_physinfo_t *pi = &op->u.physinfo; - pi->threads_per_core = - cpus_weight(cpu_sibling_map[0]); + max_array_ent = pi->max_cpu_id; + cpu_to_node_arr = pi->cpu_to_node; + + pi->cpu_to_node = cpu_to_node_arr; + pi->threads_per_core = cpus_weight(cpu_sibling_map[0]); pi->cores_per_socket = cpus_weight(cpu_core_map[0]) / pi->threads_per_core; - pi->sockets_per_node = - num_online_cpus() / cpus_weight(cpu_core_map[0]); -#ifndef IA64_NUMA_PHYSINFO - pi->nr_nodes = 1; -#endif + pi->nr_nodes = num_online_nodes(); + /* + * RHEL5 ABI compat: + * Newer userspace expects 'sockets_per_node' to actually + * contain 'nr_cpus' data. + */ + if (op->interface_version > XEN_SYSCTL_INTERFACE_VERSION) + pi->sockets_per_node = (u32)num_online_cpus(); + else + { + /* + * Guess at a sockets_per_node value. Use the maximum number of + * CPUs per node to avoid deconfigured CPUs breaking the average. + */ + for_each_online_node(i) + node_cpus = max(node_cpus, cpus_weight(node_to_cpumask(i))); + + pi->sockets_per_node = node_cpus / + (pi->cores_per_socket * pi->threads_per_core); + } + pi->total_pages = total_pages; pi->free_pages = avail_domheap_pages(); - pi->scrub_pages = avail_scrub_pages(); + pi->scrub_pages = 0; pi->cpu_khz = local_cpu_data->proc_freq / 1000; memset(pi->hw_cap, 0, sizeof(pi->hw_cap)); - //memcpy(pi->hw_cap, boot_cpu_data.x86_capability, NCAPINTS*4); + + pi->max_cpu_id = last_cpu(cpu_online_map); + max_array_ent = min_t(uint32_t, max_array_ent, pi->max_cpu_id); + ret = 0; -#ifdef IA64_NUMA_PHYSINFO - /* fetch memory_chunk pointer from guest */ - get_xen_guest_handle(chunks, pi->memory_chunks); - - printk("chunks=%p, num_node_memblks=%u\n", chunks, num_node_memblks); - /* if it is set, fill out memory chunk array */ - if (chunks != NULL) { - if (num_node_memblks == 0) { - /* Non-NUMA machine. Put pseudo-values. */ - node_data_t data; - data.node_start_pfn = 0; - data.node_spanned_pages = total_pages; - data.node_id = 0; - /* copy memory chunk structs to guest */ - if (copy_to_guest_offset(pi->memory_chunks, 0, &data, 1)) { - ret = -EFAULT; - break; - } - } else { - for (i = 0; i < num_node_memblks && i < PUBLIC_MAXCHUNKS; i++) { - node_data_t data; - data.node_start_pfn = node_memblk[i].start_paddr >> - PAGE_SHIFT; - data.node_spanned_pages = node_memblk[i].size >> PAGE_SHIFT; - data.node_id = node_memblk[i].nid; - /* copy memory chunk structs to guest */ - if (copy_to_guest_offset(pi->memory_chunks, i, &data, 1)) { + /* + * RHEL5 ABI compat: + * Only fill in extended NUMA info if a newer userspace + * is talking to us + */ + if (op->interface_version > XEN_SYSCTL_INTERFACE_VERSION) + { + if (!guest_handle_is_null(cpu_to_node_arr)) { + for (i = 0; i <= max_array_ent; i++) { + uint32_t node = cpu_online(i) ? 
cpu_to_node(i) : ~0u; + if (copy_to_guest_offset(cpu_to_node_arr, i, &node, 1)) { ret = -EFAULT; break; } } } } - /* set number of notes */ - pi->nr_nodes = num_online_nodes(); - - /* fetch cpu_to_node pointer from guest */ - get_xen_guest_handle(map, pi->cpu_to_node); - - /* if set, fill out cpu_to_node array */ - if (map != NULL) { - /* copy cpu to node mapping to domU */ - memset(cpu_to_node_map, 0, sizeof(cpu_to_node_map)); - for (i = 0; i < num_online_cpus(); i++) { - cpu_to_node_map[i] = cpu_to_node(i); - if (copy_to_guest_offset(pi->cpu_to_node, i, - &(cpu_to_node_map[i]), 1)) { - ret = -EFAULT; - break; - } - } - } -#endif if ( copy_to_guest(u_sysctl, op, 1) ) ret = -EFAULT; diff -Naurp xen/arch/ia64/xen/domain.c xen-redhat/arch/ia64/xen/domain.c --- xen/arch/ia64/xen/domain.c +++ xen-redhat/arch/ia64/xen/domain.c @@ -52,10 +52,11 @@ #include <asm/perfmon.h> #include <public/vcpu.h> -unsigned long dom0_size = 512*1024*1024; +/* dom0_size: default memory allocation for dom0 (~4GB) */ +unsigned long dom0_size = 4096UL*1024UL*1024UL; /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */ -static unsigned int dom0_max_vcpus = 1; +static unsigned int dom0_max_vcpus = 4; integer_param("dom0_max_vcpus", dom0_max_vcpus); extern unsigned long running_on_sim; @@ -237,6 +238,14 @@ void context_switch(struct vcpu *prev, s ia64_disable_vhpt_walker(); lazy_fp_switch(prev, current); + if (prev->arch.dbg_used || next->arch.dbg_used) { + /* + * Load debug registers either because they are valid or to clear + * the previous one. + */ + ia64_load_debug_regs(next->arch.dbr); + } + prev = ia64_switch_to(next); /* Note: ia64_switch_to does not return here at vcpu initialization. */ @@ -336,7 +345,6 @@ static void continue_cpu_idle_loop(void) #else irq_stat[cpu].idle_timestamp = jiffies; #endif - page_scrub_schedule_work(); while ( !softirq_pending(smp_processor_id()) ) default_idle(); raise_softirq(SCHEDULE_SOFTIRQ); @@ -553,6 +561,9 @@ int arch_domain_create(struct domain *d) goto fail_nomem; memset(&d->arch.mm, 0, sizeof(d->arch.mm)); + d->arch.relres = RELRES_not_started; + d->arch.mm_teardown_offset = 0; + INIT_LIST_HEAD(&d->arch.relmem_list); if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL) goto fail_nomem; @@ -695,13 +706,14 @@ int arch_set_info_guest(struct vcpu *v, return 0; } -static void relinquish_memory(struct domain *d, struct list_head *list) +static int relinquish_memory(struct domain *d, struct list_head *list) { struct list_head *ent; struct page_info *page; #ifndef __ia64__ unsigned long x, y; #endif + int ret = 0; /* Use a recursive lock, as we may enter 'free_domheap_page'. */ spin_lock_recursive(&d->page_alloc_lock); @@ -714,6 +726,7 @@ static void relinquish_memory(struct dom { /* Couldn't get a reference -- someone is freeing this page. */ ent = ent->next; + list_move_tail(&page->list, &d->arch.relmem_list); continue; } @@ -750,30 +763,72 @@ static void relinquish_memory(struct dom /* Follow the list chain and /then/ potentially free the page. */ ent = ent->next; BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY); + list_move_tail(&page->list, &d->arch.relmem_list); put_page(page); + + if (hypercall_preempt_check()) { + ret = -EAGAIN; + goto out; + } } + list_splice_init(&d->arch.relmem_list, list); + + out: spin_unlock_recursive(&d->page_alloc_lock); + return ret; } -void domain_relinquish_resources(struct domain *d) +int domain_relinquish_resources(struct domain *d) { - /* Relinquish guest resources for VT-i domain. 
*/ - if (d->vcpu[0] && VMX_DOMAIN(d->vcpu[0])) - vmx_relinquish_guest_resources(d); + int ret = 0; - /* Tear down shadow mode stuff. */ - mm_teardown(d); + switch (d->arch.relres) { + case RELRES_not_started: + /* Relinquish guest resources for VT-i domain. */ + if (d->arch.is_vti) + vmx_relinquish_guest_resources(d); + d->arch.relres = RELRES_mm_teardown; + /*fallthrough*/ + + case RELRES_mm_teardown: + /* Tear down shadow mode stuff. */ + ret = mm_teardown(d); + if (ret != 0) + return ret; + d->arch.relres = RELRES_xen; + /* fallthrough */ + + case RELRES_xen: + /* Relinquish every xen page of memory. */ + ret = relinquish_memory(d, &d->xenpage_list); + if (ret != 0) + return ret; + d->arch.relres = RELRES_dom; + /* fallthrough */ + + case RELRES_dom: + /* Relinquish every domain page of memory. */ + ret = relinquish_memory(d, &d->page_list); + if (ret != 0) + return ret; + d->arch.relres = RELRES_done; + /* fallthrough */ - /* Relinquish every page of memory. */ - relinquish_memory(d, &d->xenpage_list); - relinquish_memory(d, &d->page_list); + case RELRES_done: + break; + + default: + BUG(); + } - if (d->arch.is_vti && d->arch.sal_data) - xfree(d->arch.sal_data); + if (d->arch.is_vti && d->arch.sal_data) + xfree(d->arch.sal_data); - /* Free page used by xen oprofile buffer */ - free_xenoprof_pages(d); + /* Free page used by xen oprofile buffer */ + free_xenoprof_pages(d); + + return 0; } unsigned long @@ -1015,8 +1070,41 @@ static void loaddomainelfimage(struct do } } -void alloc_dom0(void) +static void calc_dom0_size(void) { + unsigned long domheap_pages; + unsigned long p2m_pages; + unsigned long spare_hv_pages; + unsigned long max_dom0_size; + + /* Estimate maximum memory we can safely allocate for dom0 + * by subtracting the p2m table allocation and a chunk of memory + * for DMA and PCI mapping from the available domheap pages. The + * chunk for DMA, PCI, etc., is a guestimate, as xen doesn't seem + * to have a good idea of what those requirements might be ahead + * of time, calculated at 128MB + 1MB per 4GB of system memory */ + domheap_pages = avail_domheap_pages(); + p2m_pages = domheap_pages / PTRS_PER_PTE; + spare_hv_pages = 8192 + (domheap_pages / 4096); + max_dom0_size = (domheap_pages - (p2m_pages + spare_hv_pages)) + * PAGE_SIZE; + printk("Maximum permitted dom0 size: %luMB\n", + max_dom0_size / (1024*1024)); + + /* validate proposed dom0_size, fix up as needed */ + if (dom0_size > max_dom0_size) { + printk("Reducing dom0 memory allocation from %luK to %luK " + "to fit available memory\n", + dom0_size / 1024, max_dom0_size / 1024); + dom0_size = max_dom0_size; + } + + /* dom0_mem=0 can be passed in to give all available mem to dom0 */ + if (dom0_size == 0) { + printk("Allocating all available memory to dom0\n"); + dom0_size = max_dom0_size; + } + /* Check dom0 size. */ if (dom0_size < 4 * 1024 * 1024) { panic("dom0_mem is too small, boot aborted" @@ -1081,6 +1169,8 @@ int construct_dom0(struct domain *d, printk("*** LOADING DOMAIN 0 ***\n"); + calc_dom0_size(); + max_pages = dom0_size / PAGE_SIZE; d->max_pages = max_pages; d->tot_pages = 0; @@ -1260,10 +1350,12 @@ extern void cpu_halt(void); void machine_halt(void) { console_start_sync(); - if (running_on_sim) - printk ("machine_halt called. spinning...\n"); - else - cpu_halt(); + +#ifdef CONFIG_SMP + smp_send_stop(); +#endif + + printk ("machine_halt called. 
spinning...\n"); while(1); } diff -Naurp xen/arch/ia64/xen/dom_fw.c xen-redhat/arch/ia64/xen/dom_fw.c --- xen/arch/ia64/xen/dom_fw.c +++ xen-redhat/arch/ia64/xen/dom_fw.c @@ -144,6 +144,117 @@ build_pal_hypercall_bundles(u64 *imva, u ia64_fc(imva + 3); } +/* xen fpswa call stub. 14 bundles */ +extern const unsigned long xen_ia64_fpswa_call_stub[]; +extern const unsigned long xen_ia64_fpswa_call_stub_end[]; +extern const unsigned long xen_ia64_fpswa_call_stub_patch[]; +asm( + ".align 32\n" + ".proc xen_ia64_fpswa_call_stub;\n" + "xen_ia64_fpswa_call_stub:\n" + ".prologue\n" + "alloc r3 = ar.pfs, 8, 0, 0, 0\n" + ".body\n" + "mov r14 = in0\n" + "ld8 r15 = [in1], 8\n" + ";;\n" + "ld8 r16 = [in1]\n" + "ld8 r17 = [in2]\n" + "ld8 r18 = [in3]\n" + "ld8 r19 = [in4]\n" + "ld8 r20 = [in5]\n" + "ld8 r21 = [in6]\n" + "ld8 r22 = [in7], 8\n" + ";;\n" + "ld8 r23 = [in7], 8\n" + ";;\n" + "ld8 r24 = [in7], 8\n" + ";;\n" + "cmp.ne p6, p0 = r24, r0\n" + "ld8 r25 = [in7], 8\n" + ";;\n" + "(p6) tpa r24 = r24\n" + "cmp.ne p7, p0 = r25, r0\n" + "ld8 r26 = [in7], 8\n" + ";;\n" + "(p7)tpa r25 = r25\n" + "cmp.ne p8, p0 = r26, r0\n" + "ld8 r27 = [in7], 8\n" + ";;\n" + "(p8)tpa r26 = r26\n" + "cmp.ne p9, p0 = r27, r0\n" + ";;\n" + "tpa r27 = r27\n" + "xen_ia64_fpswa_call_stub_patch:" + "{\n" + "mov r2 = " FW_HYPERCALL_FPSWA_STR "\n" + "break " __IA64_XEN_HYPERCALL_DEFAULT_STR "\n" + "nop.i 0\n" + "}\n" + "st8 [in2] = r17\n" + "st8 [in3] = r18\n" + "st8 [in4] = r19\n" + "st8 [in5] = r20\n" + "st8 [in6] = r21\n" + "br.ret.sptk.many rp\n" + "xen_ia64_fpswa_call_stub_end:" + ".endp xen_ia64_fpswa_call_stub\n" +); + +static void +build_fpswa_hypercall_bundle(uint64_t *imva, uint64_t brkimm, uint64_t hypnum) +{ + INST64_A5 slot0; + INST64_I19 slot1; + INST64_I18 slot2; + IA64_BUNDLE bundle; + + /* slot0: mov r2 = hypnum (low 20 bits) */ + slot0.inst = 0; + slot0.qp = 0; + slot0.r1 = 2; + slot0.r3 = 0; + slot0.major = 0x9; + + slot0.s = 0; + slot0.imm9d = hypnum >> 7; + slot0.imm5c = hypnum >> 16; + slot0.imm7b = hypnum; + + /* slot1: break brkimm */ + slot1.inst = 0; + slot1.qp = 0; + slot1.x6 = 0; + slot1.x3 = 0; + slot1.major = 0x0; + slot1.i = brkimm >> 20; + slot1.imm20 = brkimm; + + /* slot2: nop.i */ + slot2.inst = 0; + slot2.qp = 0; + slot2.imm20 = 0; + slot2.y = 0; + slot2.x6 = 1; + slot2.x3 = 0; + slot2.i = 0; + slot2.major = 0; + + /* MII bundle */ + bundle.i64[0] = 0; + bundle.i64[1] = 0; + bundle.template = 0x0; /* MII */ + bundle.slot0 = slot0.inst; + bundle.slot1a = slot1.inst; + bundle.slot1b = slot1.inst >> 18; + bundle.slot2 = slot2.inst; + + imva[0] = bundle.i64[0]; + imva[1] = bundle.i64[1]; + ia64_fc(imva); + ia64_fc(imva + 1); +} + // builds a hypercall bundle at domain physical address static void dom_fpswa_hypercall_patch(struct domain *d, unsigned long imva) @@ -151,6 +262,10 @@ dom_fpswa_hypercall_patch(struct domain unsigned long *entry_imva, *patch_imva; const unsigned long entry_paddr = FW_HYPERCALL_FPSWA_ENTRY_PADDR; const unsigned long patch_paddr = FW_HYPERCALL_FPSWA_PATCH_PADDR; + const size_t stub_size = + (char*)xen_ia64_fpswa_call_stub_end - + (char*)xen_ia64_fpswa_call_stub; + size_t i; entry_imva = (unsigned long *)(imva + entry_paddr - FW_HYPERCALL_BASE_PADDR); @@ -160,9 +275,19 @@ dom_fpswa_hypercall_patch(struct domain /* Descriptor. 
*/ *entry_imva++ = patch_paddr; *entry_imva = 0; + /* see dom_fw.h */ + BUILD_BUG_ON((char*)xen_ia64_fpswa_call_stub_end - + (char*)xen_ia64_fpswa_call_stub > 0xff - 16); + + /* call stub */ + memcpy(patch_imva, xen_ia64_fpswa_call_stub, stub_size); + for (i = 0; i < stub_size; i++) + ia64_fc(imva + i); + patch_imva += + xen_ia64_fpswa_call_stub_patch - xen_ia64_fpswa_call_stub; + build_fpswa_hypercall_bundle(patch_imva, d->arch.breakimm, + FW_HYPERCALL_FPSWA); - build_hypercall_bundle(patch_imva, d->arch.breakimm, - FW_HYPERCALL_FPSWA, 1); } // builds a hypercall bundle at domain physical address @@ -489,7 +614,7 @@ efi_mdt_cmp(const void *a, const void *b #define NFUNCPTRS 16 #define NUM_EFI_SYS_TABLES 6 -#define NUM_MEM_DESCS 64 //large enough +#define NUM_MEM_DESCS 256 //large enough struct fw_tables { efi_system_table_t efi_systab; diff -Naurp xen/arch/ia64/xen/faults.c xen-redhat/arch/ia64/xen/faults.c --- xen/arch/ia64/xen/faults.c +++ xen-redhat/arch/ia64/xen/faults.c @@ -93,6 +93,8 @@ void reflect_interruption(unsigned long regs->cr_ipsr = (regs->cr_ipsr & ~DELIVER_PSR_CLR) | DELIVER_PSR_SET; if (PSCB(v, dcr) & IA64_DCR_BE) regs->cr_ipsr |= IA64_PSR_BE; + else + regs->cr_ipsr &= ~IA64_PSR_BE; if (PSCB(v, hpsr_dfh)) regs->cr_ipsr |= IA64_PSR_DFH; @@ -158,6 +160,8 @@ void reflect_event(void) regs->cr_ipsr = (regs->cr_ipsr & ~DELIVER_PSR_CLR) | DELIVER_PSR_SET; if (PSCB(v, dcr) & IA64_DCR_BE) regs->cr_ipsr |= IA64_PSR_BE; + else + regs->cr_ipsr &= ~IA64_PSR_BE; if (PSCB(v, hpsr_dfh)) regs->cr_ipsr |= IA64_PSR_DFH; @@ -272,6 +276,11 @@ void ia64_do_page_fault(unsigned long ad regs->cr_ipsr = (regs->cr_ipsr & ~DELIVER_PSR_CLR) | DELIVER_PSR_SET; + if (PSCB(current, dcr) & IA64_DCR_BE) + regs->cr_ipsr |= IA64_PSR_BE; + else + regs->cr_ipsr &= ~IA64_PSR_BE; + if (PSCB(current, hpsr_dfh)) regs->cr_ipsr |= IA64_PSR_DFH; PSCB(current, vpsr_dfh) = 0; @@ -340,10 +349,10 @@ fp_emulate(int fp_fault, void *bundle, u unsigned long handle_fpu_swa(int fp_fault, struct pt_regs *regs, unsigned long isr) { - struct vcpu *v = current; IA64_BUNDLE bundle; unsigned long fault_ip; fpswa_ret_t ret; + unsigned long rc; fault_ip = regs->cr_iip; /* @@ -355,23 +364,25 @@ handle_fpu_swa(int fp_fault, struct pt_r fault_ip -= 16; if (VMX_DOMAIN(current)) { - if (IA64_RETRY == __vmx_get_domain_bundle(fault_ip, &bundle)) - return IA64_RETRY; - } else - bundle = __get_domain_bundle(fault_ip); - - if (!bundle.i64[0] && !bundle.i64[1]) { - printk("%s: floating-point bundle at 0x%lx not mapped\n", - __FUNCTION__, fault_ip); - return -1; + rc = __vmx_get_domain_bundle(fault_ip, &bundle); + } else { + rc = 0; + if (vcpu_get_domain_bundle(current, regs, fault_ip, + &bundle) == 0) + rc = IA64_RETRY; + } + if (rc == IA64_RETRY) { + gdprintk(XENLOG_DEBUG, + "%s(%s): floating-point bundle at 0x%lx not mapped\n", + __FUNCTION__, fp_fault ? "fault" : "trap", fault_ip); + return IA64_RETRY; } ret = fp_emulate(fp_fault, &bundle, ®s->cr_ipsr, ®s->ar_fpsr, &isr, ®s->pr, ®s->cr_ifs, regs); if (ret.status) { - PSCBX(v, fpswa_ret) = ret; - printk("%s(%s): fp_emulate() returned %ld\n", + gdprintk(XENLOG_ERR, "%s(%s): fp_emulate() returned %ld\n", __FUNCTION__, fp_fault ? "fault" : "trap", ret.status); } @@ -434,6 +445,13 @@ ia64_fault(unsigned long vector, unsigne printk("Dirty-bit.\n"); break; + case 10: + /* __domain_get_bundle() may cause fault. 
*/ + if (ia64_done_with_exception(regs)) + return; + printk("Data Access-bit.\n"); + break; + case 20: printk("Page Not Found.\n"); break; @@ -584,6 +602,17 @@ ia64_handle_privop(unsigned long ifa, st } void +ia64_lazy_load_fpu(struct vcpu *v) +{ + if (PSCB(v, hpsr_dfh)) { + PSCB(v, hpsr_dfh) = 0; + PSCB(v, hpsr_mfh) = 1; + if (__ia64_per_cpu_var(fp_owner) != v) + __ia64_load_fpu(v->arch._thread.fph); + } +} + +void ia64_handle_reflection(unsigned long ifa, struct pt_regs *regs, unsigned long isr, unsigned long iim, unsigned long vector) @@ -622,12 +651,7 @@ ia64_handle_reflection(unsigned long ifa vector = IA64_GENEX_VECTOR; break; case 25: - if (PSCB(v, hpsr_dfh)) { - PSCB(v, hpsr_dfh) = 0; - PSCB(v, hpsr_mfh) = 1; - if (__ia64_per_cpu_var(fp_owner) != v) - __ia64_load_fpu(v->arch._thread.fph); - } + ia64_lazy_load_fpu(v); if (!PSCB(v, vpsr_dfh)) { regs->cr_ipsr &= ~IA64_PSR_DFH; return; @@ -638,8 +662,6 @@ ia64_handle_reflection(unsigned long ifa if (((isr >> 4L) & 0xfL) == 1) { /* Fault is due to a register NaT consumption fault. */ //regs->eml_unat = 0; FIXME: DO WE NEED THIS?? - printk("ia64_handle_reflection: handling regNaT " - "fault\n"); vector = IA64_NAT_CONSUMPTION_VECTOR; break; } @@ -674,6 +696,11 @@ ia64_handle_reflection(unsigned long ifa PSCB(current, iim) = iim; vector = IA64_SPECULATION_VECTOR; break; + case 29: + vector = IA64_DEBUG_VECTOR; + if (debugger_trap_entry(vector, regs)) + return; + break; case 30: // FIXME: Should we handle unaligned refs in Xen?? vector = IA64_UNALIGNED_REF_VECTOR; @@ -684,33 +711,31 @@ ia64_handle_reflection(unsigned long ifa vcpu_increment_iip(v); return; } - // fetch code fail - if (IA64_RETRY == status) - return; - printk("ia64_handle_reflection: handling FP fault\n"); vector = IA64_FP_FAULT_VECTOR; break; case 33: status = handle_fpu_swa(0, regs, isr); if (!status) return; - // fetch code fail - if (IA64_RETRY == status) - return; - printk("ia64_handle_reflection: handling FP trap\n"); vector = IA64_FP_TRAP_VECTOR; break; case 34: - printk("ia64_handle_reflection: handling lowerpriv trap\n"); + if (isr & (1UL << 4)) + printk("ia64_handle_reflection: handling " + "unimplemented instruction address %s\n", + (isr & (1UL<<32)) ? 
"fault" : "trap"); vector = IA64_LOWERPRIV_TRANSFER_TRAP_VECTOR; break; case 35: printk("ia64_handle_reflection: handling taken branch trap\n"); vector = IA64_TAKEN_BRANCH_TRAP_VECTOR; + if (debugger_trap_entry(vector,regs)) + return; break; case 36: - printk("ia64_handle_reflection: handling single step trap\n"); vector = IA64_SINGLE_STEP_TRAP_VECTOR; + if (debugger_trap_entry(vector,regs)) + return; break; default: diff -Naurp xen/arch/ia64/xen/fw_emul.c xen-redhat/arch/ia64/xen/fw_emul.c --- xen/arch/ia64/xen/fw_emul.c +++ xen-redhat/arch/ia64/xen/fw_emul.c @@ -35,6 +35,7 @@ #include <xen/hypercall.h> #include <xen/softirq.h> #include <xen/time.h> +#include <asm/vmx_phy_mode.h> static DEFINE_SPINLOCK(efi_time_services_lock); @@ -240,6 +241,8 @@ sal_emulator (long index, unsigned long } e = list_entry(sal_queue[in1].next, sal_queue_entry_t, list); + + list_del(&e->list); spin_unlock_irqrestore(&sal_queue_lock, flags); IA64_SAL_DEBUG("SAL_GET_STATE_INFO(%s <= %s) " @@ -275,10 +278,12 @@ sal_emulator (long index, unsigned long r9 = arg.ret; status = arg.status; if (r9 == 0) { + xfree(e); + } else { + /* Re-add the entry to sal_queue */ spin_lock_irqsave(&sal_queue_lock, flags); - list_del(&e->list); + list_add(&e->list, &sal_queue[in1]); spin_unlock_irqrestore(&sal_queue_lock, flags); - xfree(e); } } else { status = IA64_SAL_NO_INFORMATION_AVAILABLE; @@ -314,10 +319,10 @@ sal_emulator (long index, unsigned long "on CPU#%d.\n", rec_name[e->sal_info_type], rec_name[in1], e->cpuid); - arg.type = e->sal_info_type; arg.status = 0; + if (e->cpuid == smp_processor_id()) { IA64_SAL_DEBUG("SAL_CLEAR_STATE_INFO: local\n"); clear_state_info_on(&arg); @@ -446,6 +451,45 @@ sal_emulator (long index, unsigned long return ((struct sal_ret_values) {status, r9, r10, r11}); } +static int +safe_copy_to_guest(unsigned long to, void *from, long size) +{ + BUG_ON((unsigned)size > PAGE_SIZE); + + if (VMX_DOMAIN(current)) { + if (is_virtual_mode(current)) { + thash_data_t *data; + unsigned long gpa, poff; + + /* The caller must provide a DTR or DTC mapping */ + data = vtlb_lookup(current, to, DSIDE_TLB); + if (data) { + gpa = data->page_flags & _PAGE_PPN_MASK; + } else { + data = vhpt_lookup(to); + if (!data) + return -1; + gpa = __mpa_to_gpa( + data->page_flags & _PAGE_PPN_MASK); + gpa &= _PAGE_PPN_MASK; + } + poff = POFFSET(to, data->ps); + if (poff + size > PSIZE(data->ps)) + return -1; + to = PAGEALIGN(gpa, data->ps) | poff; + } + to |= XENCOMM_INLINE_FLAG; + if (xencomm_copy_to_guest((void *)to, from, size, 0) != 0) + return -1; + return 0; + } else { + /* check for vulnerability */ + if (IS_VMM_ADDRESS(to) || IS_VMM_ADDRESS(to + size - 1)) + panic_domain(NULL, "copy to bad address:0x%lx\n", to); + return copy_to_user((void __user *)to, from, size); + } +} + cpumask_t cpu_cache_coherent_map; struct cache_flush_args { @@ -468,6 +512,19 @@ remote_pal_cache_flush(void *v) args->status = status; } +static void +remote_pal_prefetch_visibility(void *v) +{ + s64 trans_type = (s64)v; + ia64_pal_prefetch_visibility(trans_type); +} + +static void +remote_pal_mc_drain(void *v) +{ + ia64_pal_mc_drain(); +} + struct ia64_pal_retval xen_pal_emulator(unsigned long index, u64 in1, u64 in2, u64 in3) { @@ -682,16 +739,13 @@ xen_pal_emulator(unsigned long index, u6 pm_buffer, (pal_perf_mon_info_u_t *) &r9); if (status != 0) { - while(1) printk("PAL_PERF_MON_INFO fails ret=%ld\n", status); break; } - if (copy_to_user((void __user *)in1,pm_buffer,128)) { - while(1) - printk("xen_pal_emulator: PAL_PERF_MON_INFO " - "can't copy to 
user!!!!\n");
-            status = PAL_STATUS_UNIMPLEMENTED;
-            break;
+        if (safe_copy_to_guest(
+            in1, pm_buffer, sizeof(pm_buffer))) {
+            status = PAL_STATUS_EINVAL;
+            goto fail_to_copy;
        }
    }
        break;
@@ -713,10 +767,11 @@ xen_pal_emulator(unsigned long index, u6
           consumes 10 mW, implemented and cache/TLB coherent. */
        unsigned long res = 1000UL | (1000UL << 16) | (10UL << 32) | (1UL << 61) | (1UL << 60);
-        if (copy_to_user ((void *)in1, &res, sizeof (res)))
-            status = PAL_STATUS_EINVAL;
-        else
-            status = PAL_STATUS_SUCCESS;
+        if (safe_copy_to_guest (in1, &res, sizeof (res))) {
+            status = PAL_STATUS_EINVAL;
+            goto fail_to_copy;
+        }
+        status = PAL_STATUS_SUCCESS;
    }
        break;
    case PAL_HALT:
@@ -738,7 +793,35 @@ xen_pal_emulator(unsigned long index, u6
        if (VMX_DOMAIN(current))
            status = PAL_STATUS_SUCCESS;
        break;
+    case PAL_PREFETCH_VISIBILITY:
+        status = ia64_pal_prefetch_visibility(in1);
+        if (status == 0) {
+            /* must be performed on all remote processors
+               in the coherence domain. */
+            smp_call_function(remote_pal_prefetch_visibility,
+                              (void *)in1, 1, 1);
+            status = 1; /* no more necessary on remote processor */
+        }
+        break;
+    case PAL_MC_DRAIN:
+        status = ia64_pal_mc_drain();
+        /* FIXME: All vcpus likely call PAL_MC_DRAIN.
+           That causes the congestion. */
+        smp_call_function(remote_pal_mc_drain, NULL, 1, 1);
+        break;
+    case PAL_BRAND_INFO:
+        if (in1 == 0) {
+            char brand_info[128];
+            status = ia64_pal_get_brand_info(brand_info);
+            if (status == PAL_STATUS_SUCCESS)
+                copy_to_user((void *)in2, brand_info, 128);
+        } else {
+            status = PAL_STATUS_EINVAL;
+        }
+        break;
    case PAL_LOGICAL_TO_PHYSICAL:
+    case PAL_GET_PSTATE:
+    case PAL_CACHE_SHARED_INFO:
        /* Optional, no need to complain about being unimplemented */
        break;
    default:
@@ -747,6 +830,12 @@ xen_pal_emulator(unsigned long index, u6
        break;
    }
    return ((struct ia64_pal_retval) {status, r9, r10, r11});
+
+fail_to_copy:
+    gdprintk(XENLOG_WARNING,
+        "PAL(%ld) fail to copy!!! args 0x%lx 0x%lx 0x%lx\n",
+        index, in1, in2, in3);
+    return ((struct ia64_pal_retval) {status, r9, r10, r11});
 }
 // given a current domain (virtual or metaphysical) address, return the virtual address
@@ -1093,6 +1182,10 @@ efi_emulate_set_virtual_address_map(
    efi_desc_size = sizeof(efi_memory_desc_t);
    for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
+        struct page_info *efi_runtime_page = NULL;
+        struct page_info *fpswa_inf_page = NULL;
+        struct page_info *fw_table_page = NULL;
+
        if (copy_from_user(&entry, p, sizeof(efi_memory_desc_t))) {
            printk ("efi_emulate_set_virtual_address_map: copy_from_user() fault. addr=0x%p\n", p);
            return EFI_UNSUPPORTED;
        }
@@ -1102,6 +1195,27 @@ efi_emulate_set_virtual_address_map(
        if (md->type != EFI_PAL_CODE)
            continue;
+        /* get pages to prevent them from being freed
+         * during touching them.
+         * those entries are in [FW_TABLES_BASE_PADDR, ...]
+         * see dom_fw.h for its layout. 
+ */ + efi_runtime_page = virt_to_page(efi_runtime); + fpswa_inf_page = virt_to_page(fpswa_inf); + fw_table_page = virt_to_page( + domain_mpa_to_imva(d, FW_TABLES_BASE_PADDR)); + if (get_page(efi_runtime_page, d) == 0) + return EFI_INVALID_PARAMETER; + if (get_page(fpswa_inf_page, d) == 0) { + put_page(efi_runtime_page); + return EFI_INVALID_PARAMETER; + } + if (get_page(fw_table_page, d) == 0) { + put_page(fpswa_inf_page); + put_page(efi_runtime_page); + return EFI_INVALID_PARAMETER; + } + #define EFI_HYPERCALL_PATCH_TO_VIRT(tgt,call) \ do { \ vfn = (unsigned long *) domain_mpa_to_imva(d, tgt); \ @@ -1124,6 +1238,10 @@ efi_emulate_set_virtual_address_map( *vfn++ = FW_HYPERCALL_FPSWA_PATCH_INDEX * 16UL + md->virt_addr; *vfn = 0; fpswa_inf->fpswa = (void *) (FW_HYPERCALL_FPSWA_ENTRY_INDEX * 16UL + md->virt_addr); + + put_page(fw_table_page); + put_page(fpswa_inf_page); + put_page(efi_runtime_page); break; } diff -Naurp xen/arch/ia64/xen/hypercall.c xen-redhat/arch/ia64/xen/hypercall.c --- xen/arch/ia64/xen/hypercall.c +++ xen-redhat/arch/ia64/xen/hypercall.c @@ -17,6 +17,7 @@ #include <asm/sal.h> /* FOR struct ia64_sal_retval */ #include <asm/fpswa.h> /* FOR struct fpswa_ret_t */ +#include <asm/vmx.h> #include <asm/vmx_vcpu.h> #include <asm/vcpu.h> #include <asm/dom_fw.h> @@ -121,14 +122,135 @@ fw_hypercall_ipi (struct pt_regs *regs) return; } +static int +fpswa_get_domain_addr(struct vcpu *v, unsigned long gpaddr, size_t size, + void **virt, struct page_info **page, const char *name) +{ + int cross_page_boundary; + + if (gpaddr == 0) { + *virt = 0; + return 0; + } + + cross_page_boundary = (((gpaddr & ~PAGE_MASK) + size) > PAGE_SIZE); + if (unlikely(cross_page_boundary)) { + /* this case isn't implemented */ + gdprintk(XENLOG_ERR, + "%s: fpswa hypercall is called with " + "page crossing argument %s 0x%lx\n", + __func__, name, gpaddr); + return -ENOSYS; + } + +again: + *virt = domain_mpa_to_imva(v->domain, gpaddr); + *page = virt_to_page(*virt); + if (get_page(*page, current->domain) == 0) { + if (page_get_owner(*page) != current->domain) { + *page = NULL; + return -EFAULT; + } + goto again; + } + + return 0; +} + static fpswa_ret_t -fw_hypercall_fpswa (struct vcpu *v) +fw_hypercall_fpswa (struct vcpu *v, struct pt_regs *regs) { - return PSCBX(v, fpswa_ret); + fpswa_ret_t ret = {-1, 0, 0, 0}; + unsigned long bundle[2] = { regs->r15, regs->r16}; + fp_state_t fp_state; + struct page_info *lp_page = NULL; + struct page_info *lv_page = NULL; + struct page_info *hp_page = NULL; + struct page_info *hv_page = NULL; + + if (unlikely(PSCBX(v, fpswa_ret).status != 0 && + PSCBX(v, fpswa_ret).status != IA64_RETRY)) { + ret = PSCBX(v, fpswa_ret); + PSCBX(v, fpswa_ret) = (fpswa_ret_t){0, 0, 0, 0}; + return ret; + } + + if (!fpswa_interface) + goto error; + + memset(&fp_state, 0, sizeof(fp_state)); + fp_state.bitmask_low64 = regs->r22; + fp_state.bitmask_high64 = regs->r23; + + /* bit6..bit11 */ + if ((fp_state.bitmask_low64 & 0xfc0) != 0xfc0) { + /* other cases isn't supported yet */ + gdprintk(XENLOG_ERR, "%s unsupported bitmask_low64 0x%lx\n", + __func__, fp_state.bitmask_low64); + goto error; + } + if (regs->r25 == 0) + /* fp_state.fp_state_low_volatile must be supplied */ + goto error; + + /* eager save/lazy restore fpu: f32...f127 */ + if ((~fp_state.bitmask_low64 & ((1UL << 31) - 1)) != 0 || + ~fp_state.bitmask_high64 != 0) { + if (VMX_DOMAIN(v)) + vmx_lazy_load_fpu(v); + else + ia64_lazy_load_fpu(v); + } + + if (fpswa_get_domain_addr(v, regs->r24, + sizeof(fp_state.fp_state_low_preserved), + 
(void*)&fp_state.fp_state_low_preserved, + &lp_page, "fp_state_low_preserved") < 0) + goto error; + if (fpswa_get_domain_addr(v, regs->r25, + sizeof(fp_state.fp_state_low_volatile), + (void*)&fp_state.fp_state_low_volatile, + &lv_page, "fp_state_low_volatile") < 0) + goto error; + if (fpswa_get_domain_addr(v, regs->r26, + sizeof(fp_state.fp_state_high_preserved), + (void*)&fp_state.fp_state_high_preserved, + &hp_page, "fp_state_low_preserved") < 0) + goto error; + if (fpswa_get_domain_addr(v, regs->r27, + sizeof(fp_state.fp_state_high_volatile), + (void*)&fp_state.fp_state_high_volatile, + &hv_page, "fp_state_high_volatile") < 0) + goto error; + + ret = (*fpswa_interface->fpswa)(regs->r14, + bundle, + ®s->r17, /* pipsr */ + ®s->r18, /* pfsr */ + ®s->r19, /* pisr */ + ®s->r20, /* ppreds */ + ®s->r21, /* pifs */ + &fp_state); + +error: + if (lp_page != NULL) + put_page(lp_page); + if (lv_page != NULL) + put_page(lv_page); + if (hp_page != NULL) + put_page(hp_page); + if (hv_page != NULL) + put_page(hv_page); + return ret; } -IA64FAULT -ia64_hypercall(struct pt_regs *regs) +static fpswa_ret_t +fw_hypercall_fpswa_error(void) +{ + return (fpswa_ret_t) {-1, 0, 0, 0}; +} + +IA64FAULT ia64_hypercall(struct pt_regs *regs) { struct vcpu *v = current; struct sal_ret_values x; @@ -177,6 +299,9 @@ ia64_hypercall(struct pt_regs *regs) /* do_block only pends a softirq */ do_softirq(); stop_timer(&v->arch.hlt_timer); + /* do_block() call local_event_delivery_enable(), + but PALL CALL is always called with psr.i = */ + local_event_delivery_disable(); } regs->r8 = 0; regs->r9 = 0; @@ -221,8 +346,24 @@ ia64_hypercall(struct pt_regs *regs) case FW_HYPERCALL_SET_SHARED_INFO_VA: regs->r8 = domain_set_shared_info_va (regs->r28); break; - case FW_HYPERCALL_FPSWA: - fpswa_ret = fw_hypercall_fpswa (v); + case FW_HYPERCALL_FPSWA_BASE: + switch (regs->r2) { + case FW_HYPERCALL_FPSWA_BROKEN: + gdprintk(XENLOG_WARNING, + "Old fpswa hypercall was called (0x%lx).\n" + "Please update your domain builder. ip 0x%lx\n", + FW_HYPERCALL_FPSWA_BROKEN, regs->cr_iip); + fpswa_ret = fw_hypercall_fpswa_error(); + break; + case FW_HYPERCALL_FPSWA: + fpswa_ret = fw_hypercall_fpswa(v, regs); + break; + default: + gdprintk(XENLOG_ERR, "unknown fpswa hypercall %lx\n", + regs->r2); + fpswa_ret = fw_hypercall_fpswa_error(); + break; + } regs->r8 = fpswa_ret.status; regs->r9 = fpswa_ret.err0; regs->r10 = fpswa_ret.err1; diff -Naurp xen/arch/ia64/xen/irq.c xen-redhat/arch/ia64/xen/irq.c --- xen/arch/ia64/xen/irq.c +++ xen-redhat/arch/ia64/xen/irq.c @@ -467,7 +467,7 @@ int pirq_guest_bind(struct vcpu *v, int return rc; } -int pirq_guest_unbind(struct domain *d, int irq) +void pirq_guest_unbind(struct domain *d, int irq) { irq_desc_t *desc = &irq_desc[irq]; irq_guest_action_t *action; @@ -501,7 +501,6 @@ int pirq_guest_unbind(struct domain *d, } spin_unlock_irqrestore(&desc->lock, flags); - return 0; } void diff -Naurp xen/arch/ia64/xen/ivt.S xen-redhat/arch/ia64/xen/ivt.S --- xen/arch/ia64/xen/ivt.S +++ xen-redhat/arch/ia64/xen/ivt.S @@ -977,10 +977,17 @@ ENTRY(daccess_bit) #ifdef XEN mov r16=cr.isr mov r17=cr.ifa + mov r18=cr.ipsr mov r31=pr mov r19=10 + ;; mov r20=0x2800 - br.sptk.many fast_access_reflect + extr.u r18=r18,IA64_PSR_CPL0_BIT,2 + ;; + cmp.ne p6,p0=r0,r18 /* cpl != 0? */ +(p6) br.sptk.many fast_access_reflect + /* __domain_get_bundle() may cause this fault. 
*/ + br.sptk.few dispatch_to_fault_handler ;; #else // Like Entry 8, except for data access @@ -1230,6 +1237,7 @@ fast_hypercall: nop 0 bsw.1 // B (6 cyc) regs are saved, switch to bank 1 ;; + PT_REGS_UNWIND_INFO(-48) ssm psr.ic | PSR_DEFAULT_BITS // M2 now it's safe to re-enable intr.-collection // movl r3=ia64_ret_from_syscall // X @@ -2103,11 +2111,7 @@ END(speculation_vector) // 0x5900 Entry 29 (size 16 bundles) Debug (16,28,56) ENTRY(debug_vector) DBG_FAULT(29) -#ifdef XEN FAULT_OR_REFLECT(29) -#else - FAULT(29) -#endif END(debug_vector) .org ia64_ivt+0x5a00 diff -Naurp xen/arch/ia64/xen/machine_kexec.c xen-redhat/arch/ia64/xen/machine_kexec.c --- xen/arch/ia64/xen/machine_kexec.c +++ xen-redhat/arch/ia64/xen/machine_kexec.c @@ -1,6 +1,9 @@ #include <xen/lib.h> /* for printk() used in stubs */ #include <xen/types.h> #include <public/kexec.h> +#include <xen/mm.h> + +extern unsigned long frametable_pg_dir[]; int machine_kexec_load(int type, int slot, xen_kexec_image_t *image) { @@ -23,6 +26,14 @@ void machine_reboot_kexec(xen_kexec_imag printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); } +void arch_crash_save_vmcoreinfo(void) +{ + VMCOREINFO_SYMBOL(dom_xen); + VMCOREINFO_SYMBOL(dom_io); + VMCOREINFO_SYMBOL(xen_pstart); + VMCOREINFO_SYMBOL(frametable_pg_dir); +} + /* * Local variables: * mode: C diff -Naurp xen/arch/ia64/xen/Makefile xen-redhat/arch/ia64/xen/Makefile --- xen/arch/ia64/xen/Makefile +++ xen-redhat/arch/ia64/xen/Makefile @@ -31,6 +31,7 @@ obj-y += flushd.o obj-y += privop_stat.o obj-y += xenpatch.o obj-y += xencomm.o +obj-y += pci.o obj-$(crash_debug) += gdbstub.o obj-$(xen_ia64_tlb_track) += tlb_track.o diff -Naurp xen/arch/ia64/xen/mm.c xen-redhat/arch/ia64/xen/mm.c --- xen/arch/ia64/xen/mm.c +++ xen-redhat/arch/ia64/xen/mm.c @@ -172,6 +172,7 @@ #include <asm/vhpt.h> #include <asm/vcpu.h> #include <asm/shadow.h> +#include <asm/event.h> #include <asm/p2m_entry.h> #include <asm/tlb_track.h> #include <linux/efi.h> @@ -182,10 +183,11 @@ static void domain_page_flush_and_put(struct domain* d, unsigned long mpaddr, volatile pte_t* ptep, pte_t old_pte, struct page_info* page); +static int efi_ucwb(unsigned long physaddr, unsigned long size); extern unsigned long ia64_iobase; -static struct domain *dom_xen, *dom_io; +struct domain *dom_xen, *dom_io; // followings are stolen from arch_init_memory() @ xen/arch/x86/mm.c void @@ -208,6 +210,18 @@ alloc_dom_xen_and_dom_io(void) BUG_ON(dom_io == NULL); } +static int +mm_teardown_can_skip(struct domain* d, unsigned long offset) +{ + return d->arch.mm_teardown_offset > offset; +} + +static void +mm_teardown_update_offset(struct domain* d, unsigned long offset) +{ + d->arch.mm_teardown_offset = offset; +} + static void mm_teardown_pte(struct domain* d, volatile pte_t* pte, unsigned long offset) { @@ -248,46 +262,73 @@ mm_teardown_pte(struct domain* d, volati } } -static void +static int mm_teardown_pmd(struct domain* d, volatile pmd_t* pmd, unsigned long offset) { unsigned long i; volatile pte_t* pte = pte_offset_map(pmd, offset); for (i = 0; i < PTRS_PER_PTE; i++, pte++) { - if (!pte_present(*pte)) // acquire semantics + unsigned long cur_offset = offset + (i << PAGE_SHIFT); + if (mm_teardown_can_skip(d, cur_offset + PAGE_SIZE)) + continue; + if (!pte_present(*pte)) { // acquire semantics + mm_teardown_update_offset(d, cur_offset); continue; - mm_teardown_pte(d, pte, offset + (i << PAGE_SHIFT)); + } + mm_teardown_update_offset(d, cur_offset); + mm_teardown_pte(d, pte, cur_offset); + if (hypercall_preempt_check()) + 
return -EAGAIN; } + return 0; } -static void +static int mm_teardown_pud(struct domain* d, volatile pud_t *pud, unsigned long offset) { unsigned long i; volatile pmd_t *pmd = pmd_offset(pud, offset); for (i = 0; i < PTRS_PER_PMD; i++, pmd++) { - if (!pmd_present(*pmd)) // acquire semantics + unsigned long cur_offset = offset + (i << PMD_SHIFT); + if (mm_teardown_can_skip(d, cur_offset + PMD_SIZE)) + continue; + if (!pmd_present(*pmd)) { // acquire semantics + mm_teardown_update_offset(d, cur_offset); continue; - mm_teardown_pmd(d, pmd, offset + (i << PMD_SHIFT)); + } + if (mm_teardown_pmd(d, pmd, cur_offset)) + return -EAGAIN; } + return 0; } -static void +static int mm_teardown_pgd(struct domain* d, volatile pgd_t *pgd, unsigned long offset) { unsigned long i; volatile pud_t *pud = pud_offset(pgd, offset); for (i = 0; i < PTRS_PER_PUD; i++, pud++) { - if (!pud_present(*pud)) // acquire semantics + unsigned long cur_offset = offset + (i << PUD_SHIFT); +#ifndef __PAGETABLE_PUD_FOLDED + if (mm_teardown_can_skip(d, cur_offset + PUD_SIZE)) continue; - mm_teardown_pud(d, pud, offset + (i << PUD_SHIFT)); +#endif + if (!pud_present(*pud)) { // acquire semantics +#ifndef __PAGETABLE_PUD_FOLDED + mm_teardown_update_offset(d, cur_offset); +#endif + continue; + } + if (mm_teardown_pud(d, pud, cur_offset)) + return -EAGAIN; } + return 0; } -void +int mm_teardown(struct domain* d) { struct mm_struct* mm = &d->arch.mm; @@ -295,14 +336,22 @@ mm_teardown(struct domain* d) volatile pgd_t* pgd; if (mm->pgd == NULL) - return; + return 0; pgd = pgd_offset(mm, 0); for (i = 0; i < PTRS_PER_PGD; i++, pgd++) { - if (!pgd_present(*pgd)) // acquire semantics + unsigned long cur_offset = i << PGDIR_SHIFT; + + if (mm_teardown_can_skip(d, cur_offset + PGDIR_SIZE)) continue; - mm_teardown_pgd(d, pgd, i << PGDIR_SHIFT); + if (!pgd_present(*pgd)) { // acquire semantics + mm_teardown_update_offset(d, cur_offset); + continue; + } + if (mm_teardown_pgd(d, pgd, cur_offset)) + return -EAGAIN; } + return 0; } static void @@ -492,7 +541,9 @@ u64 translate_domain_pte(u64 pteval, u64 This can happen when domU tries to touch i/o port space. Also prevents possible address aliasing issues. 
*/ - if (!(mpaddr - IO_PORTS_PADDR < IO_PORTS_SIZE)) + if (!(mpaddr - IO_PORTS_PADDR < IO_PORTS_SIZE) && + /* and also except UC|WB page */ + (d != dom0 || !efi_ucwb(mpaddr, PAGE_SIZE))) gdprintk(XENLOG_WARNING, "Warning: UC to WB " "for mpaddr=%lx\n", mpaddr); pteval = (pteval & ~_PAGE_MA_MASK) | _PAGE_MA_WB; @@ -666,19 +717,22 @@ unsigned long lookup_domain_mpa(struct d return GPFN_INV_MASK; } - if (mpaddr < d->arch.convmem_end) { + if (mpaddr < d->arch.convmem_end && !d->is_dying) { gdprintk(XENLOG_WARNING, "vcpu %d iip 0x%016lx: non-allocated mpa " - "0x%lx (< 0x%lx)\n", current->vcpu_id, PSCB(current, iip), - mpaddr, d->arch.convmem_end); + "d %"PRId16" 0x%lx (< 0x%lx)\n", + current->vcpu_id, PSCB(current, iip), + d->domain_id, mpaddr, d->arch.convmem_end); } else if (mpaddr - IO_PORTS_PADDR < IO_PORTS_SIZE) { /* Log I/O port probing, but complain less loudly about it */ gdprintk(XENLOG_INFO, "vcpu %d iip 0x%016lx: bad I/O port access " - "0x%lx\n", current->vcpu_id, PSCB(current, iip), + "d %"PRId16" 0x%lx\n", + current->vcpu_id, PSCB(current, iip), d->domain_id, IO_SPACE_SPARSE_DECODING(mpaddr - IO_PORTS_PADDR)); } else { - gdprintk(XENLOG_WARNING, "vcpu %d iip 0x%016lx: bad mpa 0x%lx " - "(=> 0x%lx)\n", current->vcpu_id, PSCB(current, iip), - mpaddr, d->arch.convmem_end); + gdprintk(XENLOG_WARNING, "vcpu %d iip 0x%016lx: bad mpa " + "d %"PRId16" 0x%lx (=> 0x%lx)\n", + current->vcpu_id, PSCB(current, iip), + d->domain_id, mpaddr, d->arch.convmem_end); } if (entry != NULL) @@ -834,15 +888,43 @@ __assign_domain_page(struct domain *d, // dom0 tries to map real machine's I/O region, but failed. // It is very likely that dom0 doesn't boot correctly because // it can't access I/O. So complain here. - if ((flags & ASSIGN_nocache) && - (pte_pfn(ret_pte) != (physaddr >> PAGE_SHIFT) || - !(pte_val(ret_pte) & _PAGE_MA_UC))) - printk("%s:%d WARNING can't assign page domain 0x%p id %d\n" - "\talready assigned pte_val 0x%016lx\n" - "\tmpaddr 0x%016lx physaddr 0x%016lx flags 0x%lx\n", - __func__, __LINE__, - d, d->domain_id, pte_val(ret_pte), - mpaddr, physaddr, flags); + if (flags & ASSIGN_nocache) { + int warn = 0; + + if (pte_pfn(ret_pte) != (physaddr >> PAGE_SHIFT)) + warn = 1; + else if (!(pte_val(ret_pte) & _PAGE_MA_UC)) { + u32 type; + u64 attr; + + warn = 1; + + /* + * See + * complete_dom0_memmap() + * case EFI_RUNTIME_SERVICES_CODE: + * case EFI_RUNTIME_SERVICES_DATA: + * case EFI_ACPI_RECLAIM_MEMORY: + * case EFI_ACPI_MEMORY_NVS: + * case EFI_RESERVED_TYPE: + * + * Currently only EFI_RUNTIME_SERVICES_CODE is found + * so that we suppress only EFI_RUNTIME_SERVICES_CODE case. 
+ */ + type = efi_mem_type(physaddr); + attr = efi_mem_attributes(physaddr); + if (type == EFI_RUNTIME_SERVICES_CODE && + (attr & EFI_MEMORY_UC) && (attr & EFI_MEMORY_WB)) + warn = 0; + } + if (warn) + printk("%s:%d WARNING can't assign page domain 0x%p id %d\n" + "\talready assigned pte_val 0x%016lx\n" + "\tmpaddr 0x%016lx physaddr 0x%016lx flags 0x%lx\n", + __func__, __LINE__, + d, d->domain_id, pte_val(ret_pte), + mpaddr, physaddr, flags); + } return -EAGAIN; } @@ -1020,6 +1102,46 @@ assign_domain_same_page(struct domain *d } } +static int +efi_ucwb(unsigned long physaddr, unsigned long size) +{ + void *efi_map_start, *efi_map_end; + u64 efi_desc_size; + void* p; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { + efi_memory_desc_t* md = (efi_memory_desc_t *)p; + unsigned long start = md->phys_addr; + unsigned long end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); + + if (start <= physaddr && physaddr < end) { + if ((physaddr + size) > end) { + gdprintk(XENLOG_INFO, "%s: physaddr 0x%lx size = 0x%lx\n", + __func__, physaddr, size); + return 0; + } + + // for UC|WB space + if( md->attribute & EFI_MEMORY_WB && + md->attribute & EFI_MEMORY_UC ) + return 1; + + return 0; + } + + if (physaddr < start) { + break; + } + } + + return 0; +} + + int efi_mmio(unsigned long physaddr, unsigned long size) { @@ -1760,28 +1882,40 @@ steal_page(struct domain *d, struct page return 0; } -void +int guest_physmap_add_page(struct domain *d, unsigned long gpfn, - unsigned long mfn) + unsigned long mfn, int order) { - BUG_ON(!mfn_valid(mfn)); - BUG_ON(mfn_to_page(mfn)->count_info != (PGC_allocated | 1)); - set_gpfn_from_mfn(mfn, gpfn); - smp_mb(); - assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, - ASSIGN_writable | ASSIGN_pgc_allocated); + unsigned long i; + for ( i = 0; i < ( 1UL << order); i++) + { + BUG_ON(!mfn_valid(mfn)); + BUG_ON(mfn_to_page(mfn)->count_info != (PGC_allocated | 1)); + set_gpfn_from_mfn(mfn, gpfn); + smp_mb(); + assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, + ASSIGN_writable | ASSIGN_pgc_allocated); - //BUG_ON(mfn != ((lookup_domain_mpa(d, gpfn << PAGE_SHIFT) & _PFN_MASK) >> PAGE_SHIFT)); + mfn++; + gpfn++; + } perfc_incr(guest_physmap_add_page); + + return 0; } void guest_physmap_remove_page(struct domain *d, unsigned long gpfn, - unsigned long mfn) + unsigned long mfn, int order) { + unsigned long i; + BUG_ON(mfn == 0);//XXX - zap_domain_page_one(d, gpfn << PAGE_SHIFT, 0, mfn); + + for ( i = 0; i < (1UL << order); i++ ) + zap_domain_page_one(d, (gpfn+i) << PAGE_SHIFT, 0, mfn+i); + perfc_incr(guest_physmap_remove_page); } @@ -2183,7 +2317,7 @@ arch_memory_op(int op, XEN_GUEST_HANDLE( if (prev_mfn && mfn_valid(prev_mfn)) { if (is_xen_heap_frame(mfn_to_page(prev_mfn))) /* Xen heap frames are simply unhooked from this phys slot. */ - guest_physmap_remove_page(d, xatp.gpfn, prev_mfn); + guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0); else /* Normal domain memory is freed, to avoid leaking memory. */ guest_remove_page(d, xatp.gpfn); @@ -2192,10 +2326,10 @@ arch_memory_op(int op, XEN_GUEST_HANDLE( /* Unmap from old location, if any. */ gpfn = get_gpfn_from_mfn(mfn); if (gpfn != INVALID_M2P_ENTRY) - guest_physmap_remove_page(d, gpfn, mfn); + guest_physmap_remove_page(d, gpfn, mfn, 0); /* Map at new location. 
*/ - guest_physmap_add_page(d, xatp.gpfn, mfn); + guest_physmap_add_page(d, xatp.gpfn, mfn, 0); out: UNLOCK_BIGLOCK(d); diff -Naurp xen/arch/ia64/xen/oprofile/perfmon.c xen-redhat/arch/ia64/xen/oprofile/perfmon.c --- xen/arch/ia64/xen/oprofile/perfmon.c +++ xen-redhat/arch/ia64/xen/oprofile/perfmon.c @@ -85,6 +85,7 @@ static char * get_cpu_type(void) case 0x07: return "ia64/itanium"; case 0x1f: + case 0x20: return "ia64/itanium2"; default: return "ia64/ia64"; diff -Naurp xen/arch/ia64/xen/pci.c xen-redhat/arch/ia64/xen/pci.c --- xen/arch/ia64/xen/pci.c +++ xen-redhat/arch/ia64/xen/pci.c @@ -0,0 +1,134 @@ +/* + * pci.c - Low-Level PCI Access in IA-64 + * + * Derived from bios32.c of i386 tree. + * + * (c) Copyright 2002, 2005 Hewlett-Packard Development Company, L.P. + * David Mosberger-Tang <davidm@hpl.hp.com> + * Bjorn Helgaas <bjorn.helgaas@hp.com> + * Copyright (C) 2004 Silicon Graphics, Inc. + * + * Note: Above list of copyright holders is incomplete... + */ + +#include <xen/pci.h> +#include <xen/pci_regs.h> +#include <xen/spinlock.h> + +#include <asm/io.h> +#include <asm/sal.h> +#include <asm/hw_irq.h> + +/* + * Low-level SAL-based PCI configuration access functions. Note that SAL + * calls are already serialized (via sal_lock), so we don't need another + * synchronization mechanism here. + */ + +#define PCI_SAL_ADDRESS(seg, bus, devfn, reg) \ + (((u64) seg << 24) | (bus << 16) | (devfn << 8) | (reg)) + +/* SAL 3.2 adds support for extended config space. */ + +#define PCI_SAL_EXT_ADDRESS(seg, bus, devfn, reg) \ + (((u64) seg << 28) | (bus << 20) | (devfn << 12) | (reg)) + +static int +pci_sal_read (unsigned int seg, unsigned int bus, unsigned int devfn, + int reg, int len, u32 *value) +{ + u64 addr, data = 0; + int mode, result; + + if (!value || (seg > 65535) || (bus > 255) || (devfn > 255) || (reg > 4095)) + return -EINVAL; + + if ((seg | reg) <= 255) { + addr = PCI_SAL_ADDRESS(seg, bus, devfn, reg); + mode = 0; + } else { + addr = PCI_SAL_EXT_ADDRESS(seg, bus, devfn, reg); + mode = 1; + } + result = ia64_sal_pci_config_read(addr, mode, len, &data); + if (result != 0) + return -EINVAL; + + *value = (u32) data; + return 0; +} + +static int +pci_sal_write (unsigned int seg, unsigned int bus, unsigned int devfn, + int reg, int len, u32 value) +{ + u64 addr; + int mode, result; + + if ((seg > 65535) || (bus > 255) || (devfn > 255) || (reg > 4095)) + return -EINVAL; + + if ((seg | reg) <= 255) { + addr = PCI_SAL_ADDRESS(seg, bus, devfn, reg); + mode = 0; + } else { + addr = PCI_SAL_EXT_ADDRESS(seg, bus, devfn, reg); + mode = 1; + } + result = ia64_sal_pci_config_write(addr, mode, len, value); + if (result != 0) + return -EINVAL; + return 0; +} + + +uint8_t pci_conf_read8( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg) +{ + uint32_t value; + BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255)); + pci_sal_read(0, bus, (dev<<3)|func, reg, 1, &value); + return (uint8_t)value; +} + +uint16_t pci_conf_read16( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg) +{ + uint32_t value; + BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255)); + pci_sal_read(0, bus, (dev<<3)|func, reg, 2, &value); + return (uint16_t)value; +} + +uint32_t pci_conf_read32( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg) +{ + uint32_t value; + BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255)); + pci_sal_read(0, bus, (dev<<3)|func, reg, 4, &value); + return (uint32_t)value; +} + +void pci_conf_write8( + unsigned 
int bus, unsigned int dev, unsigned int func, unsigned int reg, + uint8_t data) +{ + BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255)); + pci_sal_write(0, bus, (dev<<3)|func, reg, 1, data); +} + +void pci_conf_write16( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg, + uint16_t data) +{ + BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255)); + pci_sal_write(0, bus, (dev<<3)|func, reg, 2, data); +} + +void pci_conf_write32( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg, + uint32_t data) +{ + BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255)); + pci_sal_write(0, bus, (dev<<3)|func, reg, 4, data); +} diff -Naurp xen/arch/ia64/xen/regionreg.c xen-redhat/arch/ia64/xen/regionreg.c --- xen/arch/ia64/xen/regionreg.c +++ xen-redhat/arch/ia64/xen/regionreg.c @@ -270,8 +270,16 @@ int set_one_rr(unsigned long rr, unsigne return 1; } +void set_virtual_rr0(void) +{ + struct vcpu *v = current; + + ia64_set_rr(0, v->arch.metaphysical_saved_rr0); + ia64_srlz_d(); +} + // set rr0 to the passed rid (for metaphysical mode so don't use domain offset -int set_metaphysical_rr0(void) +void set_metaphysical_rr0(void) { struct vcpu *v = current; // ia64_rr rrv; @@ -279,7 +287,6 @@ int set_metaphysical_rr0(void) // rrv.ve = 1; FIXME: TURN ME BACK ON WHEN VHPT IS WORKING ia64_set_rr(0,v->arch.metaphysical_rr0); ia64_srlz_d(); - return 1; } void init_all_rr(struct vcpu *v) diff -Naurp xen/arch/ia64/xen/vcpu.c xen-redhat/arch/ia64/xen/vcpu.c --- xen/arch/ia64/xen/vcpu.c +++ xen-redhat/arch/ia64/xen/vcpu.c @@ -173,6 +173,11 @@ void vcpu_init_regs(struct vcpu *v) (unsigned char *)v->domain->arch.shared_info_va + INT_ENABLE_OFFSET(v); VCPU(v, itv) = (1 << 16); /* timer vector masked */ + + /* SAL specification 3.2.4 */ + VCPU(v, vpsr) = IA64_PSR_AC | IA64_PSR_IC | IA64_PSR_BN; + v->vcpu_info->evtchn_upcall_pending = 0; + v->vcpu_info->evtchn_upcall_mask = -1; } /* pta.size must not be 0. 
The minimum is 15 (32k) */ @@ -234,7 +239,7 @@ IA64FAULT vcpu_get_ar(VCPU * vcpu, u64 r VCPU processor status register access routines **************************************************************************/ -void vcpu_set_metaphysical_mode(VCPU * vcpu, BOOLEAN newmode) +static void vcpu_set_metaphysical_mode(VCPU * vcpu, BOOLEAN newmode) { /* only do something if mode changes */ if (!!newmode ^ !!PSCB(vcpu, metaphysical_mode)) { @@ -242,7 +247,7 @@ void vcpu_set_metaphysical_mode(VCPU * v if (newmode) set_metaphysical_rr0(); else if (PSCB(vcpu, rrs[0]) != -1) - set_one_rr(0, PSCB(vcpu, rrs[0])); + set_virtual_rr0(); } } @@ -392,54 +397,35 @@ IA64FAULT vcpu_set_psr_l(VCPU * vcpu, u6 newpsr = *(struct ia64_psr *)&val; ipsr = (struct ia64_psr *)®s->cr_ipsr; - // just handle psr.up and psr.pp for now - //if (val & ~(IA64_PSR_PP | IA64_PSR_UP | IA64_PSR_SP)) - // return IA64_ILLOP_FAULT; - // however trying to set other bits can't be an error as it is in ssm - if (newpsr.dfh) { - ipsr->dfh = 1; - PSCB(vcpu, vpsr_dfh) = 1; - } else { - ipsr->dfh = PSCB(vcpu, hpsr_dfh); - PSCB(vcpu, vpsr_dfh) = 0; - } - if (newpsr.dfl) - ipsr->dfl = 1; - if (newpsr.pp) { - ipsr->pp = 1; - PSCB(vcpu, vpsr_pp) = 1; - } else { - ipsr->pp = 1; - PSCB(vcpu, vpsr_pp) = 0; - } - if (newpsr.up) - ipsr->up = 1; - if (newpsr.sp) - ipsr->sp = 1; - if (newpsr.i) { - if (vcpu->vcpu_info->evtchn_upcall_mask) - enabling_interrupts = 1; - vcpu->vcpu_info->evtchn_upcall_mask = 0; - } - if (newpsr.ic) - PSCB(vcpu, interrupt_collection_enabled) = 1; - if (newpsr.mfl) - ipsr->mfl = 1; - if (newpsr.mfh) - ipsr->mfh = 1; - if (newpsr.ac) - ipsr->ac = 1; - if (newpsr.up) - ipsr->up = 1; - if (newpsr.dt && newpsr.rt) - vcpu_set_metaphysical_mode(vcpu, FALSE); - else - vcpu_set_metaphysical_mode(vcpu, TRUE); - if (newpsr.be) - ipsr->be = 1; + + ipsr->be = newpsr.be; + ipsr->up = newpsr.up; + ipsr->ac = newpsr.ac; + ipsr->mfl = newpsr.mfl; + ipsr->mfh = newpsr.mfh; + + PSCB(vcpu, interrupt_collection_enabled) = newpsr.ic; + + if (newpsr.i && vcpu->vcpu_info->evtchn_upcall_mask) + enabling_interrupts = 1; + + vcpu->vcpu_info->evtchn_upcall_mask = !(newpsr.i); + + vcpu_set_metaphysical_mode(vcpu, !(newpsr.dt && newpsr.rt)); + + ipsr->dfl = newpsr.dfl; + PSCB(vcpu, vpsr_dfh) = newpsr.dfh; + ipsr->dfh = newpsr.dfh ? 
1 : PSCB(vcpu, hpsr_dfh); + + ipsr->sp = newpsr.sp; + + /* xenoprof: Don't change ipsr->pp, it is manipulated by xenoprof */ + PSCB(vcpu, vpsr_pp) = newpsr.pp; + if (enabling_interrupts && vcpu_check_pending_interrupts(vcpu) != SPURIOUS_VECTOR) PSCB(vcpu, pending_interruption) = 1; + return IA64_NO_FAULT; } @@ -1330,21 +1316,21 @@ IA64FAULT vcpu_rfi(VCPU * vcpu) { // TODO: Only allowed for current vcpu PSR psr; - u64 int_enable, ifs; + u64 ifs, int_enable, psr_ic; REGS *regs = vcpu_regs(vcpu); psr.i64 = PSCB(vcpu, ipsr); + int_enable = psr.ia64_psr.i; + psr_ic = psr.ia64_psr.ic; if (psr.ia64_psr.cpl < 3) psr.ia64_psr.cpl = 2; - int_enable = psr.ia64_psr.i; + if (psr.ia64_psr.dfh) { PSCB(vcpu, vpsr_dfh) = 1; } else { psr.ia64_psr.dfh = PSCB(vcpu, hpsr_dfh); PSCB(vcpu, vpsr_dfh) = 0; } - if (psr.ia64_psr.ic) - PSCB(vcpu, interrupt_collection_enabled) = 1; if (psr.ia64_psr.dt && psr.ia64_psr.rt && psr.ia64_psr.it) vcpu_set_metaphysical_mode(vcpu, FALSE); else @@ -1363,9 +1349,9 @@ IA64FAULT vcpu_rfi(VCPU * vcpu) regs->cr_ipsr = psr.i64; regs->cr_iip = PSCB(vcpu, iip); - PSCB(vcpu, interrupt_collection_enabled) = 1; vcpu_bsw1(vcpu); - vcpu->vcpu_info->evtchn_upcall_mask = !int_enable; + PSCB(vcpu, interrupt_collection_enabled) = !!(psr_ic); + vcpu->vcpu_info->evtchn_upcall_mask = !(int_enable); return IA64_NO_FAULT; } @@ -1526,6 +1512,26 @@ vcpu_get_domain_bundle(VCPU * vcpu, REGS // copy its value to the variable, tr, before use. TR_ENTRY tr; + // fast path: + // try to access gip with guest virtual address directly. + // This may cause tlb miss. see vcpu_translate(). Be careful! + swap_rr0 = (!region && PSCB(vcpu, metaphysical_mode)); + if (swap_rr0) { + set_virtual_rr0(); + } + *bundle = __get_domain_bundle(gip); + if (swap_rr0) { + set_metaphysical_rr0(); + } + + if (!bundle->i64[0] && !bundle->i64[1]) { + dprintk(XENLOG_INFO, "%s gip 0x%lx\n", __func__, gip); + } else { + // Okay, mDTC successed + return 1; + } + // mDTC failed, so try vTLB. + trp = vcpu_tr_lookup(vcpu, gip, rid, 0); if (trp != NULL) { tr = *trp; @@ -1545,28 +1551,13 @@ vcpu_get_domain_bundle(VCPU * vcpu, REGS tr = *trp; goto found; } -#if 0 tr = PSCBX(vcpu, dtlb); if (vcpu_match_tr_entry(&tr, gip, rid)) { goto found; } -#endif - // try to access gip with guest virtual address - // This may cause tlb miss. see vcpu_translate(). Be careful! - swap_rr0 = (!region && PSCB(vcpu, metaphysical_mode)); - if (swap_rr0) { - set_one_rr(0x0, PSCB(vcpu, rrs[0])); - } - *bundle = __get_domain_bundle(gip); - if (swap_rr0) { - set_metaphysical_rr0(); - } - if (bundle->i64[0] == 0 && bundle->i64[1] == 0) { - dprintk(XENLOG_INFO, "%s gip 0x%lx\n", __func__, gip); - return 0; - } - return 1; + // mDTC and vTLB failed. so reflect tlb miss into the guest. + return 0; found: gpip = ((tr.pte.ppn >> (tr.ps - 12)) << tr.ps) | @@ -1744,33 +1735,65 @@ IA64FAULT vcpu_tak(VCPU * vcpu, u64 vadr IA64FAULT vcpu_set_dbr(VCPU * vcpu, u64 reg, u64 val) { - // TODO: unimplemented DBRs return a reserved register fault - // TODO: Should set Logical CPU state, not just physical - ia64_set_dbr(reg, val); + if (reg >= IA64_NUM_DBG_REGS) + return IA64_RSVDREG_FAULT; + if ((reg & 1) == 0) { + /* Validate address. */ + if (val >= HYPERVISOR_VIRT_START && val <= HYPERVISOR_VIRT_END) + return IA64_ILLOP_FAULT; + } else { + if (!VMX_DOMAIN(vcpu)) { + /* Mask PL0. 
*/ + val &= ~(1UL << 56); + } + } + if (val != 0) + vcpu->arch.dbg_used |= (1 << reg); + else + vcpu->arch.dbg_used &= ~(1 << reg); + vcpu->arch.dbr[reg] = val; + if (vcpu == current) + ia64_set_dbr(reg, val); return IA64_NO_FAULT; } IA64FAULT vcpu_set_ibr(VCPU * vcpu, u64 reg, u64 val) { - // TODO: unimplemented IBRs return a reserved register fault - // TODO: Should set Logical CPU state, not just physical - ia64_set_ibr(reg, val); + if (reg >= IA64_NUM_DBG_REGS) + return IA64_RSVDREG_FAULT; + if ((reg & 1) == 0) { + /* Validate address. */ + if (val >= HYPERVISOR_VIRT_START && val <= HYPERVISOR_VIRT_END) + return IA64_ILLOP_FAULT; + } else { + if (!VMX_DOMAIN(vcpu)) { + /* Mask PL0. */ + val &= ~(1UL << 56); + } + } + if (val != 0) + vcpu->arch.dbg_used |= (1 << (reg + IA64_NUM_DBG_REGS)); + else + vcpu->arch.dbg_used &= ~(1 << (reg + IA64_NUM_DBG_REGS)); + vcpu->arch.ibr[reg] = val; + if (vcpu == current) + ia64_set_ibr(reg, val); return IA64_NO_FAULT; } IA64FAULT vcpu_get_dbr(VCPU * vcpu, u64 reg, u64 * pval) { - // TODO: unimplemented DBRs return a reserved register fault - u64 val = ia64_get_dbr(reg); - *pval = val; + if (reg >= IA64_NUM_DBG_REGS) + return IA64_RSVDREG_FAULT; + *pval = vcpu->arch.dbr[reg]; return IA64_NO_FAULT; } IA64FAULT vcpu_get_ibr(VCPU * vcpu, u64 reg, u64 * pval) { - // TODO: unimplemented IBRs return a reserved register fault - u64 val = ia64_get_ibr(reg); - *pval = val; + if (reg >= IA64_NUM_DBG_REGS) + return IA64_RSVDREG_FAULT; + *pval = vcpu->arch.ibr[reg]; return IA64_NO_FAULT; } @@ -1973,8 +1996,8 @@ unsigned long vcpu_get_rr_ve(VCPU * vcpu IA64FAULT vcpu_set_rr(VCPU * vcpu, u64 reg, u64 val) { PSCB(vcpu, rrs)[reg >> 61] = val; - // warning: set_one_rr() does it "live" - set_one_rr(reg, val); + if (vcpu == current) + set_one_rr(reg, val); return IA64_NO_FAULT; } @@ -2203,7 +2226,7 @@ IA64FAULT vcpu_itc_d(VCPU * vcpu, u64 pt if (!pteval) return IA64_ILLOP_FAULT; if (swap_rr0) - set_one_rr(0x0, PSCB(vcpu, rrs[0])); + set_virtual_rr0(); vcpu_itc_no_srlz(vcpu, 2, ifa, pteval, pte, logps, &entry); if (swap_rr0) set_metaphysical_rr0(); @@ -2230,7 +2253,7 @@ IA64FAULT vcpu_itc_i(VCPU * vcpu, u64 pt if (!pteval) return IA64_ILLOP_FAULT; if (swap_rr0) - set_one_rr(0x0, PSCB(vcpu, rrs[0])); + set_virtual_rr0(); vcpu_itc_no_srlz(vcpu, 1, ifa, pteval, pte, logps, &entry); if (swap_rr0) set_metaphysical_rr0(); diff -Naurp xen/arch/ia64/xen/vhpt.c xen-redhat/arch/ia64/xen/vhpt.c --- xen/arch/ia64/xen/vhpt.c +++ xen-redhat/arch/ia64/xen/vhpt.c @@ -137,8 +137,8 @@ void vhpt_init(void) panic("vhpt_init: bad VHPT alignment!\n"); __get_cpu_var(vhpt_paddr) = paddr; __get_cpu_var(vhpt_pend) = paddr + (1 << VHPT_SIZE_LOG2) - 1; - printk("vhpt_init: vhpt paddr=0x%lx, end=0x%lx\n", - paddr, __get_cpu_var(vhpt_pend)); + printk(XENLOG_DEBUG "vhpt_init: vhpt paddr=0x%lx, end=0x%lx\n", + paddr, __get_cpu_var(vhpt_pend)); vhpt_erase(paddr); // we don't enable VHPT here. // context_switch() or schedule_tail() does it. @@ -220,31 +220,20 @@ domain_purge_swtc_entries_vcpu_dirty_mas // (e.g. vcpu == current), smp_mb() is unnecessary. void vcpu_flush_vtlb_all(struct vcpu *v) { - if (VMX_DOMAIN(v)) { - /* This code may be call for remapping shared_info and - grant_table share page from guest_physmap_remove_page() - in arch_memory_op() XENMEM_add_to_physmap to realize - PV-on-HVM feature. */ - /* FIXME: This is not SMP-safe yet about p2m table */ - /* Purge vTLB for VT-i domain */ - thash_purge_all(v); - } - else { - /* First VCPU tlb. 
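Note on the vcpu_set_dbr()/vcpu_get_dbr() rework above: instead of poking the physical debug registers unconditionally, the value is validated, kept in per-vcpu state together with a dbg_used bitmap, and only written to hardware when the vcpu being modified is the one currently running. A stand-alone sketch of that shadow-register pattern (all structures and names here are illustrative stand-ins, not the Xen ones):

#include <stdint.h>
#include <stdio.h>

#define NUM_DBG_REGS 8

struct vcpu_state {
    int is_current;              /* stand-in for "vcpu == current" */
    uint64_t dbr[NUM_DBG_REGS];  /* shadow copy, survives context switch */
    unsigned int dbg_used;       /* bitmap of registers holding non-zero values */
};

static void hw_set_dbr(int reg, uint64_t val)
{
    /* stand-in for the real register write (ia64_set_dbr-style) */
    printf("hw dbr[%d] <- 0x%llx\n", reg, (unsigned long long)val);
}

static int vcpu_set_dbr_sketch(struct vcpu_state *v, int reg, uint64_t val)
{
    if (reg >= NUM_DBG_REGS)
        return -1;                       /* reserved-register fault */
    if (val != 0)
        v->dbg_used |= 1u << reg;
    else
        v->dbg_used &= ~(1u << reg);
    v->dbr[reg] = val;                   /* always update the shadow */
    if (v->is_current)
        hw_set_dbr(reg, val);            /* hardware only if the vcpu is running */
    return 0;
}

static int vcpu_get_dbr_sketch(struct vcpu_state *v, int reg, uint64_t *out)
{
    if (reg >= NUM_DBG_REGS)
        return -1;
    *out = v->dbr[reg];                  /* read the shadow, never the hardware */
    return 0;
}

int main(void)
{
    struct vcpu_state v = { .is_current = 1 };
    uint64_t val;
    vcpu_set_dbr_sketch(&v, 0, 0x1000);
    vcpu_get_dbr_sketch(&v, 0, &val);
    printf("shadow dbr[0] = 0x%llx, used mask = 0x%x\n",
           (unsigned long long)val, v.dbg_used);
    return 0;
}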
*/ - vcpu_purge_tr_entry(&PSCBX(v,dtlb)); - vcpu_purge_tr_entry(&PSCBX(v,itlb)); - smp_mb(); + /* First VCPU tlb. */ + vcpu_purge_tr_entry(&PSCBX(v,dtlb)); + vcpu_purge_tr_entry(&PSCBX(v,itlb)); + smp_mb(); - /* Then VHPT. */ - if (HAS_PERVCPU_VHPT(v->domain)) - vcpu_vhpt_flush(v); - else - local_vhpt_flush(); - smp_mb(); + /* Then VHPT. */ + if (HAS_PERVCPU_VHPT(v->domain)) + vcpu_vhpt_flush(v); + else + local_vhpt_flush(); + smp_mb(); - /* Then mTLB. */ - local_flush_tlb_all(); - } + /* Then mTLB. */ + local_flush_tlb_all(); /* We could clear bit in d->domain_dirty_cpumask only if domain d in not running on this processor. There is currently no easy way to @@ -268,6 +257,15 @@ void domain_flush_vtlb_all(struct domain if (!v->is_initialised) continue; + if (VMX_DOMAIN(v)) { + // This code may be called for remapping shared_info + // and grant_table from guest_physmap_remove_page() + // in arch_memory_op() XENMEM_add_to_physmap to realize + // PV-on-HVM feature. + vmx_vcpu_flush_vtlb_all(v); + continue; + } + if (v->processor == cpu) vcpu_flush_vtlb_all(v); else diff -Naurp xen/arch/ia64/xen/xensetup.c xen-redhat/arch/ia64/xen/xensetup.c --- xen/arch/ia64/xen/xensetup.c +++ xen-redhat/arch/ia64/xen/xensetup.c @@ -19,6 +19,7 @@ #include <xen/serial.h> #include <xen/trace.h> #include <xen/keyhandler.h> +#include <xen/vga.h> #include <asm/meminit.h> #include <asm/page.h> #include <asm/setup.h> @@ -46,7 +47,6 @@ extern long is_platform_hp_ski(void); extern void early_setup_arch(char **); extern void late_setup_arch(char **); extern void hpsim_serial_init(void); -extern void alloc_dom0(void); extern void setup_per_cpu_areas(void); extern void mem_init(void); extern void init_IRQ(void); @@ -81,8 +81,10 @@ boolean_param("xencons_poll", opt_xencon * elilo chooses 256M as alignment when relocating, alignment issue * on IPF can be addressed. */ -unsigned int opt_xenheap_megabytes = XENHEAP_DEFAULT_MB; -unsigned long xenheap_size = XENHEAP_DEFAULT_SIZE; +static unsigned int opt_xenheap_megabytes = XENHEAP_DEFAULT_MB; +integer_param("xenheap_megabytes", opt_xenheap_megabytes); + +unsigned long xenheap_size; extern long running_on_sim; unsigned long xen_pstart; void *xen_heap_start __read_mostly; @@ -274,6 +276,20 @@ void start_kernel(void) } serial_init_preirq(); +#ifdef CONFIG_VGA + /* Plug in a default VGA mode */ + vga_console_info.video_type = XEN_VGATYPE_TEXT_MODE_3; + vga_console_info.u.text_mode_3.font_height = 16; /* generic VGA? 
*/ + vga_console_info.u.text_mode_3.cursor_x = + ia64_boot_param->console_info.orig_x; + vga_console_info.u.text_mode_3.cursor_y = + ia64_boot_param->console_info.orig_y; + vga_console_info.u.text_mode_3.rows = + ia64_boot_param->console_info.num_rows; + vga_console_info.u.text_mode_3.columns = + ia64_boot_param->console_info.num_cols; +#endif + init_console(); set_printk_prefix("(XEN) "); @@ -290,6 +306,7 @@ void start_kernel(void) printk("Xen command line: %s\n", saved_command_line); /* xenheap should be in same TR-covered range with xen image */ + xenheap_size = opt_xenheap_megabytes << 20; xenheap_phys_end = xen_pstart + xenheap_size; printk("xen image pstart: 0x%lx, xenheap pend: 0x%lx\n", xen_pstart, xenheap_phys_end); @@ -409,8 +426,6 @@ void start_kernel(void) trap_init(); - alloc_dom0(); - init_xenheap_pages(__pa(xen_heap_start), xenheap_phys_end); printk("Xen heap: %luMB (%lukB)\n", (xenheap_phys_end-__pa(xen_heap_start)) >> 20, diff -Naurp xen/arch/powerpc/domain.c xen-redhat/arch/powerpc/domain.c --- xen/arch/powerpc/domain.c +++ xen-redhat/arch/powerpc/domain.c @@ -313,13 +313,13 @@ static void relinquish_memory(struct dom spin_unlock_recursive(&d->page_alloc_lock); } -void domain_relinquish_resources(struct domain *d) +int domain_relinquish_resources(struct domain *d) { relinquish_memory(d, &d->xenpage_list); relinquish_memory(d, &d->page_list); xfree(d->arch.foreign_mfns); xfree(d->arch.p2m); - return; + return 0; } void arch_dump_domain_info(struct domain *d) diff -Naurp xen/arch/powerpc/mm.c xen-redhat/arch/powerpc/mm.c --- xen/arch/powerpc/mm.c +++ xen-redhat/arch/powerpc/mm.c @@ -338,7 +338,7 @@ uint allocate_extents(struct domain *d, /* Build p2m mapping for newly allocated extent. */ mfn = page_to_mfn(pg); for (i = 0; i < (1 << ext_order); i++) - guest_physmap_add_page(d, gpfn + i, mfn + i); + guest_physmap_add_page(d, gpfn + i, mfn + i, 0); /* Bump starting PFN by extent size pages. */ gpfn += ext_nrpages; @@ -383,7 +383,7 @@ int allocate_rma(struct domain *d, unsig clear_page((void *)page_to_maddr(&d->arch.rma_page[i])); /* Set up p2m mapping for RMA. 
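The powerpc callers above now pass a trailing page-order argument to guest_physmap_add_page() (0 in every case here, i.e. a single page). The order simply describes 2^order contiguous frames handled in one call; a hedged sketch of that convention, with hypothetical helper names:

#include <stdio.h>

/* Hypothetical single-page mapper standing in for the real p2m update. */
static void map_one(unsigned long gpfn, unsigned long mfn)
{
    printf("gpfn 0x%lx -> mfn 0x%lx\n", gpfn, mfn);
}

/* An order-N call covers 1 << N contiguous frames starting at gpfn/mfn. */
static void physmap_add(unsigned long gpfn, unsigned long mfn, unsigned int order)
{
    unsigned long i;
    for (i = 0; i < (1UL << order); i++)
        map_one(gpfn + i, mfn + i);
}

int main(void)
{
    physmap_add(0x1000, 0x80000, 0);  /* order 0: one page, as in the hunks above */
    physmap_add(0x2000, 0x90000, 2);  /* order 2: four contiguous pages */
    return 0;
}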
*/ - guest_physmap_add_page(d, i, mfn+i); + guest_physmap_add_page(d, i, mfn+i, 0); } /* shared_info uses last page of RMA */ @@ -579,7 +579,7 @@ void guest_physmap_add_page( } void guest_physmap_remove_page( - struct domain *d, unsigned long gpfn, unsigned long mfn) + struct domain *d, unsigned long gpfn, unsigned long mfn, int order) { if (page_get_owner(mfn_to_page(mfn)) != d) { printk("Won't unmap foreign MFN 0x%lx for DOM%d\n", mfn, d->domain_id); diff -Naurp xen/arch/powerpc/sysctl.c xen-redhat/arch/powerpc/sysctl.c --- xen/arch/powerpc/sysctl.c +++ xen-redhat/arch/powerpc/sysctl.c @@ -45,10 +45,10 @@ long arch_do_sysctl(struct xen_sysctl *s cpus_weight(cpu_sibling_map[0]); pi->cores_per_socket = cpus_weight(cpu_core_map[0]) / pi->threads_per_core; - pi->sockets_per_node = - num_online_cpus() / cpus_weight(cpu_core_map[0]); + pi->sockets_per_node = num_online_cpus() / + (num_online_nodes() * pi->cores_per_socket * pi->threads_per_core); - pi->nr_nodes = 1; + pi->nr_nodes = num_online_nodes(); pi->total_pages = total_pages; pi->free_pages = avail_domheap_pages(); pi->cpu_khz = cpu_khz; diff -Naurp xen/arch/x86/acpi/boot.c xen-redhat/arch/x86/acpi/boot.c --- xen/arch/x86/acpi/boot.c +++ xen-redhat/arch/x86/acpi/boot.c @@ -36,6 +36,7 @@ #include <asm/apic.h> #include <asm/io.h> #include <asm/mpspec.h> +#include <asm/processor.h> #include <mach_apic.h> #include <mach_mpparse.h> @@ -918,5 +919,21 @@ int __init acpi_boot_init(void) acpi_table_parse(ACPI_HPET, acpi_parse_hpet); + acpi_dmar_init(); + return 0; } + +unsigned int acpi_get_processor_id(unsigned int cpu) +{ + unsigned int acpiid, apicid; + + if ((apicid = x86_cpu_to_apicid[cpu]) == 0xff) + return 0xff; + + for (acpiid = 0; acpiid < ARRAY_SIZE(x86_acpiid_to_apicid); acpiid++) + if (x86_acpiid_to_apicid[acpiid] == apicid) + return acpiid; + + return 0xff; +} diff -Naurp xen/arch/x86/apic.c xen-redhat/arch/x86/apic.c --- xen/arch/x86/apic.c +++ xen-redhat/arch/x86/apic.c @@ -40,7 +40,7 @@ /* * Knob to control our willingness to enable the local APIC. */ -int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ +static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ /* * Debug level @@ -704,7 +704,7 @@ static void apic_pm_activate(void) static void __init lapic_disable(char *str) { enable_local_apic = -1; - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + setup_clear_cpu_cap(X86_FEATURE_APIC); } custom_param("nolapic", lapic_disable); @@ -737,7 +737,7 @@ static int __init detect_init_APIC (void switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) || - (boot_cpu_data.x86 == 15)) + (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x17)) break; goto no_apic; case X86_VENDOR_INTEL: diff -Naurp xen/arch/x86/boot/head.S xen-redhat/arch/x86/boot/head.S --- xen/arch/x86/boot/head.S +++ xen-redhat/arch/x86/boot/head.S @@ -98,6 +98,7 @@ __start: mov $0x80000001,%eax cpuid 1: mov %edx,sym_phys(cpuid_ext_features) + mov %edx,sym_phys(boot_cpu_data)+CPUINFO_ext_features #if defined(__x86_64__) /* Check for availability of long mode. */ diff -Naurp xen/arch/x86/boot/x86_32.S xen-redhat/arch/x86/boot/x86_32.S --- xen/arch/x86/boot/x86_32.S +++ xen-redhat/arch/x86/boot/x86_32.S @@ -78,7 +78,7 @@ idt_descr: .word 0 gdt_descr: .word LAST_RESERVED_GDT_BYTE - .long gdt_table - FIRST_RESERVED_GDT_BYTE + .long boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE .align PAGE_SIZE, 0 /* NB. Rings != 0 get access up to MACH2PHYS_VIRT_END. 
This allows access to */ @@ -86,16 +86,17 @@ gdt_descr: #define GUEST_DESC(d) \ .long ((MACH2PHYS_VIRT_END - 1) >> 12) & 0xffff, \ ((MACH2PHYS_VIRT_END - 1) >> 12) & (0xf << 16) | (d) -ENTRY(gdt_table) - .quad 0x0000000000000000 /* unused */ +ENTRY(boot_cpu_gdt_table) + .quad 0x0000000000000000 /* double fault TSS */ .quad 0x00cf9a000000ffff /* 0xe008 ring 0 4.00GB code at 0x0 */ .quad 0x00cf92000000ffff /* 0xe010 ring 0 4.00GB data at 0x0 */ GUEST_DESC(0x00c0ba00) /* 0xe019 ring 1 3.xxGB code at 0x0 */ GUEST_DESC(0x00c0b200) /* 0xe021 ring 1 3.xxGB data at 0x0 */ GUEST_DESC(0x00c0fa00) /* 0xe02b ring 3 3.xxGB code at 0x0 */ GUEST_DESC(0x00c0f200) /* 0xe033 ring 3 3.xxGB data at 0x0 */ - .quad 0x0000000000000000 /* unused */ - .fill 2*NR_CPUS,8,0 /* space for TSS and LDT per CPU */ + .fill (PER_CPU_GDT_ENTRY - FLAT_RING3_DS / 8 - 1), 8, 0 + .quad 0x0000910000000000 /* per-CPU entry (limit == cpu) */ + .align PAGE_SIZE,0 #ifdef CONFIG_X86_PAE .align 32 diff -Naurp xen/arch/x86/boot/x86_64.S xen-redhat/arch/x86/boot/x86_64.S --- xen/arch/x86/boot/x86_64.S +++ xen-redhat/arch/x86/boot/x86_64.S @@ -85,7 +85,7 @@ multiboot_ptr: .word 0 gdt_descr: .word LAST_RESERVED_GDT_BYTE - .quad gdt_table - FIRST_RESERVED_GDT_BYTE + .quad boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE .word 0,0,0 idt_descr: @@ -96,7 +96,7 @@ ENTRY(stack_start) .quad cpu0_stack .align PAGE_SIZE, 0 -ENTRY(gdt_table) +ENTRY(boot_cpu_gdt_table) .quad 0x0000000000000000 /* unused */ .quad 0x00af9a000000ffff /* 0xe008 ring 0 code, 64-bit mode */ .quad 0x00cf92000000ffff /* 0xe010 ring 0 data */ @@ -105,13 +105,13 @@ ENTRY(gdt_table) .quad 0x00cff2000000ffff /* 0xe02b ring 3 data */ .quad 0x00affa000000ffff /* 0xe033 ring 3 code, 64-bit mode */ .quad 0x00cf9a000000ffff /* 0xe038 ring 0 code, compatibility */ - .org gdt_table - FIRST_RESERVED_GDT_BYTE + __TSS(0) * 8 - .fill 4*NR_CPUS,8,0 /* space for TSS and LDT per CPU */ + .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0 + .quad 0x0000910000000000 /* per-CPU entry (limit == cpu) */ .align PAGE_SIZE, 0 /* NB. Even rings != 0 get access to the full 4Gb, as only the */ /* (compatibility) machine->physical mapping table lives there. */ -ENTRY(compat_gdt_table) +ENTRY(boot_cpu_compat_gdt_table) .quad 0x0000000000000000 /* unused */ .quad 0x00af9a000000ffff /* 0xe008 ring 0 code, 64-bit mode */ .quad 0x00cf92000000ffff /* 0xe010 ring 0 data */ @@ -120,5 +120,6 @@ ENTRY(compat_gdt_table) .quad 0x00cffa000000ffff /* 0xe02b ring 3 code, compatibility */ .quad 0x00cff2000000ffff /* 0xe033 ring 3 data */ .quad 0x00cf9a000000ffff /* 0xe038 ring 0 code, compatibility */ - .org compat_gdt_table - FIRST_RESERVED_GDT_BYTE + __TSS(0) * 8 - .fill 4*NR_CPUS,8,0 /* space for TSS and LDT per CPU */ + .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0 + .quad 0x0000910000000000 /* per-CPU entry (limit == cpu) */ + .align PAGE_SIZE, 0 diff -Naurp xen/arch/x86/cpu/amd.c xen-redhat/arch/x86/cpu/amd.c --- xen/arch/x86/cpu/amd.c +++ xen-redhat/arch/x86/cpu/amd.c @@ -3,6 +3,7 @@ #include <xen/bitops.h> #include <xen/mm.h> #include <xen/smp.h> +#include <xen/pci.h> #include <asm/io.h> #include <asm/msr.h> #include <asm/processor.h> @@ -66,19 +67,6 @@ static int c1_ramping_may_cause_clock_dr return 1; } -/* PCI access functions. Should be safe to use 0xcf8/0xcfc port accesses here. 
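For context on the removal that follows: the helpers being deleted from cpu/amd.c open-coded the legacy type-1 configuration mechanism, writing an address dword to port 0xcf8 and transferring the data through 0xcfc, which the file now does through the shared pci_conf_* accessors instead. A stand-alone sketch of just that address encoding (no actual port I/O; values are examples):

#include <stdint.h>
#include <stdio.h>

/* Type-1 PCI config address as written to port 0xcf8:
 * bit 31 = enable, bits 23:16 = bus, 15:11 = device,
 * 10:8 = function, 7:2 = dword-aligned register offset. */
static uint32_t cf8_address(uint32_t bus, uint32_t dev, uint32_t fn, uint32_t reg)
{
    return (1u << 31) | (bus << 16) | (dev << 11) | (fn << 8) | (reg & ~3u);
}

int main(void)
{
    /* PMM7 of the first Northbridge: bus 0, dev 0x18, fn 3, reg 0x87 */
    uint32_t addr = cf8_address(0, 0x18, 3, 0x87);
    printf("0xcf8 <- 0x%08x, data port 0xcfc + %d\n", (unsigned int)addr, 0x87 & 3);
    return 0;
}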
*/ -static u8 pci_read_byte(u32 bus, u32 dev, u32 fn, u32 reg) -{ - outl((1U<<31) | (bus << 16) | (dev << 11) | (fn << 8) | (reg & ~3), 0xcf8); - return inb(0xcfc + (reg & 3)); -} - -static void pci_write_byte(u32 bus, u32 dev, u32 fn, u32 reg, u8 val) -{ - outl((1U<<31) | (bus << 16) | (dev << 11) | (fn << 8) | (reg & ~3), 0xcf8); - outb(val, 0xcfc + (reg & 3)); -} - /* * Disable C1-Clock ramping if enabled in PMM7.CpuLowPwrEnh on 8th-generation * cores only. Assume BIOS has setup all Northbridges equivalently. @@ -86,18 +74,20 @@ static void pci_write_byte(u32 bus, u32 static void disable_c1_ramping(void) { u8 pmm7; - int node; + int node, nr_nodes; - for (node=0; node < NR_CPUS; node++) { - /* PMM7: bus=0, dev=0x18+node, function=0x3, register=0x87. */ - pmm7 = pci_read_byte(0, 0x18+node, 0x3, 0x87); - /* Invalid read means we've updated every Northbridge. */ - if (pmm7 == 0xFF) - break; - pmm7 &= 0xFC; /* clear pmm7[1:0] */ - pci_write_byte(0, 0x18+node, 0x3, 0x87, pmm7); - printk ("AMD: Disabling C1 Clock Ramping Node #%x\n", node); - } + /* Read the number of nodes from the first Northbridge. */ + nr_nodes = ((pci_conf_read32(0, 0x18, 0x0, 0x60)>>4)&0x07)+1; + for (node = 0; node < nr_nodes; node++) { + /* PMM7: bus=0, dev=0x18+node, function=0x3, register=0x87. */ + pmm7 = pci_conf_read8(0, 0x18+node, 0x3, 0x87); + /* Invalid read means we've updated every Northbridge. */ + if (pmm7 == 0xFF) + break; + pmm7 &= 0xFC; /* clear pmm7[1:0] */ + pci_conf_write8(0, 0x18+node, 0x3, 0x87, pmm7); + printk ("AMD: Disabling C1 Clock Ramping Node #%x\n", node); + } } static void __init init_amd(struct cpuinfo_x86 *c) @@ -278,7 +268,7 @@ static void __init init_amd(struct cpuin } switch (c->x86) { - case 15: + case 0xf ... 0x17: set_bit(X86_FEATURE_K8, c->x86_capability); break; case 6: @@ -303,11 +293,8 @@ static void __init init_amd(struct cpuin display_cacheinfo(c); - if (cpuid_eax(0x80000000) >= 0x80000008) { + if (cpuid_eax(0x80000000) >= 0x80000008) c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; - if (c->x86_max_cores & (c->x86_max_cores - 1)) - c->x86_max_cores = 1; - } if (cpuid_eax(0x80000000) >= 0x80000007) { c->x86_power = cpuid_edx(0x80000007); @@ -317,15 +304,18 @@ static void __init init_amd(struct cpuin #ifdef CONFIG_X86_HT /* - * On a AMD dual core setup the lower bits of the APIC id + * On a AMD multi core setup the lower bits of the APIC id * distingush the cores. Assumes number of cores is a power * of two. */ if (c->x86_max_cores > 1) { int cpu = smp_processor_id(); - unsigned bits = 0; - while ((1 << bits) < c->x86_max_cores) - bits++; + unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf; + + if (bits == 0) { + while ((1 << bits) < c->x86_max_cores) + bits++; + } cpu_core_id[cpu] = phys_proc_id[cpu] & ((1<<bits)-1); phys_proc_id[cpu] >>= bits; printk(KERN_INFO "CPU %d(%d) -> Core %d\n", diff -Naurp xen/arch/x86/cpu/common.c xen-redhat/arch/x86/cpu/common.c --- xen/arch/x86/cpu/common.c +++ xen-redhat/arch/x86/cpu/common.c @@ -23,6 +23,20 @@ static int disable_x86_serial_nr __devin struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {}; +/* + * Default host IA32_CR_PAT value to cover all memory types. + * BIOS usually sets it to 0x07040600070406. + */ +u64 host_pat = 0x050100070406; + +static unsigned int __cpuinitdata cleared_caps[NCAPINTS]; + +void __init setup_clear_cpu_cap(unsigned int cap) +{ + __clear_bit(cap, boot_cpu_data.x86_capability); + __set_bit(cap, cleared_caps); +} + static void default_init(struct cpuinfo_x86 * c) { /* Not much we can do here... 
*/ @@ -220,6 +234,7 @@ static void __init early_cpu_detect(void if (c->x86 >= 0x6) c->x86_model += ((tfms >> 16) & 0xF) << 4; c->x86_mask = tfms & 15; + cap0 &= ~cleared_caps[0]; if (cap0 & (1<<19)) c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8; c->x86_capability[0] = cap0; /* Added for Xen bootstrap */ @@ -250,10 +265,10 @@ void __devinit generic_identify(struct c c->x86_capability[4] = excap; c->x86 = (tfms >> 8) & 15; c->x86_model = (tfms >> 4) & 15; - if (c->x86 == 0xf) { + if (c->x86 == 0xf) c->x86 += (tfms >> 20) & 0xff; + if (c->x86 >= 0x6) c->x86_model += ((tfms >> 16) & 0xF) << 4; - } c->x86_mask = tfms & 15; } else { /* Have CPUID level 0 only - unheard of */ @@ -378,6 +393,9 @@ void __devinit identify_cpu(struct cpuin if (disable_pse) clear_bit(X86_FEATURE_PSE, c->x86_capability); + for (i = 0 ; i < NCAPINTS ; ++i) + c->x86_capability[i] &= ~cleared_caps[i]; + /* If the model name is still unset, do table lookup. */ if ( !c->x86_model_id[0] ) { char *p; @@ -422,8 +440,6 @@ void __devinit identify_cpu(struct cpuin if (c == &boot_cpu_data) mtrr_bp_init(); - else - mtrr_ap_init(); } #ifdef CONFIG_X86_HT @@ -549,7 +565,10 @@ void __devinit cpu_init(void) { int cpu = smp_processor_id(); struct tss_struct *t = &init_tss[cpu]; - char gdt_load[10]; + struct desc_ptr gdt_desc = { + .base = (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY), + .limit = LAST_RESERVED_GDT_BYTE + }; if (cpu_test_and_set(cpu, cpu_initialized)) { printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); @@ -557,12 +576,16 @@ void __devinit cpu_init(void) } printk(KERN_INFO "Initializing CPU#%d\n", cpu); + if (cpu_has_pat) + wrmsrl(MSR_IA32_CR_PAT, host_pat); + if (cpu_has_vme || cpu_has_tsc || cpu_has_de) clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE; - *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(current); - __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) ); + /* Install correct page table. */ + write_ptbase(current); + + __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_desc) ); /* No nested task. */ __asm__("pushf ; andw $0xbfff,(%"__OP"sp) ; popf"); @@ -590,7 +613,4 @@ void __devinit cpu_init(void) #define CD(register) __asm__("mov %0,%%db" #register ::"r"(0UL) ); CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7); #undef CD - - /* Install correct page table. */ - write_ptbase(current); } diff -Naurp xen/arch/x86/cpu/intel.c xen-redhat/arch/x86/cpu/intel.c --- xen/arch/x86/cpu/intel.c +++ xen-redhat/arch/x86/cpu/intel.c @@ -118,6 +118,12 @@ static void __devinit init_intel(struct select_idle_routine(c); l2 = init_intel_cacheinfo(c); + if (c->cpuid_level > 9) { + unsigned eax = cpuid_eax(10); + /* Check for version and the number of counters */ + if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) + set_bit(X86_FEATURE_ARCH_PERFMON, c->x86_capability); + } /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */ if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) diff -Naurp xen/arch/x86/cpu/mcheck/k7.c xen-redhat/arch/x86/cpu/mcheck/k7.c --- xen/arch/x86/cpu/mcheck/k7.c +++ xen-redhat/arch/x86/cpu/mcheck/k7.c @@ -75,6 +75,9 @@ void amd_mcheck_init(struct cpuinfo_x86 machine_check_vector = k7_machine_check; wmb(); + if (!cpu_has(c, X86_FEATURE_MCE)) + return; + printk (KERN_INFO "Intel machine check architecture supported.\n"); rdmsr (MSR_IA32_MCG_CAP, l, h); if (l & (1<<8)) /* Control register present ? 
*/ diff -Naurp xen/arch/x86/cpu/mcheck/mce.c xen-redhat/arch/x86/cpu/mcheck/mce.c --- xen/arch/x86/cpu/mcheck/mce.c +++ xen-redhat/arch/x86/cpu/mcheck/mce.c @@ -34,8 +34,7 @@ void mcheck_init(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_AMD: - if (c->x86==6 || c->x86==15) - amd_mcheck_init(c); + amd_mcheck_init(c); break; case X86_VENDOR_INTEL: diff -Naurp xen/arch/x86/cpu/mtrr/generic.c xen-redhat/arch/x86/cpu/mtrr/generic.c --- xen/arch/x86/cpu/mtrr/generic.c +++ xen-redhat/arch/x86/cpu/mtrr/generic.c @@ -202,7 +202,9 @@ static int set_mtrr_var_ranges(unsigned return changed; } -static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi) +static u32 deftype_lo, deftype_hi; + +static unsigned long set_mtrr_state(void) /* [SUMMARY] Set the MTRR state for this CPU. <state> The MTRR state information to read. <ctxt> Some relevant CPU context. @@ -233,7 +235,6 @@ static unsigned long set_mtrr_state(u32 static unsigned long cr4 = 0; -static u32 deftype_lo, deftype_hi; static DEFINE_SPINLOCK(set_atomicity_lock); /* @@ -300,7 +301,7 @@ static void generic_set_all(void) prepare_set(); /* Actually set the state */ - mask = set_mtrr_state(deftype_lo,deftype_hi); + mask = set_mtrr_state(); post_set(); local_irq_restore(flags); diff -Naurp xen/arch/x86/cpu/mtrr/main.c xen-redhat/arch/x86/cpu/mtrr/main.c --- xen/arch/x86/cpu/mtrr/main.c +++ xen-redhat/arch/x86/cpu/mtrr/main.c @@ -55,7 +55,7 @@ u32 num_var_ranges = 0; unsigned int *usage_table; static DECLARE_MUTEX(mtrr_sem); -u32 size_or_mask, size_and_mask; +u64 size_or_mask, size_and_mask; static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {}; @@ -134,6 +134,17 @@ struct set_mtrr_data { mtrr_type smp_type; }; +/* As per the IA32 SDM vol-3: 10.11.8 MTRR Considerations in MP Systems section + * MTRR updates must be synchronized across all the processors. + * This flag avoids multiple cpu synchronizations while booting each cpu. + * At the boot & resume time, this flag is turned on in mtrr_aps_sync_begin(). + * Using this flag, the mtrr initialization (and the all-cpus sync up) in the + * mtrr_ap_init() is avoided while booting each cpu. + * After all the cpus have come up, mtrr_aps_sync_end() synchronizes all + * the cpus and updates mtrrs on all of them. Then this flag is turned off. + */ +int hold_mtrr_updates_on_aps; + #ifdef CONFIG_SMP static void ipi_handler(void *info) @@ -151,11 +162,13 @@ static void ipi_handler(void *info) cpu_relax(); /* The master has cleared me to execute */ - if (data->smp_reg != ~0U) + if (data->smp_reg == ~0U) /* update all mtrr registers */ + /* At the cpu hot-add time this will reinitialize mtrr + * registers on the existing cpus. It is ok. */ + mtrr_if->set_all(); + else /* single mtrr register update */ mtrr_if->set(data->smp_reg, data->smp_base, data->smp_size, data->smp_type); - else - mtrr_if->set_all(); atomic_dec(&data->count); while(atomic_read(&data->gate)) @@ -240,7 +253,11 @@ static void set_mtrr(unsigned int reg, u * to replicate across all the APs. * If we're doing that @reg is set to something special... */ - if (reg != ~0U) + if (reg == ~0U) /* update all mtrr registers */ + /* at boot or resume time, this will reinitialize the mtrrs on + * the bp. It is ok.
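The flag introduced above changes when MTRRs get programmed: each AP skips its own rendezvous while hold_mtrr_updates_on_aps is set, and a single all-cpu sync happens once every AP is online (and again on resume). A rough, self-contained sketch of that calling order; only the mtrr_* names come from the patch, the bodies are print-statement stand-ins:

#include <stdio.h>

static int hold_mtrr_updates_on_aps;   /* mirrors the flag added above */

static void set_mtrr_all(void) { printf("rendezvous: reprogram MTRRs on every cpu\n"); }

static void mtrr_aps_sync_begin(void) { hold_mtrr_updates_on_aps = 1; }

static void mtrr_ap_init(void)
{
    if (hold_mtrr_updates_on_aps)
        return;                        /* skip the per-AP rendezvous during boot */
    set_mtrr_all();
}

static void mtrr_aps_sync_end(void)
{
    set_mtrr_all();                    /* one sync once every AP is online */
    hold_mtrr_updates_on_aps = 0;
}

int main(void)
{
    int cpu;
    mtrr_aps_sync_begin();             /* boot (or resume) starts here */
    for (cpu = 1; cpu < 4; cpu++) {
        printf("booting AP %d\n", cpu);
        mtrr_ap_init();                /* no-op while the flag is held */
    }
    mtrr_aps_sync_end();               /* single MTRR sync across all cpus */
    return 0;
}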
*/ + mtrr_if->set_all(); + else /* update the single mtrr register */ mtrr_if->set(reg,base,size,type); /* wait for the others */ @@ -589,8 +606,8 @@ void __init mtrr_bp_init(void) boot_cpu_data.x86_mask == 0x4)) phys_addr = 36; - size_or_mask = ~((1 << (phys_addr - PAGE_SHIFT)) - 1); - size_and_mask = ~size_or_mask & 0xfff00000; + size_or_mask = ~((1ULL << (phys_addr - PAGE_SHIFT)) - 1); + size_and_mask = ~size_or_mask & 0xfffff00000ULL; } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && boot_cpu_data.x86 == 6) { /* VIA C* family have Intel style MTRRs, but @@ -639,9 +656,7 @@ void __init mtrr_bp_init(void) void mtrr_ap_init(void) { - unsigned long flags; - - if (!mtrr_if || !use_intel()) + if (!mtrr_if || !use_intel() || hold_mtrr_updates_on_aps) return; /* * Ideally we should hold mtrr_sem here to avoid mtrr entries changed, @@ -651,11 +666,22 @@ void mtrr_ap_init(void) * 2.cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug lock to * prevent mtrr entry changes */ - local_irq_save(flags); + set_mtrr(~0U, 0, 0, 0); +} - mtrr_if->set_all(); +void mtrr_aps_sync_begin(void) +{ + if (!use_intel()) + return; + hold_mtrr_updates_on_aps = 1; +} - local_irq_restore(flags); +void mtrr_aps_sync_end(void) +{ + if (!use_intel()) + return; + set_mtrr(~0U, 0, 0, 0); + hold_mtrr_updates_on_aps = 0; } static int __init mtrr_init_finialize(void) diff -Naurp xen/arch/x86/cpu/mtrr/mtrr.h xen-redhat/arch/x86/cpu/mtrr/mtrr.h --- xen/arch/x86/cpu/mtrr/mtrr.h +++ xen-redhat/arch/x86/cpu/mtrr/mtrr.h @@ -83,7 +83,7 @@ void get_mtrr_state(void); extern void set_mtrr_ops(struct mtrr_ops * ops); -extern u32 size_or_mask, size_and_mask; +extern u64 size_or_mask, size_and_mask; extern struct mtrr_ops * mtrr_if; #define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) diff -Naurp xen/arch/x86/crash.c xen-redhat/arch/x86/crash.c --- xen/arch/x86/crash.c +++ xen-redhat/arch/x86/crash.c @@ -102,6 +102,7 @@ void machine_crash_shutdown(void) hvm_disable(); info = kexec_crash_save_info(); + info->xen_phys_start = xen_phys_start; info->dom0_pfn_to_mfn_frame_list_list = arch_get_pfn_to_mfn_frame_list_list(dom0); } diff -Naurp xen/arch/x86/debug.c xen-redhat/arch/x86/debug.c --- xen/arch/x86/debug.c +++ xen-redhat/arch/x86/debug.c @@ -0,0 +1,257 @@ +/* + * Copyright (C) 2009, Mukesh Rathor, Oracle Corp. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <xen/config.h> +#include <xen/sched.h> +#include <xen/compile.h> +#include <xen/mm.h> +#include <xen/domain_page.h> +#include <xen/guest_access.h> +#include <asm/p2m.h> + +/* + * This file for general routines common to more than one debugger, like kdb, + * gdbsx, etc.. + */ + +#ifdef DBGDEBUG +#define DBGP1(...) {(DBGDEBUG) ? printk(__VA_ARGS__) : 0;} +#define DBGP2(...) {(DBGDEBUG > 1) ? printk(__VA_ARGS__) : 0;} +#else +#define DBGP1(...) {0;} +#define DBGP2(...) 
{0;} +#endif + +typedef unsigned long dbgva_t; +typedef unsigned char dbgbyte_t; + +/* Returns: mfn for the given (hvm guest) vaddr */ +static unsigned long +dbg_hvm_va2mfn(dbgva_t vaddr, struct domain *dp, int toaddr) +{ + unsigned long mfn, gfn; + + DBGP2("vaddr:%lx domid:%d\n", vaddr, dp->domain_id); + + gfn = paging_gva_to_gfn(dp->vcpu[0], vaddr); + if ( gfn == INVALID_GFN ) + { + DBGP2("kdb:bad gfn from gva_to_gfn\n"); + return INVALID_MFN; + } + + mfn = mfn_x(gfn_to_mfn(dp, gfn)); + + DBGP2("X: vaddr:%lx domid:%d mfn:%lx\n", vaddr, dp->domain_id, mfn); + return mfn; +} + +#if defined(__x86_64__) + +/* + * pgd3val: this is the value of init_mm.pgd[3] in a PV guest. It is optional. + * This is to assist debugging of modules in the guest. The kernel address + * space seems to always be mapped, but modules are not necessarily + * mapped in any arbitrary guest cr3 that we pick if pgd3val is 0. + * Modules should always be addressable if we use cr3 from init_mm. + * Since pgd3val is already a pgd value, cr3->pgd[3], we just need to + * do 2 level lookups. + * + * NOTE: 4 level paging works for 32-bit PAE guests also because cpu runs in IA32-e + * mode. + * Returns: mfn for the given (pv guest) vaddr + */ +static unsigned long +dbg_pv_va2mfn(dbgva_t vaddr, struct domain *dp, uint64_t pgd3val) +{ + l4_pgentry_t l4e, *l4t; + l3_pgentry_t l3e, *l3t; + l2_pgentry_t l2e, *l2t; + l1_pgentry_t l1e, *l1t; + unsigned long cr3 = (pgd3val ? pgd3val : dp->vcpu[0]->arch.cr3); + unsigned long mfn = cr3 >> PAGE_SHIFT; + + DBGP2("vaddr:%lx domid:%d cr3:%lx pgd3:%lx\n", vaddr, dp->domain_id, + cr3, pgd3val); + + if ( pgd3val == 0 ) + { + l4t = mfn_to_virt(mfn); + l4e = l4t[l4_table_offset(vaddr)]; + mfn = l4e_get_pfn(l4e); + DBGP2("l4t:%p l4to:%lx l4e:%"PRIpte" mfn:%lx\n", l4t, + l4_table_offset(vaddr), l4e_get_intpte(l4e), mfn); + if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) + { + DBGP1("l4 PAGE not present. vaddr:%lx cr3:%lx\n", vaddr, cr3); + return INVALID_MFN; + } + + l3t = mfn_to_virt(mfn); + l3e = l3t[l3_table_offset(vaddr)]; + mfn = l3e_get_pfn(l3e); + DBGP2("l3t:%p l3to:%lx l3e:%"PRIpte" mfn:%lx\n", l3t, + l3_table_offset(vaddr), l3e_get_intpte(l3e), mfn); + if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) + { + DBGP1("l3 PAGE not present. vaddr:%lx cr3:%lx\n", vaddr, cr3); + return INVALID_MFN; + } + } + + l2t = mfn_to_virt(mfn); + l2e = l2t[l2_table_offset(vaddr)]; + mfn = l2e_get_pfn(l2e); + DBGP2("l2t:%p l2to:%lx l2e:%"PRIpte" mfn:%lx\n", l2t, l2_table_offset(vaddr), + l2e_get_intpte(l2e), mfn); + if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || + (l2e_get_flags(l2e) & _PAGE_PSE) ) + { + DBGP1("l2 PAGE not present. vaddr:%lx cr3:%lx\n", vaddr, cr3); + return INVALID_MFN; + } + l1t = mfn_to_virt(mfn); + l1e = l1t[l1_table_offset(vaddr)]; + mfn = l1e_get_pfn(l1e); + DBGP2("l1t:%p l1to:%lx l1e:%"PRIpte" mfn:%lx\n", l1t, l1_table_offset(vaddr), + l1e_get_intpte(l1e), mfn); + + return mfn_valid(mfn) ? mfn : INVALID_MFN; +} + +#else + +/* Returns: mfn for the given (pv guest) vaddr */ +static unsigned long +dbg_pv_va2mfn(dbgva_t vaddr, struct domain *dp, uint64_t pgd3val) +{ +#if CONFIG_PAGING_LEVELS >= 3 + l3_pgentry_t l3e, *l3t; +#endif + l2_pgentry_t l2e, *l2t; + l1_pgentry_t l1e, *l1t; + unsigned long cr3 = (pgd3val ?
pgd3val : dp->vcpu[0]->arch.cr3); + unsigned long mfn = cr3 >> PAGE_SHIFT; + + DBGP2("vaddr:%lx domid:%d cr3:%lx pgd3:%lx\n", vaddr, dp->domain_id, + cr3, pgd3val); + +#if CONFIG_PAGING_LEVELS >= 3 + if ( pgd3val == 0 ) + { + l3t = map_domain_page(mfn); + l3t += (cr3 & 0xFE0UL) >> 3; + l3e = l3t[l3_table_offset(vaddr)]; + mfn = l3e_get_pfn(l3e); + unmap_domain_page(l3t); + if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) + return INVALID_MFN; + } +#endif + + l2t = map_domain_page(mfn); + l2e = l2t[l2_table_offset(vaddr)]; + mfn = l2e_get_pfn(l2e); + unmap_domain_page(l2t); + if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || + (l2e_get_flags(l2e) & _PAGE_PSE) ) + return INVALID_MFN; + + l1t = map_domain_page(mfn); + l1e = l1t[l1_table_offset(vaddr)]; + mfn = l1e_get_pfn(l1e); + unmap_domain_page(l1t); + + return mfn_valid(mfn) ? mfn : INVALID_MFN; +} +#endif /* defined(__x86_64__) */ + +/* Returns: number of bytes remaining to be copied */ +static int +dbg_rw_guest_mem(dbgva_t addr, dbgbyte_t *buf, int len, struct domain *dp, + int toaddr, uint64_t pgd3) +{ + while ( len > 0 ) + { + char *va; + unsigned long mfn, pagecnt; + + pagecnt = min_t(long, PAGE_SIZE - (addr & ~PAGE_MASK), len); + + mfn = (dp->is_hvm + ? dbg_hvm_va2mfn(addr, dp, toaddr) + : dbg_pv_va2mfn(addr, dp, pgd3)); + if ( mfn == INVALID_MFN ) + break; + + va = map_domain_page(mfn); + va = va + (addr & (PAGE_SIZE-1)); + + if ( toaddr ) + { + memcpy(va, buf, pagecnt); /* va = buf */ + paging_mark_dirty(dp, mfn); + } + else + { + memcpy(buf, va, pagecnt); /* buf = va */ + } + + unmap_domain_page(va); + + addr += pagecnt; + buf += pagecnt; + len -= pagecnt; + } + + return len; +} + +/* + * addr is hypervisor addr if domid == IDLE_DOMAIN_ID, else it's guest addr + * buf is debugger buffer. + * if toaddr, then addr = buf (write to addr), else buf = addr (rd from guest) + * pgd3: value of init_mm.pgd[3] in guest. see above. + * Returns: number of bytes remaining to be copied. + */ +int +dbg_rw_mem(dbgva_t addr, dbgbyte_t *buf, int len, domid_t domid, int toaddr, + uint64_t pgd3) +{ + struct domain *dp = get_domain_by_id(domid); + int hyp = (domid == IDLE_DOMAIN_ID); + + DBGP2("gmem:addr:%lx buf:%p len:$%d domid:%x toaddr:%x dp:%p\n", + addr, buf, len, domid, toaddr, dp); + if ( hyp ) + { + if ( toaddr ) + len = __copy_to_user((void *)addr, buf, len); + else + len = __copy_from_user(buf, (void *)addr, len); + } + else if ( dp ) + { + if ( !dp->is_dying ) /* make sure guest is still there */ + len= dbg_rw_guest_mem(addr, buf, len, dp, toaddr, pgd3); + put_domain(dp); + } + + DBGP2("gmem:exit:len:$%d\n", len); + return len; +} diff -Naurp xen/arch/x86/dmi_scan.c xen-redhat/arch/x86/dmi_scan.c --- xen/arch/x86/dmi_scan.c +++ xen-redhat/arch/x86/dmi_scan.c @@ -102,23 +102,32 @@ inline static int __init dmi_checksum(u8 return (sum==0); } +int __init dmi_get_table(u32 *base, u32 *len) +{ + u8 buf[15]; + char __iomem *p, *q; + + p = maddr_to_virt(0xF0000); + for (q = p; q < p + 0x10000; q += 16) { + memcpy_fromio(buf, q, 15); + if (memcmp(buf, "_DMI_", 5)==0 && dmi_checksum(buf)) { + *base=buf[11]<<24|buf[10]<<16|buf[9]<<8|buf[8]; + *len=buf[7]<<8|buf[6]; + return 0; + } + } + return -1; +} + static int __init dmi_iterate(void (*decode)(struct dmi_header *)) { u8 buf[15]; char __iomem *p, *q; - /* - * no iounmap() for that ioremap(); it would be a no-op, but it's - * so early in setup that sucker gets confused into doing what - * it shouldn't if we actually call it. 
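The copy loop in dbg_rw_guest_mem() above deliberately never crosses a page boundary in one step: each pass translates the current virtual address and copies at most up to the end of that page. The chunking arithmetic in isolation (4K pages assumed, toy values, no actual mapping or copying):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
    unsigned long addr = 0x10ff0;   /* starts 16 bytes before a page boundary */
    unsigned long len  = 6000;

    while (len > 0) {
        /* bytes left in the current page, clamped to what remains */
        unsigned long pagecnt = PAGE_SIZE - (addr & ~PAGE_MASK);
        if (pagecnt > len)
            pagecnt = len;

        printf("copy %5lu bytes at 0x%lx\n", pagecnt, addr);
        /* here the real code looks up the mfn for this page and memcpy()s */

        addr += pagecnt;
        len  -= pagecnt;
    }
    return 0;
}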
- */ - p = ioremap(0xF0000, 0x10000); - if (p == NULL) - return -1; + p = maddr_to_virt(0xF0000); for (q = p; q < p + 0x10000; q += 16) { memcpy_fromio(buf, q, 15); - if(memcmp(buf, "_DMI_", 5)==0 && dmi_checksum(buf)) - { + if (memcmp(buf, "_DMI_", 5)==0 && dmi_checksum(buf)) { u16 num=buf[13]<<8|buf[12]; u16 len=buf[7]<<8|buf[6]; u32 base=buf[11]<<24|buf[10]<<16|buf[9]<<8|buf[8]; diff -Naurp xen/arch/x86/domain_build.c xen-redhat/arch/x86/domain_build.c --- xen/arch/x86/domain_build.c +++ xen-redhat/arch/x86/domain_build.c @@ -35,7 +35,13 @@ extern void discard_initial_images(void) static long __initdata dom0_nrpages; static long __initdata dom0_min_nrpages; -static long __initdata dom0_max_nrpages = LONG_MAX; +/* + * Limit dom0 memory allocation to 32GB. This should be large + * enough for anything, yet small enough that on the very largest + * NUMA systems we do not waste too much memory on dom0 bookkeeping + * and keep dom0 memory mostly on one node. + */ +static long __initdata dom0_max_nrpages = 32L << (30 - PAGE_SHIFT); /* * dom0_mem=[min:<min_amt>,][max:<max_amt>,][<amt>] @@ -132,13 +138,16 @@ static unsigned long __init compute_dom0 /* * If domain 0 allocation isn't specified, reserve 1/16th of available * memory for things like DMA buffers. This reservation is clamped to - * a maximum of 128MB. + * a maximum of 384MB. */ if ( dom0_nrpages == 0 ) { dom0_nrpages = avail; - dom0_nrpages = min(dom0_nrpages / 16, 128L << (20 - PAGE_SHIFT)); + dom0_nrpages = min(dom0_nrpages / 8, 384L << (20 - PAGE_SHIFT)); dom0_nrpages = -dom0_nrpages; + } else { + /* User specified a dom0_size. Do not clamp the maximum. */ + dom0_max_nrpages = LONG_MAX; } /* Negative memory specification means "all memory - specified amount". */ @@ -326,24 +335,11 @@ int __init construct_dom0( #ifdef CONFIG_COMPAT if ( compat32 ) { - l1_pgentry_t gdt_l1e; - d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1; v->vcpu_info = (void *)&d->shared_info->compat.vcpu_info[0]; if ( nr_pages != (unsigned int)nr_pages ) nr_pages = UINT_MAX; - - /* - * Map compatibility Xen segments into every VCPU's GDT. See - * arch_domain_create() for further comments. - */ - gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table), - PAGE_HYPERVISOR); - for ( i = 0; i < MAX_VIRT_CPUS; i++ ) - d->arch.mm_perdomain_pt[((i << GDT_LDT_VCPU_SHIFT) + - FIRST_RESERVED_GDT_PAGE)] = gdt_l1e; - local_flush_tlb_one(GDT_LDT_VIRT_START + FIRST_RESERVED_GDT_BYTE); } #endif if ( parms.pae == PAEKERN_extended_cr3 ) @@ -363,9 +359,9 @@ int __init construct_dom0( #ifdef CONFIG_COMPAT HYPERVISOR_COMPAT_VIRT_START(d) = max_t(unsigned int, m2p_compat_vstart, value); - d->arch.physaddr_bitsize = !is_pv_32on64_domain(d) ? 64 : - fls((1UL << 32) - HYPERVISOR_COMPAT_VIRT_START(d)) - 1 - + (PAGE_SIZE - 2); + + domain_set_alloc_bitsize(d); + if ( value > (!is_pv_32on64_domain(d) ? HYPERVISOR_VIRT_START : __HYPERVISOR_COMPAT_VIRT_START) ) @@ -772,16 +768,22 @@ int __init construct_dom0( if ( opt_dom0_max_vcpus == 0 ) opt_dom0_max_vcpus = num_online_cpus(); - if ( opt_dom0_max_vcpus > num_online_cpus() ) - opt_dom0_max_vcpus = num_online_cpus(); if ( opt_dom0_max_vcpus > MAX_VIRT_CPUS ) opt_dom0_max_vcpus = MAX_VIRT_CPUS; - if ( opt_dom0_max_vcpus > BITS_PER_GUEST_LONG(d) ) - opt_dom0_max_vcpus = BITS_PER_GUEST_LONG(d); printk("Dom0 has maximum %u VCPUs\n", opt_dom0_max_vcpus); + /* + * If dom0 has fewer VCPUs than there are physical CPUs on the system, + * we need to disable cpu frequency scaling. 
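To make the dom0 sizing change above concrete: with no dom0_mem= option, min(avail/8, 384MB) is reserved for Xen and DMA buffers, dom0 gets the remainder, and the new 32GB default cap is applied on top; specifying dom0_mem removes that cap. A quick stand-alone check of the arithmetic (4K pages assumed, sample 16GB machine):

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
    long avail = 16L << (30 - PAGE_SHIFT);            /* 16GB worth of 4K pages */
    long max_nrpages = 32L << (30 - PAGE_SHIFT);      /* new default cap: 32GB */

    long reserve = avail / 8;
    long clamp = 384L << (20 - PAGE_SHIFT);           /* at most 384MB reserved */
    if (reserve > clamp)
        reserve = clamp;

    long dom0 = avail - reserve;                      /* "all memory minus reserve" */
    if (dom0 > max_nrpages)
        dom0 = max_nrpages;

    printf("avail %ld pages -> dom0 %ld pages (%ld MB)\n",
           avail, dom0, dom0 >> (20 - PAGE_SHIFT));
    return 0;
}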
+ */ + if ( opt_dom0_max_vcpus != num_online_cpus() ) { + extern unsigned int opt_dom0_vcpus_pin; + cpufreq_controller = FREQCTL_none; + opt_dom0_vcpus_pin = 0; + } + for ( i = 1; i < opt_dom0_max_vcpus; i++ ) - (void)alloc_vcpu(d, i, i); + (void)alloc_vcpu(d, i, i % num_online_cpus()); /* Set up CR3 value for write_ptbase */ if ( paging_mode_enabled(v->domain) ) @@ -966,6 +968,8 @@ int __init construct_dom0( rc |= ioports_deny_access(dom0, 0x40, 0x43); /* PIT Channel 2 / PC Speaker Control. */ rc |= ioports_deny_access(dom0, 0x61, 0x61); + /* PCI configuration space (NB. 0xcf8 has special treatment). */ + rc |= ioports_deny_access(dom0, 0xcfc, 0xcff); /* Command-line I/O ranges. */ process_dom0_ioports_disable(); diff -Naurp xen/arch/x86/domain.c xen-redhat/arch/x86/domain.c --- xen/arch/x86/domain.c +++ xen-redhat/arch/x86/domain.c @@ -29,6 +29,7 @@ #include <xen/console.h> #include <xen/percpu.h> #include <xen/compat.h> +#include <xen/acpi.h> #include <asm/regs.h> #include <asm/mc146818rtc.h> #include <asm/system.h> @@ -43,14 +44,14 @@ #include <asm/hvm/hvm.h> #include <asm/hvm/support.h> #include <asm/msr.h> +#include <xen/iommu.h> #ifdef CONFIG_COMPAT #include <compat/vcpu.h> #endif DEFINE_PER_CPU(struct vcpu *, curr_vcpu); -DEFINE_PER_CPU(__u64, efer); - -static void unmap_vcpu_info(struct vcpu *v); +DEFINE_PER_CPU(u64, efer); +DEFINE_PER_CPU(unsigned long, cr4); static void paravirt_ctxt_switch_from(struct vcpu *v); static void paravirt_ctxt_switch_to(struct vcpu *v); @@ -80,7 +81,6 @@ void idle_loop(void) { for ( ; ; ) { - page_scrub_schedule_work(); default_idle(); do_softirq(); } @@ -266,6 +266,18 @@ static void release_compat_l4(struct vcp v->arch.guest_table_user = pagetable_null(); } +void domain_set_alloc_bitsize(struct domain *d) +{ + if ( !is_pv_32on64_domain(d) || + (MACH2PHYS_COMPAT_NR_ENTRIES(d) >= max_page) ) + return; + d->arch.physaddr_bitsize = + /* 2^n entries can be contained in guest's p2m mapping space */ + fls(MACH2PHYS_COMPAT_NR_ENTRIES(d)) - 1 + /* 2^n pages -> 2^(n+PAGE_SHIFT) bits */ + + PAGE_SHIFT; +} + static inline int may_switch_mode(struct domain *d) { return (!is_hvm_domain(d) && (d->tot_pages == 0)); @@ -273,7 +285,6 @@ static inline int may_switch_mode(struct int switch_native(struct domain *d) { - l1_pgentry_t gdt_l1e; unsigned int vcpuid; if ( d == NULL ) @@ -286,24 +297,17 @@ int switch_native(struct domain *d) d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0; release_arg_xlat_area(d); - /* switch gdt */ - gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR); for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ ) { - d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) + - FIRST_RESERVED_GDT_PAGE)] = gdt_l1e; if (d->vcpu[vcpuid]) release_compat_l4(d->vcpu[vcpuid]); } - d->arch.physaddr_bitsize = 64; - return 0; } int switch_compat(struct domain *d) { - l1_pgentry_t gdt_l1e; unsigned int vcpuid; if ( d == NULL ) @@ -315,33 +319,23 @@ int switch_compat(struct domain *d) d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1; - /* switch gdt */ - gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table), PAGE_HYPERVISOR); for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ ) { if ( (d->vcpu[vcpuid] != NULL) && (setup_compat_l4(d->vcpu[vcpuid]) != 0) ) goto undo_and_fail; - d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) + - FIRST_RESERVED_GDT_PAGE)] = gdt_l1e; } - d->arch.physaddr_bitsize = - fls((1UL << 32) - HYPERVISOR_COMPAT_VIRT_START(d)) - 1 - + (PAGE_SIZE - 2); + domain_set_alloc_bitsize(d); return 0; undo_and_fail: 
d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0; - release_arg_xlat_area(d); - gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR); while ( vcpuid-- != 0 ) { if ( d->vcpu[vcpuid] != NULL ) release_compat_l4(d->vcpu[vcpuid]); - d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) + - FIRST_RESERVED_GDT_PAGE)] = gdt_l1e; } return -ENOMEM; } @@ -388,6 +382,9 @@ int vcpu_initialise(struct vcpu *v) v->arch.schedule_tail = continue_idle_domain; v->arch.cr3 = __pa(idle_pg_table); } + + v->arch.guest_context.ctrlreg[4] = + real_cr4_to_pv_guest_cr4(mmu_cr4_features); } v->arch.perdomain_ptes = @@ -401,8 +398,6 @@ void vcpu_destroy(struct vcpu *v) if ( is_pv_32on64_vcpu(v) ) release_compat_l4(v); - unmap_vcpu_info(v); - if ( is_hvm_vcpu(v) ) hvm_vcpu_destroy(v); } @@ -413,35 +408,28 @@ int arch_domain_create(struct domain *d) struct page_info *pg; int i; #endif - l1_pgentry_t gdt_l1e; - int vcpuid, pdpt_order; + int pdpt_order; int rc = -ENOMEM; + INIT_LIST_HEAD(&d->arch.pdev_list); + + d->arch.relmem = RELMEM_not_started; + INIT_LIST_HEAD(&d->arch.relmem_list); + pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)); d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order); if ( d->arch.mm_perdomain_pt == NULL ) goto fail; memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order); - /* - * Map Xen segments into every VCPU's GDT, irrespective of whether every - * VCPU will actually be used. This avoids an NMI race during context - * switch: if we take an interrupt after switching CR3 but before switching - * GDT, and the old VCPU# is invalid in the new domain, we would otherwise - * try to load CS from an invalid table. - */ - gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR); - for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ ) - d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) + - FIRST_RESERVED_GDT_PAGE)] = gdt_l1e; - #if defined(__i386__) mapcache_init(d); #else /* __x86_64__ */ - if ( (pg = alloc_domheap_page(NULL)) == NULL ) + pg = alloc_domheap_page(NULL); + if (pg == NULL) goto fail; d->arch.mm_perdomain_l2 = page_to_virt(pg); clear_page(d->arch.mm_perdomain_l2); @@ -450,7 +438,8 @@ int arch_domain_create(struct domain *d) l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i, __PAGE_HYPERVISOR); - if ( (pg = alloc_domheap_page(NULL)) == NULL ) + pg = alloc_domheap_page(NULL); + if ( pg == NULL ) goto fail; d->arch.mm_perdomain_l3 = page_to_virt(pg); clear_page(d->arch.mm_perdomain_l3); @@ -470,6 +459,7 @@ int arch_domain_create(struct domain *d) { d->arch.ioport_caps = rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex); + rc = -ENOMEM; if ( d->arch.ioport_caps == NULL ) goto fail; @@ -479,12 +469,18 @@ int arch_domain_create(struct domain *d) clear_page(d->shared_info); share_xen_page_with_guest( virt_to_page(d->shared_info), d, XENSHARE_writable); + + if ( (rc = iommu_domain_init(d)) != 0 ) + goto fail; } if ( is_hvm_domain(d) ) { if ( (rc = hvm_domain_initialise(d)) != 0 ) + { + iommu_domain_destroy(d); goto fail; + } } else { @@ -492,7 +488,6 @@ int arch_domain_create(struct domain *d) d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = (CONFIG_PAGING_LEVELS != 4); } - return 0; @@ -513,6 +508,11 @@ void arch_domain_destroy(struct domain * if ( is_hvm_domain(d) ) hvm_domain_destroy(d); + pci_release_devices(d); + free_domain_pirqs(d); + if ( !is_idle_domain(d) ) + iommu_domain_destroy(d); + paging_final_teardown(d); free_xenheap_pages( @@ -530,13 +530,29 @@ void arch_domain_destroy(struct domain * 
free_xenheap_page(d->shared_info); } +unsigned long pv_guest_cr4_fixup(unsigned long guest_cr4) +{ + unsigned long hv_cr4_mask, hv_cr4 = real_cr4_to_pv_guest_cr4(read_cr4()); + + hv_cr4_mask = ~X86_CR4_TSD; + if ( cpu_has_de ) + hv_cr4_mask &= ~X86_CR4_DE; + + if ( (guest_cr4 & hv_cr4_mask) != (hv_cr4 & hv_cr4_mask) ) + gdprintk(XENLOG_WARNING, + "Attempt to change CR4 flags %08lx -> %08lx\n", + hv_cr4 & ~(X86_CR4_PGE|X86_CR4_PSE), guest_cr4); + + return (hv_cr4 & hv_cr4_mask) | (guest_cr4 & ~hv_cr4_mask); +} + /* This is called by arch_final_setup_guest and do_boot_vcpu */ int arch_set_info_guest( struct vcpu *v, vcpu_guest_context_u c) { struct domain *d = v->domain; unsigned long cr3_pfn = INVALID_MFN; - unsigned long flags; + unsigned long flags, cr4; int i, rc = 0, compat; /* The context is a compat-mode one if the target domain is compat-mode; @@ -625,10 +641,28 @@ int arch_set_info_guest( /* Ensure real hardware interrupts are enabled. */ v->arch.guest_context.user_regs.eflags |= EF_IE; + + cr4 = v->arch.guest_context.ctrlreg[4]; + v->arch.guest_context.ctrlreg[4] = cr4 ? pv_guest_cr4_fixup(cr4) : + real_cr4_to_pv_guest_cr4(mmu_cr4_features); + } else { + u32* ident_pt; + hvm_load_cpu_guest_regs(v, &v->arch.guest_context.user_regs); + /* Fill it with 32-bit, non-PAE superpage entries, each mapping 4MB + * of virtual address space onto the same physical address range */ + if ( v->vcpu_id == 0 ) + { + ident_pt = map_domain_page(mfn_x(gfn_to_mfn(v->domain, + (HVM_IDENT_PT_PAGE >> PAGE_SHIFT)))); + for ( i = 0; i < PAGE_SIZE / sizeof(*ident_pt); i++ ) + ident_pt[i] = (i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER + | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE; + unmap_domain_page(ident_pt); + } } memset(v->arch.guest_context.debugreg, 0, @@ -701,6 +735,11 @@ int arch_set_info_guest( v->arch.guest_table_user = pagetable_from_pfn(cr3_pfn); } + else if ( !(flags & VGCF_in_kernel) ) + { + destroy_gdt(v); + return -EINVAL; + } #endif } #ifdef CONFIG_COMPAT @@ -827,7 +866,7 @@ map_vcpu_info(struct vcpu *v, unsigned l * lost. The domain will get a spurious event, but it can cope. 
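In map_vcpu_info() above, events raised while the vcpu_info page was being remapped may have been lost, so the code pessimistically marks an upcall pending and sets every bit of the per-vcpu selector word, forcing the guest to rescan all event-channel groups; the hunk only changes the loop bound from the guest's long size to the event word width. A toy version of that "set everything and let the guest rescan" step (field names and width are illustrative):

#include <stdint.h>
#include <stdio.h>

#define BITS_PER_EVTCHN_WORD 64   /* assumed width of the selector word */

struct toy_vcpu_info {
    uint8_t  evtchn_upcall_pending;
    uint64_t evtchn_pending_sel;   /* one bit per group of event channels */
};

int main(void)
{
    struct toy_vcpu_info vi = { 0, 0 };
    int i;

    /* Events may have been lost: force a spurious-but-safe full rescan. */
    vi.evtchn_upcall_pending = 1;
    for (i = 0; i < BITS_PER_EVTCHN_WORD; i++)
        vi.evtchn_pending_sel |= 1ULL << i;

    printf("pending=%u selector=0x%016llx\n",
           vi.evtchn_upcall_pending,
           (unsigned long long)vi.evtchn_pending_sel);
    return 0;
}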
*/ vcpu_info(v, evtchn_upcall_pending) = 1; - for ( i = 0; i < BITS_PER_GUEST_LONG(d); i++ ) + for ( i = 0; i < BITS_PER_EVTCHN_WORD(d); i++ ) set_bit(i, vcpu_info_addr(v, evtchn_pending_sel)); /* @@ -893,6 +932,25 @@ arch_do_vcpu_op( break; } + case VCPUOP_get_physid: + { + struct vcpu_get_physid cpu_id; + + rc = -EINVAL; + if ( !v->domain->is_pinned ) + break; + + cpu_id.phys_id = (x86_cpu_to_apicid[v->vcpu_id] | + (acpi_get_processor_id(v->vcpu_id) << 8)); + + rc = -EFAULT; + if ( copy_to_guest(arg, &cpu_id, 1) ) + break; + + rc = 0; + break; + } + default: rc = -ENOSYS; break; @@ -1169,9 +1227,15 @@ static void paravirt_ctxt_switch_from(st static void paravirt_ctxt_switch_to(struct vcpu *v) { + unsigned long cr4; + set_int80_direct_trap(v); switch_kernel_stack(v); + cr4 = pv_guest_cr4_to_real_cr4(v->arch.guest_context.ctrlreg[4]); + if ( unlikely(cr4 != read_cr4()) ) + write_cr4(cr4); + if ( unlikely(v->arch.guest_context.debugreg[7]) ) { write_debugreg(0, v->arch.guest_context.debugreg[0]); @@ -1183,12 +1247,19 @@ static void paravirt_ctxt_switch_to(stru } } +static inline int need_full_gdt(struct vcpu *v) +{ + return (!is_hvm_vcpu(v) && !is_idle_vcpu(v)); +} + static void __context_switch(void) { struct cpu_user_regs *stack_regs = guest_cpu_user_regs(); unsigned int cpu = smp_processor_id(); struct vcpu *p = per_cpu(curr_vcpu, cpu); struct vcpu *n = current; + struct desc_struct *gdt; + struct desc_ptr gdt_desc; ASSERT(p != n); ASSERT(cpus_empty(n->vcpu_dirty_cpumask)); @@ -1214,14 +1285,35 @@ static void __context_switch(void) cpu_set(cpu, n->domain->domain_dirty_cpumask); cpu_set(cpu, n->vcpu_dirty_cpumask); + gdt = !is_pv_32on64_vcpu(n) ? per_cpu(gdt_table, cpu) : + per_cpu(compat_gdt_table, cpu); + if ( need_full_gdt(n) ) + { + struct page_info *page = virt_to_page(gdt); + unsigned int i; + for ( i = 0; i < NR_RESERVED_GDT_PAGES; i++ ) + l1e_write(n->domain->arch.mm_perdomain_pt + + (n->vcpu_id << GDT_LDT_VCPU_SHIFT) + + FIRST_RESERVED_GDT_PAGE + i, + l1e_from_page(page + i, __PAGE_HYPERVISOR)); + } + + if ( need_full_gdt(p) && + ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(n)) ) + { + gdt_desc.limit = LAST_RESERVED_GDT_BYTE; + gdt_desc.base = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY); + asm volatile ( "lgdt %0" : : "m" (gdt_desc) ); + } + write_ptbase(n); - if ( p->vcpu_id != n->vcpu_id ) + if ( need_full_gdt(n) && + ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(p)) ) { - char gdt_load[10]; - *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE; - *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n); - __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) ); + gdt_desc.limit = LAST_RESERVED_GDT_BYTE; + gdt_desc.base = GDT_VIRT_START(n); + asm volatile ( "lgdt %0" : : "m" (gdt_desc) ); } if ( p->domain != n->domain ) @@ -1251,7 +1343,7 @@ void context_switch(struct vcpu *prev, s local_irq_disable(); if ( is_hvm_vcpu(prev) && !list_empty(&prev->arch.hvm_vcpu.tm_list) ) - pt_freeze_time(prev); + pt_save_timer(prev); set_current(next); @@ -1271,9 +1363,6 @@ void context_switch(struct vcpu *prev, s { uint64_t efer = read_efer(); - local_flush_tlb_one(GDT_VIRT_START(next) + - FIRST_RESERVED_GDT_BYTE); - if ( !is_pv_32on64_vcpu(next) == !(efer & EFER_SCE) ) write_efer(efer ^ EFER_SCE); } @@ -1346,6 +1435,65 @@ void sync_vcpu_execstate(struct vcpu *v) flush_tlb_mask(v->vcpu_dirty_cpumask); } +struct migrate_info { + long (*func)(void *data); + void *data; + void (*saved_schedule_tail)(struct vcpu *); + cpumask_t saved_affinity; +}; + +static void 
continue_hypercall_on_cpu_helper(struct vcpu *v) +{ + struct cpu_user_regs *regs = guest_cpu_user_regs(); + struct migrate_info *info = v->arch.continue_info; + cpumask_t mask = info->saved_affinity; + + regs->eax = info->func(info->data); + + v->arch.schedule_tail = info->saved_schedule_tail; + v->arch.continue_info = NULL; + + xfree(info); + + vcpu_unlock_affinity(v, &mask); + schedule_tail(v); +} + +int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data) +{ + struct vcpu *v = current; + struct migrate_info *info; + int rc; + + if ( cpu == smp_processor_id() ) + return func(data); + + info = xmalloc(struct migrate_info); + if ( info == NULL ) + return -ENOMEM; + + info->func = func; + info->data = data; + info->saved_schedule_tail = v->arch.schedule_tail; + info->saved_affinity = cpumask_of_cpu(cpu); + + v->arch.schedule_tail = continue_hypercall_on_cpu_helper; + v->arch.continue_info = info; + + rc = vcpu_lock_affinity(v, &info->saved_affinity); + if ( rc ) + { + v->arch.schedule_tail = info->saved_schedule_tail; + v->arch.continue_info = NULL; + xfree(info); + return rc; + } + + /* Dummy return value will be overwritten by new schedule_tail. */ + BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id()))); + return 0; +} + #define next_arg(fmt, args) ({ \ unsigned long __arg; \ switch ( *(fmt)++ ) \ @@ -1529,12 +1677,13 @@ int hypercall_xlat_continuation(unsigned } #endif -static void relinquish_memory(struct domain *d, struct list_head *list, +static int relinquish_memory(struct domain *d, struct list_head *list, unsigned long type) { struct list_head *ent; struct page_info *page; unsigned long x, y; + int ret = 0; /* Use a recursive lock, as we may enter 'free_domheap_page'. */ spin_lock_recursive(&d->page_alloc_lock); @@ -1549,44 +1698,98 @@ static void relinquish_memory(struct dom { /* Couldn't get a reference -- someone is freeing this page. */ ent = ent->next; + list_move_tail(&page->list, &d->arch.relmem_list); continue; } if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) ) - put_page_and_type(page); + ret = put_page_and_type_preemptible(page, 1); + switch ( ret ) + { + case 0: + break; + case -EAGAIN: + case -EINTR: + list_move(&page->list, list); + set_bit(_PGT_pinned, &page->u.inuse.type_info); + put_page(page); + goto out; + default: + BUG(); + } if ( test_and_clear_bit(_PGC_allocated, &page->count_info) ) put_page(page); /* * Forcibly invalidate top-most, still valid page tables at this point - * to break circular 'linear page table' references. This is okay - * because MMU structures are not shared across domains and this domain - * is now dead. Thus top-most valid tables are not in use so a non-zero - * count means circular reference. + * to break circular 'linear page table' references as well as clean up + * partially validated pages. This is okay because MMU structures are + * not shared across domains and this domain is now dead. Thus top-most + * valid tables are not in use so a non-zero count means circular + * reference or partially validated. 
*/ y = page->u.inuse.type_info; for ( ; ; ) { x = y; - if ( likely((x & (PGT_type_mask|PGT_validated)) != - (type|PGT_validated)) ) + if ( likely((x & PGT_type_mask) != type) || + likely(!(x & (PGT_validated|PGT_partial))) ) break; - y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated); + y = cmpxchg(&page->u.inuse.type_info, x, + x & ~(PGT_validated|PGT_partial)); if ( likely(y == x) ) { - free_page_type(page, type); + /* No need for atomic update of type_info here: noone else updates it. */ + switch ( ret = free_page_type(page, x, 1) ) + { + case 0: + break; + case -EINTR: + list_move(&page->list, list); + page->u.inuse.type_info |= PGT_validated; + if ( x & PGT_partial ) + put_page(page); + put_page(page); + ret = -EAGAIN; + goto out; + case -EAGAIN: + list_move(&page->list, list); + page->u.inuse.type_info |= PGT_partial; + if ( x & PGT_partial ) + put_page(page); + goto out; + default: + BUG(); + } + if ( x & PGT_partial ) + { + page->u.inuse.type_info--; + put_page(page); + } break; } } - /* Follow the list chain and /then/ potentially free the page. */ + /* Put the page on the list and /then/ potentially free it. */ ent = ent->next; + list_move_tail(&page->list, &d->arch.relmem_list); put_page(page); + + if ( hypercall_preempt_check() ) + { + ret = -EAGAIN; + goto out; + } } + /* list is empty at this point. */ + list_splice_init(&d->arch.relmem_list, list); + + out: spin_unlock_recursive(&d->page_alloc_lock); + return ret; } static void vcpu_destroy_pagetables(struct vcpu *v) @@ -1624,10 +1827,6 @@ static void vcpu_destroy_pagetables(stru put_page(mfn_to_page(pfn)); else put_page_and_type(mfn_to_page(pfn)); -#ifdef __x86_64__ - if ( pfn == pagetable_get_pfn(v->arch.guest_table_user) ) - v->arch.guest_table_user = pagetable_null(); -#endif v->arch.guest_table = pagetable_null(); } @@ -1636,10 +1835,13 @@ static void vcpu_destroy_pagetables(stru pfn = pagetable_get_pfn(v->arch.guest_table_user); if ( pfn != 0 ) { - if ( paging_mode_refcounts(d) ) - put_page(mfn_to_page(pfn)); - else - put_page_and_type(mfn_to_page(pfn)); + if ( !is_pv_32bit_vcpu(v) ) + { + if ( paging_mode_refcounts(d) ) + put_page(mfn_to_page(pfn)); + else + put_page_and_type(mfn_to_page(pfn)); + } v->arch.guest_table_user = pagetable_null(); } #endif @@ -1647,43 +1849,83 @@ static void vcpu_destroy_pagetables(stru v->arch.cr3 = 0; } -void domain_relinquish_resources(struct domain *d) +int domain_relinquish_resources(struct domain *d) { + int ret; struct vcpu *v; BUG_ON(!cpus_empty(d->domain_dirty_cpumask)); - /* Drop the in-use references to page-table bases. */ - for_each_vcpu ( d, v ) - vcpu_destroy_pagetables(v); - - /* Tear down paging-assistance stuff. */ - paging_teardown(d); - - /* - * Relinquish GDT mappings. No need for explicit unmapping of the LDT as - * it automatically gets squashed when the guest's mappings go away. - */ - for_each_vcpu(d, v) - destroy_gdt(v); - - /* Relinquish every page of memory. */ + switch ( d->arch.relmem ) + { + case RELMEM_not_started: + /* Tear down paging-assistance stuff. */ + paging_teardown(d); + + for_each_vcpu ( d, v ) + { + /* Drop the in-use references to page-table bases. */ + vcpu_destroy_pagetables(v); + + /* + * Relinquish GDT mappings. No need for explicit unmapping of the + * LDT as it automatically gets squashed with the guest mappings. + */ + destroy_gdt(v); + + unmap_vcpu_info(v); + } + + d->arch.relmem = RELMEM_xen; + /* fallthrough */ + + /* Relinquish every page of memory. 
*/ + case RELMEM_xen: + ret = relinquish_memory(d, &d->xenpage_list, ~0UL); + if ( ret ) + return ret; #if CONFIG_PAGING_LEVELS >= 4 - relinquish_memory(d, &d->xenpage_list, PGT_l4_page_table); - relinquish_memory(d, &d->page_list, PGT_l4_page_table); + d->arch.relmem = RELMEM_l4; + /* fallthrough */ + + case RELMEM_l4: + ret = relinquish_memory(d, &d->page_list, PGT_l4_page_table); + if ( ret ) + return ret; #endif #if CONFIG_PAGING_LEVELS >= 3 - relinquish_memory(d, &d->xenpage_list, PGT_l3_page_table); - relinquish_memory(d, &d->page_list, PGT_l3_page_table); -#endif - relinquish_memory(d, &d->xenpage_list, PGT_l2_page_table); - relinquish_memory(d, &d->page_list, PGT_l2_page_table); + d->arch.relmem = RELMEM_l3; + /* fallthrough */ + + case RELMEM_l3: + ret = relinquish_memory(d, &d->page_list, PGT_l3_page_table); + if ( ret ) + return ret; +#endif + d->arch.relmem = RELMEM_l2; + /* fallthrough */ + + case RELMEM_l2: + ret = relinquish_memory(d, &d->page_list, PGT_l2_page_table); + if ( ret ) + return ret; + d->arch.relmem = RELMEM_done; + /* fallthrough */ + + case RELMEM_done: + break; + + default: + BUG(); + } /* Free page used by xen oprofile buffer. */ free_xenoprof_pages(d); if ( is_hvm_domain(d) ) hvm_domain_relinquish_resources(d); + + return 0; } void arch_dump_domain_info(struct domain *d) diff -Naurp xen/arch/x86/domctl.c xen-redhat/arch/x86/domctl.c --- xen/arch/x86/domctl.c +++ xen-redhat/arch/x86/domctl.c @@ -24,6 +24,44 @@ #include <asm/hvm/hvm.h> #include <asm/hvm/support.h> #include <asm/processor.h> +#include <xen/iommu.h> + +typedef unsigned long kdbva_t; +typedef unsigned char kdbbyt_t; +extern int dbg_rw_mem(kdbva_t, kdbbyt_t *, int, domid_t, int, uint64_t); + +static int +gdbsx_guest_mem_io(domid_t domid, struct xen_domctl_gdbsx_memio *iop) +{ + ulong l_uva = (ulong)iop->uva; + iop->remain = dbg_rw_mem( + (kdbva_t)iop->gva, (kdbbyt_t *)l_uva, iop->len, domid, + iop->gwr, iop->pgd3val); + return (iop->remain ? -EFAULT : 0); +} + +/* On success, this will have added a reference to the domain. 
*/ +static int prep_assign_device(struct domain **d, u8 *bus, u8 *devfn, + const struct xen_domctl *domctl, const char *domctl_name) +{ + struct domain *tmp; + + if ( !iommu_enabled ) + return -ENOSYS; + + tmp = get_domain_by_id(domctl->domain); + + if ( unlikely(tmp == NULL) ) + { + gdprintk(XENLOG_ERR, "%s: get_domain_by_id() failed\n", domctl_name); + return -EINVAL; + } + + *d = tmp; + *bus = (domctl->u.assign_device.machine_bdf >> 16) & 0xff; + *devfn = (domctl->u.assign_device.machine_bdf >> 8) & 0xff; + return 0; +} long arch_do_domctl( struct xen_domctl *domctl, @@ -230,10 +268,14 @@ long arch_do_domctl( ret = -EINVAL; if ( d != NULL ) { - ret = 0; - spin_lock(&d->page_alloc_lock); + if ( unlikely(d->is_dying) ) { + spin_unlock(&d->page_alloc_lock); + goto getmemlist_out; + } + + ret = 0; list_ent = d->page_list.next; for ( i = 0; (i < max_pfns) && (list_ent != &d->page_list); i++ ) { @@ -253,6 +295,7 @@ long arch_do_domctl( domctl->u.getmemlist.num_pfns = i; copy_to_guest(u_domctl, domctl, 1); + getmemlist_out: rcu_unlock_domain(d); } } @@ -382,6 +425,29 @@ long arch_do_domctl( } break; + case XEN_DOMCTL_gethvmcontext_partial: + { + struct domain *d; + + ret = -ESRCH; + if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL ) + break; + + ret = -EINVAL; + if ( !is_hvm_domain(d) ) + goto gethvmcontext_partial_out; + + domain_pause(d); + ret = hvm_save_one(d, domctl->u.hvmcontext_partial.type, + domctl->u.hvmcontext_partial.instance, + domctl->u.hvmcontext_partial.buffer); + domain_unpause(d); + + gethvmcontext_partial_out: + rcu_unlock_domain(d); + } + break; + case XEN_DOMCTL_set_address_size: { struct domain *d; @@ -417,13 +483,427 @@ long arch_do_domctl( if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL ) break; - domctl->u.address_size.size = BITS_PER_GUEST_LONG(d); + domctl->u.address_size.size = + is_pv_32on64_domain(d) ? 
32 : BITS_PER_LONG; + + ret = 0; + rcu_unlock_domain(d); + + if ( copy_to_guest(u_domctl, domctl, 1) ) + ret = -EFAULT; + } + break; + + case XEN_DOMCTL_get_device_group: + { + struct domain *d; + u32 max_sdevs; + u8 bus, devfn; + XEN_GUEST_HANDLE_64(uint32_t) sdevs; + int num_sdevs; + + ret = -ENOSYS; + if ( !iommu_enabled ) + break; + + ret = -EINVAL; + if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL ) + break; + + bus = (domctl->u.get_device_group.machine_bdf >> 16) & 0xff; + devfn = (domctl->u.get_device_group.machine_bdf >> 8) & 0xff; + max_sdevs = domctl->u.get_device_group.max_sdevs; + sdevs = domctl->u.get_device_group.sdev_array; + + num_sdevs = iommu_get_device_group(d, bus, devfn, sdevs, max_sdevs); + if ( num_sdevs < 0 ) + { + dprintk(XENLOG_ERR, "iommu_get_device_group() failed!\n"); + ret = -EFAULT; + domctl->u.get_device_group.num_sdevs = 0; + } + else + { + ret = 0; + domctl->u.get_device_group.num_sdevs = num_sdevs; + } + if ( copy_to_guest(u_domctl, domctl, 1) ) + ret = -EFAULT; + rcu_unlock_domain(d); + } + break; + + case XEN_DOMCTL_test_assign_device: + { + struct domain *d; + u8 bus, devfn; + + ret = prep_assign_device(&d, &bus, &devfn, domctl, + "XEN_DOMCTL_test_assign_device"); + if ( ret ) + break; + + ret = device_assignable(d, bus, devfn); + if ( ret ) + gdprintk(XENLOG_ERR, "XEN_DOMCTL_test_assign_device: " + "%x:%x:%x already assigned, or non-existent, or denied\n", + bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + + put_domain(d); + } + break; + + case XEN_DOMCTL_assign_device: + { + struct domain *d; + u8 bus, devfn; + + ret = prep_assign_device(&d, &bus, &devfn, domctl, + "XEN_DOMCTL_assign_device"); + if ( ret ) + break; + + ret = assign_device(d, bus, devfn); + if ( ret ) + gdprintk(XENLOG_ERR, "XEN_DOMCTL_assign_device: " + "assign device (%x:%x:%x) failed\n", + bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + + put_domain(d); + } + break; + + case XEN_DOMCTL_deassign_device: + { + struct domain *d; + u8 bus, devfn; + + ret = -ENOSYS; + if ( !iommu_enabled ) + break; + + ret = -EINVAL; + if ( unlikely((d = get_domain_by_id(domctl->domain)) == NULL) ) + { + gdprintk(XENLOG_ERR, + "XEN_DOMCTL_deassign_device: get_domain_by_id() failed\n"); + break; + } + + bus = (domctl->u.assign_device.machine_bdf >> 16) & 0xff; + devfn = (domctl->u.assign_device.machine_bdf >> 8) & 0xff; + + spin_lock(&pcidevs_lock); + ret = deassign_device(d, bus, devfn); + spin_unlock(&pcidevs_lock); + gdprintk(XENLOG_INFO, "XEN_DOMCTL_deassign_device: bdf = %x:%x:%x\n", + bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + + put_domain(d); + } + break; + + case XEN_DOMCTL_bind_pt_irq: + { + struct domain * d; + xen_domctl_bind_pt_irq_t * bind; + + ret = -ESRCH; + if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL ) + break; + bind = &(domctl->u.bind_pt_irq); + + ret = -ESRCH; + if ( iommu_enabled ) + { + spin_lock(&pcidevs_lock); + ret = pt_irq_create_bind_vtd(d, bind); + spin_unlock(&pcidevs_lock); + } + if ( ret < 0 ) + gdprintk(XENLOG_ERR, "pt_irq_create_bind failed!\n"); + + rcu_unlock_domain(d); + } + break; + + case XEN_DOMCTL_unbind_pt_irq: + { + struct domain * d; + xen_domctl_bind_pt_irq_t * bind; + + ret = -ESRCH; + if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL ) + break; + bind = &(domctl->u.bind_pt_irq); + if ( iommu_enabled ) + { + spin_lock(&pcidevs_lock); + ret = pt_irq_destroy_bind_vtd(d, bind); + spin_unlock(&pcidevs_lock); + } + if ( ret < 0 ) + gdprintk(XENLOG_ERR, "pt_irq_destroy_bind failed!\n"); + rcu_unlock_domain(d); + } + break; + + case 
XEN_DOMCTL_memory_mapping: + { + struct domain *d; + unsigned long gfn = domctl->u.memory_mapping.first_gfn; + unsigned long mfn = domctl->u.memory_mapping.first_mfn; + unsigned long nr_mfns = domctl->u.memory_mapping.nr_mfns; + int i; + + ret = -EINVAL; + if ( (mfn + nr_mfns - 1) < mfn ) /* wrap? */ + break; + + ret = -ESRCH; + if ( unlikely((d = rcu_lock_domain_by_id(domctl->domain)) == NULL) ) + break; + + ret=0; + if ( domctl->u.memory_mapping.add_mapping ) + { + gdprintk(XENLOG_INFO, + "memory_map:add: gfn=%lx mfn=%lx nr_mfns=%lx\n", + gfn, mfn, nr_mfns); + + ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1); + for ( i = 0; i < nr_mfns; i++ ) + set_mmio_p2m_entry(d, gfn+i, _mfn(mfn+i)); + } + else + { + gdprintk(XENLOG_INFO, + "memory_map:remove: gfn=%lx mfn=%lx nr_mfns=%lx\n", + gfn, mfn, nr_mfns); + + for ( i = 0; i < nr_mfns; i++ ) + clear_mmio_p2m_entry(d, gfn+i); + ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1); + } + + rcu_unlock_domain(d); + } + break; + + case XEN_DOMCTL_ioport_mapping: + { +#define MAX_IOPORTS 0x10000 + struct domain *d; + struct hvm_iommu *hd; + unsigned int fgp = domctl->u.ioport_mapping.first_gport; + unsigned int fmp = domctl->u.ioport_mapping.first_mport; + unsigned int np = domctl->u.ioport_mapping.nr_ports; + struct g2m_ioport *g2m_ioport; + int found = 0; + + ret = -EINVAL; + if ( (np == 0) || (fgp > MAX_IOPORTS) || (fmp > MAX_IOPORTS) || + ((fgp + np) > MAX_IOPORTS) || ((fmp + np) > MAX_IOPORTS) ) + { + gdprintk(XENLOG_ERR, + "ioport_map:invalid:gport=%x mport=%x nr_ports=%x\n", + fgp, fmp, np); + break; + } + + ret = -ESRCH; + if ( unlikely((d = rcu_lock_domain_by_id(domctl->domain)) == NULL) ) + break; + + hd = domain_hvm_iommu(d); + if ( domctl->u.ioport_mapping.add_mapping ) + { + gdprintk(XENLOG_INFO, + "ioport_map:add f_gport=%x f_mport=%x np=%x\n", + fgp, fmp, np); + + list_for_each_entry(g2m_ioport, &hd->g2m_ioport_list, list) + if (g2m_ioport->mport == fmp ) + { + g2m_ioport->gport = fgp; + g2m_ioport->np = np; + found = 1; + break; + } + if ( !found ) + { + g2m_ioport = xmalloc(struct g2m_ioport); + g2m_ioport->gport = fgp; + g2m_ioport->mport = fmp; + g2m_ioport->np = np; + list_add_tail(&g2m_ioport->list, &hd->g2m_ioport_list); + } + ret = ioports_permit_access(d, fmp, fmp + np - 1); + } + else + { + gdprintk(XENLOG_INFO, + "ioport_map:remove f_gport=%x f_mport=%x np=%x\n", + fgp, fmp, np); + list_for_each_entry(g2m_ioport, &hd->g2m_ioport_list, list) + if ( g2m_ioport->mport == fmp ) + { + list_del(&g2m_ioport->list); + xfree(g2m_ioport); + break; + } + ret = ioports_deny_access(d, fmp, fmp + np - 1); + } + rcu_unlock_domain(d); + } + break; + + case XEN_DOMCTL_sendtrigger: + { + struct domain *d; + struct vcpu *v; + + ret = -ESRCH; + if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL ) + break; + + ret = -EINVAL; + if ( domctl->u.sendtrigger.vcpu >= MAX_VIRT_CPUS ) + goto sendtrigger_out; + + ret = -ESRCH; + if ( (v = d->vcpu[domctl->u.sendtrigger.vcpu]) == NULL ) + goto sendtrigger_out; + + switch ( domctl->u.sendtrigger.trigger ) + { + case XEN_DOMCTL_SENDTRIGGER_NMI: + { + ret = -ENOSYS; + if ( !is_hvm_domain(d) ) + break; + + ret = 0; + if ( !test_and_set_bool(v->arch.hvm_vcpu.nmi_pending) ) + vcpu_kick(v); + } + break; + + default: + ret = -ENOSYS; + } + + sendtrigger_out: + rcu_unlock_domain(d); + } + break; + + case XEN_DOMCTL_gdbsx_guestmemio: + { + struct domain *d; + + ret = -ESRCH; + if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL ) + break; + + domctl->u.gdbsx_guest_memio.remain = + 
domctl->u.gdbsx_guest_memio.len; + + ret = gdbsx_guest_mem_io(domctl->domain, &domctl->u.gdbsx_guest_memio); + if ( !ret && copy_to_guest(u_domctl, domctl, 1) ) + ret = -EFAULT; + + rcu_unlock_domain(d); + } + break; + + case XEN_DOMCTL_gdbsx_pausevcpu: + { + struct domain *d; + struct vcpu *v; + + ret = -ESRCH; + if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL ) + break; + + ret = -EBUSY; + if ( !d->is_paused_by_controller ) + { + rcu_unlock_domain(d); + break; + } + ret = -EINVAL; + if ( domctl->u.gdbsx_pauseunp_vcpu.vcpu >= MAX_VIRT_CPUS || + (v = d->vcpu[domctl->u.gdbsx_pauseunp_vcpu.vcpu]) == NULL ) + { + rcu_unlock_domain(d); + break; + } + vcpu_pause(v); + ret = 0; + rcu_unlock_domain(d); + } + break; + + case XEN_DOMCTL_gdbsx_unpausevcpu: + { + struct domain *d; + struct vcpu *v; + ret = -ESRCH; + if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL ) + break; + + ret = -EBUSY; + if ( !d->is_paused_by_controller ) + { + rcu_unlock_domain(d); + break; + } + ret = -EINVAL; + if ( domctl->u.gdbsx_pauseunp_vcpu.vcpu >= MAX_VIRT_CPUS || + (v = d->vcpu[domctl->u.gdbsx_pauseunp_vcpu.vcpu]) == NULL ) + { + rcu_unlock_domain(d); + break; + } + if ( !atomic_read(&v->pause_count) ) + printk("WARN: Unpausing vcpu:%d which is not paused\n", v->vcpu_id); + vcpu_unpause(v); ret = 0; rcu_unlock_domain(d); + } + break; + case XEN_DOMCTL_gdbsx_domstatus: + { + struct domain *d; + struct vcpu *v; + + ret = -ESRCH; + if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL ) + break; + + domctl->u.gdbsx_domstatus.vcpu_id = -1; + domctl->u.gdbsx_domstatus.paused = d->is_paused_by_controller; + if ( domctl->u.gdbsx_domstatus.paused ) + { + for_each_vcpu ( d, v ) + { + if ( v->arch.gdbsx_vcpu_event ) + { + domctl->u.gdbsx_domstatus.vcpu_id = v->vcpu_id; + domctl->u.gdbsx_domstatus.vcpu_ev = + v->arch.gdbsx_vcpu_event; + v->arch.gdbsx_vcpu_event = 0; + break; + } + } + } + ret = 0; if ( copy_to_guest(u_domctl, domctl, 1) ) ret = -EFAULT; + rcu_unlock_domain(d); } break; @@ -485,9 +965,9 @@ void arch_get_info_guest(struct vcpu *v, c.nat->ctrlreg[3] = xen_pfn_to_cr3( pagetable_get_pfn(v->arch.guest_table)); #ifdef __x86_64__ - if ( !pagetable_is_null(v->arch.guest_table_user) ) - c.nat->ctrlreg[1] = xen_pfn_to_cr3( - pagetable_get_pfn(v->arch.guest_table_user)); + c.nat->ctrlreg[1] = + pagetable_is_null(v->arch.guest_table_user) ? 0 + : xen_pfn_to_cr3(pagetable_get_pfn(v->arch.guest_table_user)); #endif } #ifdef CONFIG_COMPAT diff -Naurp xen/arch/x86/e820.c xen-redhat/arch/x86/e820.c --- xen/arch/x86/e820.c +++ xen-redhat/arch/x86/e820.c @@ -2,6 +2,7 @@ #include <xen/init.h> #include <xen/lib.h> #include <xen/compat.h> +#include <xen/dmi.h> #include <asm/e820.h> #include <asm/page.h> @@ -367,6 +368,15 @@ static void __init clip_mem(void) } } +static void __init reserve_dmi_region(void) +{ + u32 base, len; + if ( (dmi_get_table(&base, &len) == 0) && ((base + len) > base) && + reserve_e820_ram(&e820, base, base + len) ) + printk("WARNING: DMI table located in E820 RAM %08x-%08x. Fixed.\n", + base, base+len); +} + static void __init machine_specific_memory_setup( struct e820entry *raw, int *raw_nr) { @@ -376,6 +386,73 @@ static void __init machine_specific_memo (void)copy_e820_map(raw, nr); clip_4gb(); clip_mem(); + reserve_dmi_region(); +} + +/* Reserve RAM area (@s,@e) in the specified e820 map. 
*/ +int __init reserve_e820_ram(struct e820map *e820, uint64_t s, uint64_t e) +{ + uint64_t rs = 0, re = 0; + int i; + + for ( i = 0; i < e820->nr_map; i++ ) + { + /* Have we found the e820 region that includes the specified range? */ + rs = e820->map[i].addr; + re = rs + e820->map[i].size; + if ( (s >= rs) && (e <= re) ) + break; + } + + if ( (i == e820->nr_map) || (e820->map[i].type != E820_RAM) ) + return 0; + + if ( (s == rs) && (e == re) ) + { + /* Complete excision. */ + memmove(&e820->map[i], &e820->map[i+1], + (e820->nr_map-i-1) * sizeof(e820->map[0])); + e820->nr_map--; + } + else if ( s == rs ) + { + /* Truncate start. */ + e820->map[i].addr += e - s; + e820->map[i].size -= e - s; + } + else if ( e == re ) + { + /* Truncate end. */ + e820->map[i].size -= e - s; + } + else if ( e820->nr_map < ARRAY_SIZE(e820->map) ) + { + /* Split in two. */ + memmove(&e820->map[i+1], &e820->map[i], + (e820->nr_map-i) * sizeof(e820->map[0])); + e820->nr_map++; + e820->map[i].size = s - rs; + i++; + e820->map[i].addr = e; + e820->map[i].size = re - e; + } + else + { + /* e820map is at maximum size. We have to leak some space. */ + if ( (s - rs) > (re - e) ) + { + printk("e820 overflow: leaking RAM %"PRIx64"-%"PRIx64"\n", e, re); + e820->map[i].size = s - rs; + } + else + { + printk("e820 overflow: leaking RAM %"PRIx64"-%"PRIx64"\n", rs, s); + e820->map[i].addr = e; + e820->map[i].size = re - e; + } + } + + return 1; } unsigned long __init init_e820( diff -Naurp xen/arch/x86/flushtlb.c xen-redhat/arch/x86/flushtlb.c --- xen/arch/x86/flushtlb.c +++ xen-redhat/arch/x86/flushtlb.c @@ -83,9 +83,12 @@ void write_cr3(unsigned long cr3) hvm_flush_guest_tlbs(); #ifdef USER_MAPPINGS_ARE_GLOBAL - __pge_off(); - __asm__ __volatile__ ( "mov %0, %%cr3" : : "r" (cr3) : "memory" ); - __pge_on(); + { + unsigned long cr4 = read_cr4(); + write_cr4(cr4 & ~X86_CR4_PGE); + asm volatile ( "mov %0, %%cr3" : : "r" (cr3) : "memory" ); + write_cr4(cr4); + } #else __asm__ __volatile__ ( "mov %0, %%cr3" : : "r" (cr3) : "memory" ); #endif @@ -108,8 +111,12 @@ void local_flush_tlb(void) hvm_flush_guest_tlbs(); #ifdef USER_MAPPINGS_ARE_GLOBAL - __pge_off(); - __pge_on(); + { + unsigned long cr4 = read_cr4(); + write_cr4(cr4 & ~X86_CR4_PGE); + barrier(); + write_cr4(cr4); + } #else __asm__ __volatile__ ( "mov %0, %%cr3" : : "r" (read_cr3()) : "memory" ); #endif diff -Naurp xen/arch/x86/genapic/es7000plat.c xen-redhat/arch/x86/genapic/es7000plat.c --- xen/arch/x86/genapic/es7000plat.c +++ xen-redhat/arch/x86/genapic/es7000plat.c @@ -36,6 +36,7 @@ #include <asm/io.h> #include <asm/smp.h> #include <asm/apicdef.h> +#include <asm/processor.h> #include "es7000.h" /* diff -Naurp xen/arch/x86/hvm/hpet.c xen-redhat/arch/x86/hvm/hpet.c --- xen/arch/x86/hvm/hpet.c +++ xen-redhat/arch/x86/hvm/hpet.c @@ -24,6 +24,12 @@ #include <xen/sched.h> #include <xen/event.h> +#define domain_vhpet(x) (&(x)->arch.hvm_domain.pl_time.vhpet) +#define vcpu_vhpet(x) (domain_vhpet((x)->domain)) +#define vhpet_domain(x) (container_of((x), struct domain, \ + arch.hvm_domain.pl_time.vhpet)) +#define vhpet_vcpu(x) (pt_global_vcpu_target(vhpet_domain(x))) + #define HPET_BASE_ADDRESS 0xfed00000ULL #define HPET_MMAP_SIZE 1024 #define S_TO_NS 1000000000ULL /* 1s = 10^9 ns */ @@ -31,7 +37,7 @@ /* Frequency_of_TSC / frequency_of_HPET = 32 */ #define TSC_PER_HPET_TICK 32 -#define guest_time_hpet(v) (hvm_get_guest_time(v) / TSC_PER_HPET_TICK) +#define guest_time_hpet(hpet) (hvm_get_guest_tsc(vhpet_vcpu(hpet)) / TSC_PER_HPET_TICK) #define HPET_ID 0x000 #define HPET_PERIOD 
0x004 @@ -71,8 +77,9 @@ #define HPET_TN_INT_ROUTE_CAP_MASK (0xffffffffULL \ << HPET_TN_INT_ROUTE_CAP_SHIFT) -#define hpet_tick_to_ns(h, tick) ((s_time_t)(tick)* \ - (S_TO_NS*TSC_PER_HPET_TICK)/h->tsc_freq) +#define hpet_tick_to_ns(h, tick) \ + ((s_time_t)((((tick) > (h)->hpet_to_ns_limit) ? \ + ~0ULL : (tick) * (h)->hpet_to_ns_scale) >> 10)) #define timer_config(h, n) (h->hpet.timers[n].config) #define timer_enabled(h, n) (timer_config(h, n) & HPET_TN_ENABLE) @@ -116,22 +123,26 @@ static inline uint64_t hpet_read_maincou ASSERT(spin_is_locked(&h->lock)); if ( hpet_enabled(h) ) - return guest_time_hpet(h->vcpu) + h->mc_offset; + return guest_time_hpet(h) + h->mc_offset; else return h->hpet.mc64; } -static unsigned long hpet_read( - struct vcpu *v, unsigned long addr, unsigned long length) +static int hpet_read( + struct vcpu *v, unsigned long addr, unsigned long length, + unsigned long *pval) { - HPETState *h = &v->domain->arch.hvm_domain.pl_time.vhpet; + HPETState *h = vcpu_vhpet(v); unsigned long result; uint64_t val; addr &= HPET_MMAP_SIZE-1; if ( hpet_check_access_length(addr, length) != 0 ) - return ~0UL; + { + result = ~0ul; + goto out; + } spin_lock(&h->lock); @@ -145,7 +156,9 @@ static unsigned long hpet_read( spin_unlock(&h->lock); - return result; + out: + *pval = result; + return 1; } static void hpet_stop_timer(HPETState *h, unsigned int tn) @@ -173,7 +186,7 @@ static void hpet_set_timer(HPETState *h, { /* HPET specification requires PIT shouldn't generate * interrupts if LegacyReplacementRoute is set for timer0 */ - PITState *pit = &h->vcpu->domain->arch.hvm_domain.pl_time.vpit; + PITState *pit = &vhpet_domain(h)->arch.hvm_domain.pl_time.vpit; pit_stop_channel0_irq(pit); } @@ -208,18 +221,18 @@ static inline uint64_t hpet_fixup_reg( return new; } -static void hpet_write( +static int hpet_write( struct vcpu *v, unsigned long addr, unsigned long length, unsigned long val) { - HPETState *h = &v->domain->arch.hvm_domain.pl_time.vhpet; + HPETState *h = vcpu_vhpet(v); uint64_t old_val, new_val; int tn, i; addr &= HPET_MMAP_SIZE-1; if ( hpet_check_access_length(addr, length) != 0 ) - return; + goto out; spin_lock(&h->lock); @@ -241,14 +254,14 @@ static void hpet_write( if ( !(old_val & HPET_CFG_ENABLE) && (new_val & HPET_CFG_ENABLE) ) { /* Enable main counter and interrupt generation. */ - h->mc_offset = h->hpet.mc64 - guest_time_hpet(h->vcpu); + h->mc_offset = h->hpet.mc64 - guest_time_hpet(h); for ( i = 0; i < HPET_TIMER_NUM; i++ ) hpet_set_timer(h, i); } else if ( (old_val & HPET_CFG_ENABLE) && !(new_val & HPET_CFG_ENABLE) ) { /* Halt main counter and disable interrupt generation. 
*/ - h->hpet.mc64 = h->mc_offset + guest_time_hpet(h->vcpu); + h->hpet.mc64 = h->mc_offset + guest_time_hpet(h); for ( i = 0; i < HPET_TIMER_NUM; i++ ) hpet_stop_timer(h, i); } @@ -314,11 +327,15 @@ static void hpet_write( } spin_unlock(&h->lock); + + out: + return 1; } static int hpet_range(struct vcpu *v, unsigned long addr) { - return ((addr >= HPET_BASE_ADDRESS) && + return (v->domain->arch.hvm_domain.params[HVM_PARAM_HPET_ENABLED] && + (addr >= HPET_BASE_ADDRESS) && (addr < (HPET_BASE_ADDRESS + HPET_MMAP_SIZE))); } @@ -331,7 +348,7 @@ struct hvm_mmio_handler hpet_mmio_handle static void hpet_route_interrupt(HPETState *h, unsigned int tn) { unsigned int tn_int_route = timer_int_route(h, tn); - struct domain *d = h->vcpu->domain; + struct domain *d = vhpet_domain(h); ASSERT(spin_is_locked(&h->lock)); @@ -399,25 +416,25 @@ static void hpet_timer_fn(void *opaque) void hpet_migrate_timers(struct vcpu *v) { - struct HPETState *h = &v->domain->arch.hvm_domain.pl_time.vhpet; + struct HPETState *h = vcpu_vhpet(v); int i; - if ( v != h->vcpu ) + if (v != vhpet_vcpu (h)) return; - + for ( i = 0; i < HPET_TIMER_NUM; i++ ) migrate_timer(&h->timers[i], v->processor); } static int hpet_save(struct domain *d, hvm_domain_context_t *h) { - HPETState *hp = &d->arch.hvm_domain.pl_time.vhpet; + HPETState *hp = domain_vhpet(d); int rc; spin_lock(&hp->lock); /* Write the proper value into the main counter */ - hp->hpet.mc64 = hp->mc_offset + guest_time_hpet(hp->vcpu); + hp->hpet.mc64 = hp->mc_offset + guest_time_hpet(hp); /* Save the HPET registers */ rc = hvm_save_entry(HPET, 0, h, &hp->hpet); @@ -429,7 +446,7 @@ static int hpet_save(struct domain *d, h static int hpet_load(struct domain *d, hvm_domain_context_t *h) { - HPETState *hp = &d->arch.hvm_domain.pl_time.vhpet; + HPETState *hp = domain_vhpet(d); int i; spin_lock(&hp->lock); @@ -442,7 +459,7 @@ static int hpet_load(struct domain *d, h } /* Recalculate the offset between the main counter and guest time */ - hp->mc_offset = hp->hpet.mc64 - guest_time_hpet(hp->vcpu); + hp->mc_offset = hp->hpet.mc64 - guest_time_hpet(hp); /* Restart the timers */ for ( i = 0; i < HPET_TIMER_NUM; i++ ) @@ -457,16 +474,18 @@ HVM_REGISTER_SAVE_RESTORE(HPET, hpet_sav void hpet_init(struct vcpu *v) { - HPETState *h = &v->domain->arch.hvm_domain.pl_time.vhpet; + HPETState *h = vcpu_vhpet(v); int i; memset(h, 0, sizeof(HPETState)); spin_lock_init(&h->lock); - h->vcpu = v; h->tsc_freq = ticks_per_sec(v); + h->hpet_to_ns_scale = ((S_TO_NS * TSC_PER_HPET_TICK) << 10) / h->tsc_freq; + h->hpet_to_ns_limit = (~0ULL >> 1) / h->hpet_to_ns_scale; + /* 64-bit main counter; 3 timers supported; LegacyReplacementRoute. */ h->hpet.capability = 0x8086A201ULL; @@ -489,7 +508,7 @@ void hpet_init(struct vcpu *v) void hpet_deinit(struct domain *d) { int i; - HPETState *h = &d->arch.hvm_domain.pl_time.vhpet; + HPETState *h = domain_vhpet(d); for ( i = 0; i < HPET_TIMER_NUM; i++ ) kill_timer(&h->timers[i]); diff -Naurp xen/arch/x86/hvm/hvm.c xen-redhat/arch/x86/hvm/hvm.c --- xen/arch/x86/hvm/hvm.c +++ xen-redhat/arch/x86/hvm/hvm.c @@ -49,6 +49,16 @@ #include <public/version.h> #include <public/memory.h> +/* + * Xen command-line option to allow/disallow hardware-assisted paging. + * Since the phys-to-machine table of AMD NPT is in host format, 32-bit Xen + * can only support guests using NPT with up to a 4GB memory map. Therefore + * we disallow HAP by default on PAE Xen (by default we want to support an + * 8GB pseudophysical memory map for HVM guests on a PAE host). 
+ */ +static int opt_hap_permitted = (CONFIG_PAGING_LEVELS != 3); +boolean_param("hap", opt_hap_permitted); + int hvm_enabled __read_mostly; unsigned int opt_hvm_debug_level __read_mostly; @@ -74,6 +84,14 @@ void hvm_enable(struct hvm_function_tabl hvm_funcs = *fns; hvm_enabled = 1; + + if ( hvm_funcs.hap_supported ) + { + if ( !opt_hap_permitted ) + hvm_funcs.hap_supported = 0; + printk("HVM: Hardware Assisted Paging detected %s.\n", + hvm_funcs.hap_supported ? "and enabled" : "but disabled"); + } } void hvm_disable(void) @@ -89,17 +107,17 @@ void hvm_stts(struct vcpu *v) hvm_funcs.stts(v); } -void hvm_set_guest_time(struct vcpu *v, u64 gtime) +void hvm_set_guest_tsc(struct vcpu *v, u64 guest_tsc) { u64 host_tsc; rdtscll(host_tsc); - v->arch.hvm_vcpu.cache_tsc_offset = gtime - host_tsc; + v->arch.hvm_vcpu.cache_tsc_offset = guest_tsc - host_tsc; hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset); } -u64 hvm_get_guest_time(struct vcpu *v) +u64 hvm_get_guest_tsc(struct vcpu *v) { u64 host_tsc; @@ -120,7 +138,7 @@ void hvm_do_resume(struct vcpu *v) hvm_stts(v); - pt_thaw_time(v); + pt_restore_timer(v); /* NB. Optimised for common case (p->state == STATE_IOREQ_NONE). */ p = &get_ioreq(v)->vp_ioreq; @@ -222,10 +240,19 @@ int hvm_domain_initialise(struct domain return -EINVAL; } + d->arch.hvm_domain.vmx_apic_access_mfn = INVALID_MFN; + spin_lock_init(&d->arch.hvm_domain.pbuf_lock); spin_lock_init(&d->arch.hvm_domain.irq_lock); spin_lock_init(&d->arch.hvm_domain.vapic_access_lock); + INIT_LIST_HEAD(&d->arch.hvm_domain.msixtbl_list); + spin_lock_init(&d->arch.hvm_domain.msixtbl_list_lock); + + hvm_init_guest_time(d); + + d->arch.hvm_domain.params[HVM_PARAM_HPET_ENABLED] = 1; + rc = paging_enable(d, PG_refcounts|PG_translate|PG_external); if ( rc != 0 ) return rc; @@ -236,14 +263,21 @@ int hvm_domain_initialise(struct domain hvm_init_ioreq_page(d, &d->arch.hvm_domain.ioreq); hvm_init_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq); - return 0; + if ( hvm_funcs.domain_initialise ) + rc = hvm_funcs.domain_initialise(d); + + return rc; } +extern void msixtbl_pt_cleanup(struct domain *d); + void hvm_domain_relinquish_resources(struct domain *d) { hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.ioreq); hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq); + msixtbl_pt_cleanup(d); + pit_deinit(d); rtc_deinit(d); pmtimer_deinit(d); @@ -252,6 +286,8 @@ void hvm_domain_relinquish_resources(str void hvm_domain_destroy(struct domain *d) { + if ( hvm_funcs.domain_destroy ) + hvm_funcs.domain_destroy(d); } static int hvm_save_cpu_ctxt(struct domain *d, hvm_domain_context_t *h) @@ -446,7 +482,7 @@ int hvm_vcpu_initialise(struct vcpu *v) hpet_init(v); /* Init guest TSC to start from zero. */ - hvm_set_guest_time(v, 0); + hvm_set_guest_tsc(v, 0); return 0; } @@ -553,6 +589,403 @@ void hvm_triple_fault(void) domain_shutdown(v->domain, SHUTDOWN_reboot); } +int hvm_virtual_to_linear_addr( + enum x86_segment seg, + struct segment_register *reg, + unsigned long offset, + unsigned int bytes, + enum hvm_access_type access_type, + unsigned int addr_size, + unsigned long *linear_addr) +{ + unsigned long addr = offset; + uint32_t last_byte; + + if ( addr_size != 64 ) + { + /* + * COMPATIBILITY MODE: Apply segment checks and add base. 
+ */ + + switch ( access_type ) + { + case hvm_access_read: + if ( (reg->attr.fields.type & 0xa) == 0x8 ) + goto gpf; /* execute-only code segment */ + break; + case hvm_access_write: + if ( (reg->attr.fields.type & 0xa) != 0x2 ) + goto gpf; /* not a writable data segment */ + break; + default: + break; + } + + last_byte = offset + bytes - 1; + + /* Is this a grows-down data segment? Special limit check if so. */ + if ( (reg->attr.fields.type & 0xc) == 0x4 ) + { + /* Is upper limit 0xFFFF or 0xFFFFFFFF? */ + if ( !reg->attr.fields.db ) + last_byte = (uint16_t)last_byte; + + /* Check first byte and last byte against respective bounds. */ + if ( (offset <= reg->limit) || (last_byte < offset) ) + goto gpf; + } + else if ( (last_byte > reg->limit) || (last_byte < offset) ) + goto gpf; /* last byte is beyond limit or wraps 0xFFFFFFFF */ + + /* + * Hardware truncates to 32 bits in compatibility mode. + * It does not truncate to 16 bits in 16-bit address-size mode. + */ + addr = (uint32_t)(addr + reg->base); + } + else + { + /* + * LONG MODE: FS and GS add segment base. Addresses must be canonical. + */ + + if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) ) + addr += reg->base; + + if ( !is_canonical_address(addr) ) + goto gpf; + } + + *linear_addr = addr; + return 1; + + gpf: + return 0; +} + +static void *hvm_map(unsigned long va, int size) +{ + unsigned long gfn, mfn; + + if ( ((va & ~PAGE_MASK) + size) > PAGE_SIZE ) + { + hvm_inject_exception(TRAP_page_fault, PFEC_write_access, + (va + PAGE_SIZE - 1) & PAGE_MASK); + return NULL; + } + + gfn = paging_gva_to_gfn(current, va); + mfn = mfn_x(gfn_to_mfn_current(gfn)); + + ASSERT(mfn_valid(mfn)); + + paging_mark_dirty(current->domain, mfn); + + return (char *)map_domain_page(mfn) + (va & ~PAGE_MASK); +} + +static void hvm_unmap(void *p) +{ + if ( p ) + unmap_domain_page(p); +} + +static int hvm_load_segment_selector( + struct vcpu *v, enum x86_segment seg, uint16_t sel) +{ + struct segment_register desctab, cs, segr; + struct desc_struct *pdesc, desc; + u8 dpl, rpl, cpl; + int fault_type = TRAP_invalid_tss; + + /* NULL selector? */ + if ( (sel & 0xfffc) == 0 ) + { + if ( (seg == x86_seg_cs) || (seg == x86_seg_ss) ) + goto fail; + memset(&segr, 0, sizeof(segr)); + hvm_set_segment_register(v, seg, &segr); + return 0; + } + + /* LDT descriptor must be in the GDT. */ + if ( (seg == x86_seg_ldtr) && (sel & 4) ) + goto fail; + + hvm_get_segment_register(v, x86_seg_cs, &cs); + hvm_get_segment_register( + v, (sel & 4) ? x86_seg_ldtr : x86_seg_gdtr, &desctab); + + /* Check against descriptor table limit. */ + if ( ((sel & 0xfff8) + 7) > desctab.limit ) + goto fail; + + pdesc = hvm_map(desctab.base + (sel & 0xfff8), 8); + if ( pdesc == NULL ) + goto hvm_map_fail; + + do { + desc = *pdesc; + + /* Segment present in memory? */ + if ( !(desc.b & (1u<<15)) ) + { + fault_type = TRAP_no_segment; + goto unmap_and_fail; + } + + /* LDT descriptor is a system segment. All others are code/data. */ + if ( (desc.b & (1u<<12)) == ((seg == x86_seg_ldtr) << 12) ) + goto unmap_and_fail; + + dpl = (desc.b >> 13) & 3; + rpl = sel & 3; + cpl = cs.sel & 3; + + switch ( seg ) + { + case x86_seg_cs: + /* Code segment? */ + if ( !(desc.b & (1u<<11)) ) + goto unmap_and_fail; + /* Non-conforming segment: check DPL against RPL. */ + if ( ((desc.b & (6u<<9)) != 6) && (dpl != rpl) ) + goto unmap_and_fail; + break; + case x86_seg_ss: + /* Writable data segment? 
*/ + if ( (desc.b & (5u<<9)) != (1u<<9) ) + goto unmap_and_fail; + if ( (dpl != cpl) || (dpl != rpl) ) + goto unmap_and_fail; + break; + case x86_seg_ldtr: + /* LDT system segment? */ + if ( (desc.b & (15u<<8)) != (2u<<8) ) + goto unmap_and_fail; + goto skip_accessed_flag; + default: + /* Readable code or data segment? */ + if ( (desc.b & (5u<<9)) == (4u<<9) ) + goto unmap_and_fail; + /* Non-conforming segment: check DPL against RPL and CPL. */ + if ( ((desc.b & (6u<<9)) != 6) && ((dpl < cpl) || (dpl < rpl)) ) + goto unmap_and_fail; + break; + } + } while ( !(desc.b & 0x100) && /* Ensure Accessed flag is set */ + (cmpxchg(&pdesc->b, desc.b, desc.b | 0x100) != desc.b) ); + + /* Force the Accessed flag in our local copy. */ + desc.b |= 0x100; + + skip_accessed_flag: + hvm_unmap(pdesc); + + segr.base = (((desc.b << 0) & 0xff000000u) | + ((desc.b << 16) & 0x00ff0000u) | + ((desc.a >> 16) & 0x0000ffffu)); + segr.attr.bytes = (((desc.b >> 8) & 0x00ffu) | + ((desc.b >> 12) & 0x0f00u)); + segr.limit = (desc.b & 0x000f0000u) | (desc.a & 0x0000ffffu); + if ( segr.attr.fields.g ) + segr.limit = (segr.limit << 12) | 0xfffu; + segr.sel = sel; + hvm_set_segment_register(v, seg, &segr); + + return 0; + + unmap_and_fail: + hvm_unmap(pdesc); + fail: + hvm_inject_exception(fault_type, sel & 0xfffc, 0); + hvm_map_fail: + return 1; +} + +void hvm_task_switch( + uint16_t tss_sel, enum hvm_task_switch_reason taskswitch_reason, + int32_t errcode) +{ + struct vcpu *v = current; + struct cpu_user_regs *regs = guest_cpu_user_regs(); + struct segment_register gdt, tr, prev_tr, segr; + struct desc_struct *optss_desc = NULL, *nptss_desc = NULL, tss_desc; + unsigned long eflags; + int exn_raised; + struct { + u16 back_link,__blh; + u32 esp0; + u16 ss0, _0; + u32 esp1; + u16 ss1, _1; + u32 esp2; + u16 ss2, _2; + u32 cr3, eip, eflags, eax, ecx, edx, ebx, esp, ebp, esi, edi; + u16 es, _3, cs, _4, ss, _5, ds, _6, fs, _7, gs, _8, ldt, _9; + u16 trace, iomap; + } *ptss, tss; + + hvm_get_segment_register(v, x86_seg_gdtr, &gdt); + hvm_get_segment_register(v, x86_seg_tr, &prev_tr); + + if ( ((tss_sel & 0xfff8) + 7) > gdt.limit ) + { + hvm_inject_exception((taskswitch_reason == TSW_iret) ? + TRAP_invalid_tss : TRAP_gp_fault, + tss_sel & 0xfff8, 0); + goto out; + } + + optss_desc = hvm_map(gdt.base + (prev_tr.sel & 0xfff8), 8); + if ( optss_desc == NULL ) + goto out; + + nptss_desc = hvm_map(gdt.base + (tss_sel & 0xfff8), 8); + if ( nptss_desc == NULL ) + goto out; + + tss_desc = *nptss_desc; + tr.sel = tss_sel; + tr.base = (((tss_desc.b << 0) & 0xff000000u) | + ((tss_desc.b << 16) & 0x00ff0000u) | + ((tss_desc.a >> 16) & 0x0000ffffu)); + tr.attr.bytes = (((tss_desc.b >> 8) & 0x00ffu) | + ((tss_desc.b >> 12) & 0x0f00u)); + tr.limit = (tss_desc.b & 0x000f0000u) | (tss_desc.a & 0x0000ffffu); + if ( tr.attr.fields.g ) + tr.limit = (tr.limit << 12) | 0xfffu; + + if ( !tr.attr.fields.p ) + { + hvm_inject_exception(TRAP_no_segment, tss_sel & 0xfff8, 0); + goto out; + } + + if ( tr.attr.fields.type != ((taskswitch_reason == TSW_iret) ? 0xb : 0x9) ) + { + hvm_inject_exception( + (taskswitch_reason == TSW_iret) ? 
TRAP_invalid_tss : TRAP_gp_fault, + tss_sel & 0xfff8, 0); + goto out; + } + + if ( !tr.attr.fields.g && (tr.limit < (sizeof(tss)-1)) ) + { + hvm_inject_exception(TRAP_invalid_tss, tss_sel & 0xfff8, 0); + goto out; + } + + hvm_store_cpu_guest_regs(v, regs, NULL); + + ptss = hvm_map(prev_tr.base, sizeof(tss)); + if ( ptss == NULL ) + goto out; + + eflags = regs->eflags; + if ( taskswitch_reason == TSW_iret ) + eflags &= ~X86_EFLAGS_NT; + + ptss->cr3 = hvm_get_guest_ctrl_reg(v, 3); + ptss->eip = regs->eip; + ptss->eflags = eflags; + ptss->eax = regs->eax; + ptss->ecx = regs->ecx; + ptss->edx = regs->edx; + ptss->ebx = regs->ebx; + ptss->esp = regs->esp; + ptss->ebp = regs->ebp; + ptss->esi = regs->esi; + ptss->edi = regs->edi; + + hvm_get_segment_register(v, x86_seg_es, &segr); + ptss->es = segr.sel; + hvm_get_segment_register(v, x86_seg_cs, &segr); + ptss->cs = segr.sel; + hvm_get_segment_register(v, x86_seg_ss, &segr); + ptss->ss = segr.sel; + hvm_get_segment_register(v, x86_seg_ds, &segr); + ptss->ds = segr.sel; + hvm_get_segment_register(v, x86_seg_fs, &segr); + ptss->fs = segr.sel; + hvm_get_segment_register(v, x86_seg_gs, &segr); + ptss->gs = segr.sel; + hvm_get_segment_register(v, x86_seg_ldtr, &segr); + ptss->ldt = segr.sel; + + hvm_unmap(ptss); + + ptss = hvm_map(tr.base, sizeof(tss)); + if ( ptss == NULL ) + goto out; + + if ( hvm_set_cr3(ptss->cr3) ) + goto out; + + regs->eip = ptss->eip; + regs->eflags = ptss->eflags | 2; + regs->eax = ptss->eax; + regs->ecx = ptss->ecx; + regs->edx = ptss->edx; + regs->ebx = ptss->ebx; + regs->esp = ptss->esp; + regs->ebp = ptss->ebp; + regs->esi = ptss->esi; + regs->edi = ptss->edi; + + if ( (taskswitch_reason == TSW_call_or_int) ) + { + regs->eflags |= X86_EFLAGS_NT; + ptss->back_link = prev_tr.sel; + } + + exn_raised = 0; + if ( hvm_load_segment_selector(v, x86_seg_es, ptss->es) || + hvm_load_segment_selector(v, x86_seg_cs, ptss->cs) || + hvm_load_segment_selector(v, x86_seg_ss, ptss->ss) || + hvm_load_segment_selector(v, x86_seg_ds, ptss->ds) || + hvm_load_segment_selector(v, x86_seg_fs, ptss->fs) || + hvm_load_segment_selector(v, x86_seg_gs, ptss->gs) || + hvm_load_segment_selector(v, x86_seg_ldtr, ptss->ldt) ) + exn_raised = 1; + + if ( (ptss->trace & 1) && !exn_raised ) + hvm_inject_exception(TRAP_debug, tss_sel & 0xfff8, 0); + + hvm_unmap(ptss); + + tr.attr.fields.type = 0xb; /* busy 32-bit tss */ + hvm_set_segment_register(v, x86_seg_tr, &tr); + + hvm_stts(v); + + if ( (taskswitch_reason == TSW_iret) || + (taskswitch_reason == TSW_jmp) ) + clear_bit(41, optss_desc); /* clear B flag of old task */ + + if ( taskswitch_reason != TSW_iret ) + set_bit(41, nptss_desc); /* set B flag of new task */ + + if ( errcode >= 0 ) + { + struct segment_register reg; + unsigned long linear_addr; + regs->esp -= 4; + hvm_get_segment_register(current, x86_seg_ss, ®); + /* Todo: do not ignore access faults here. 
*/ + if ( hvm_virtual_to_linear_addr(x86_seg_ss, ®, regs->esp, + 4, hvm_access_write, 32, + &linear_addr) ) + hvm_copy_to_guest_virt(linear_addr, &errcode, 4); + } + + hvm_load_cpu_guest_regs(v, regs); + + out: + hvm_unmap(optss_desc); + hvm_unmap(nptss_desc); +} + /* * __hvm_copy(): * @buf = hypervisor buffer @@ -580,7 +1013,8 @@ static int __hvm_copy(void *buf, paddr_t mfn = get_mfn_from_gpfn(gfn); - if ( mfn == INVALID_MFN ) + if ( (mfn == current->domain->arch.hvm_domain.vmx_apic_access_mfn) || + (mfn == INVALID_MFN) ) return todo; p = (char *)map_domain_page(mfn) + (addr & ~PAGE_MASK); @@ -663,6 +1097,9 @@ void hvm_cpuid(unsigned int input, unsig #endif clear_bit(X86_FEATURE_PAE & 31, edx); clear_bit(X86_FEATURE_PSE36 & 31, edx); + + /* "Hypervisor present" bit required for Microsoft SVVP. */ + set_bit (X86_FEATURE_HYPERVISOR & 31, ecx); } else if ( input == 0x80000001 ) { @@ -1106,6 +1543,11 @@ long do_hvm_op(unsigned long op, XEN_GUE hvm_set_callback_via(d, a.value); hvm_latch_shinfo_size(d); break; + case HVM_PARAM_TIMER_MODE: + rc = -EINVAL; + if ( a.value > HVMPTM_one_missed_tick_pending ) + goto param_fail; + break; } d->arch.hvm_domain.params[a.index] = a.value; rc = 0; @@ -1144,6 +1586,15 @@ long do_hvm_op(unsigned long op, XEN_GUE rc = guest_handle_is_null(arg) ? hvmop_flush_tlb_all() : -ENOSYS; break; + case HVMOP_get_time: { + xen_hvm_get_time_t gxt; + + gxt.now = NOW(); + if ( copy_to_guest(arg, &gxt, 1) ) + rc = -EFAULT; + break; + } + default: { gdprintk(XENLOG_WARNING, "Bad HVM op %ld.\n", op); diff -Naurp xen/arch/x86/hvm/i8254.c xen-redhat/arch/x86/hvm/i8254.c --- xen/arch/x86/hvm/i8254.c +++ xen-redhat/arch/x86/hvm/i8254.c @@ -31,6 +31,7 @@ #include <xen/lib.h> #include <xen/errno.h> #include <xen/sched.h> +#include <asm/time.h> #include <asm/hvm/hvm.h> #include <asm/hvm/io.h> #include <asm/hvm/support.h> @@ -41,7 +42,7 @@ #define vcpu_vpit(vcpu) (domain_vpit((vcpu)->domain)) #define vpit_domain(pit) (container_of((pit), struct domain, \ arch.hvm_domain.pl_time.vpit)) -#define vpit_vcpu(pit) (vpit_domain(pit)->vcpu[0]) +#define vpit_vcpu(pit) (pt_global_vcpu_target(vpit_domain(pit))) #define RW_STATE_LSB 1 #define RW_STATE_MSB 2 @@ -51,6 +52,9 @@ static int handle_pit_io(ioreq_t *p); static int handle_speaker_io(ioreq_t *p); +#define get_guest_time(v) \ + (is_hvm_vcpu(v) ? hvm_get_guest_time(v) : (u64)get_s_time()) + /* Compute with 96 bit intermediate result: (a*b)/c */ static uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) { @@ -84,8 +88,8 @@ static int pit_get_count(PITState *pit, ASSERT(spin_is_locked(&pit->lock)); - d = muldiv64(hvm_get_guest_time(v) - pit->count_load_time[channel], - PIT_FREQ, ticks_per_sec(v)); + d = muldiv64(get_guest_time(v) - pit->count_load_time[channel], + PIT_FREQ, SYSTEM_TIME_HZ); switch ( c->mode ) { @@ -115,8 +119,8 @@ static int pit_get_out(PITState *pit, in ASSERT(spin_is_locked(&pit->lock)); - d = muldiv64(hvm_get_guest_time(v) - pit->count_load_time[channel], - PIT_FREQ, ticks_per_sec(v)); + d = muldiv64(get_guest_time(v) - pit->count_load_time[channel], + PIT_FREQ, SYSTEM_TIME_HZ); switch ( s->mode ) { @@ -162,7 +166,7 @@ static void pit_set_gate(PITState *pit, case 3: /* Restart counting on rising edge. 
*/ if ( s->gate < val ) - pit->count_load_time[channel] = hvm_get_guest_time(v); + pit->count_load_time[channel] = get_guest_time(v); break; } @@ -178,7 +182,7 @@ int pit_get_gate(PITState *pit, int chan static void pit_time_fired(struct vcpu *v, void *priv) { uint64_t *count_load_time = priv; - *count_load_time = hvm_get_guest_time(v); + *count_load_time = get_guest_time(v); } static void pit_load_count(PITState *pit, int channel, int val) @@ -194,11 +198,11 @@ static void pit_load_count(PITState *pit val = 0x10000; if ( v == NULL ) - rdtscll(pit->count_load_time[channel]); + pit->count_load_time[channel] = 0; else - pit->count_load_time[channel] = hvm_get_guest_time(v); + pit->count_load_time[channel] = get_guest_time(v); s->count = val; - period = DIV_ROUND((val * 1000000000ULL), PIT_FREQ); + period = DIV_ROUND(val * SYSTEM_TIME_HZ, PIT_FREQ); if ( (v == NULL) || !is_hvm_vcpu(v) || (channel != 0) ) return; @@ -484,7 +488,7 @@ static int pit_load(struct domain *d, hv for ( i = 0; i < 3; i++ ) { pit_load_count(pit, i, pit->hw.channels[i].count); - pit->pt[i].last_plt_gtime = hvm_get_guest_time(d->vcpu[0]); + pit->pt[i].last_plt_gtime = get_guest_time(d->vcpu[0]); } pit_info(pit); @@ -517,6 +521,7 @@ void pit_init(struct vcpu *v, unsigned l s->mode = 0xff; /* the init mode */ s->gate = (i != 2); pit_load_count(pit, i, 0); + pit->pt[i].source = PTSRC_isa; } spin_unlock(&pit->lock); @@ -598,11 +603,13 @@ int pv_pit_handler(int port, int data, i .size = 1, .type = IOREQ_TYPE_PIO, .addr = port, - .dir = write ? 0 : 1, - .data = write ? data : 0, + .dir = write ? IOREQ_WRITE : IOREQ_READ, + .data = data }; - if ( port == 0x61 ) + if ( (current->domain->domain_id == 0) && dom0_pit_access(&ioreq) ) + /* nothing to do */; + else if ( port == 0x61 ) handle_speaker_io(&ioreq); else handle_pit_io(&ioreq); diff -Naurp xen/arch/x86/hvm/intercept.c xen-redhat/arch/x86/hvm/intercept.c --- xen/arch/x86/hvm/intercept.c +++ xen-redhat/arch/x86/hvm/intercept.c @@ -34,14 +34,16 @@ extern struct hvm_mmio_handler hpet_mmio_handler; extern struct hvm_mmio_handler vlapic_mmio_handler; extern struct hvm_mmio_handler vioapic_mmio_handler; +extern struct hvm_mmio_handler msixtbl_mmio_handler; -#define HVM_MMIO_HANDLER_NR 3 +#define HVM_MMIO_HANDLER_NR 4 static struct hvm_mmio_handler *hvm_mmio_handlers[HVM_MMIO_HANDLER_NR] = { &hpet_mmio_handler, &vlapic_mmio_handler, - &vioapic_mmio_handler + &vioapic_mmio_handler, + &msixtbl_mmio_handler }; struct hvm_buffered_io_range { @@ -58,30 +60,33 @@ static struct hvm_buffered_io_range &buffered_stdvga_range }; -static inline void hvm_mmio_access(struct vcpu *v, - ioreq_t *p, - hvm_mmio_read_t read_handler, - hvm_mmio_write_t write_handler) +static inline int hvm_mmio_access(struct vcpu *v, + ioreq_t *p, + hvm_mmio_read_t read_handler, + hvm_mmio_write_t write_handler) { - unsigned int tmp1, tmp2; + unsigned long tmp1, tmp2; unsigned long data; + int rc = 1; switch ( p->type ) { case IOREQ_TYPE_COPY: { if ( !p->data_is_ptr ) { - if ( p->dir == IOREQ_READ ) - p->data = read_handler(v, p->addr, p->size); + if ( p->dir == IOREQ_READ ) { + rc = read_handler(v, p->addr, p->size, &data); + p->data = data; + } else /* p->dir == IOREQ_WRITE */ - write_handler(v, p->addr, p->size, p->data); + rc = write_handler(v, p->addr, p->size, p->data); } else { /* p->data_is_ptr */ int i, sign = (p->df) ? 
-1 : 1; if ( p->dir == IOREQ_READ ) { for ( i = 0; i < p->count; i++ ) { - data = read_handler(v, + rc = read_handler(v, p->addr + (sign * i * p->size), - p->size); + p->size, &data); (void)hvm_copy_to_guest_phys( p->data + (sign * i * p->size), &data, @@ -93,7 +98,7 @@ static inline void hvm_mmio_access(struc &data, p->data + (sign * i * p->size), p->size); - write_handler(v, + rc = write_handler(v, p->addr + (sign * i * p->size), p->size, data); } @@ -103,37 +108,37 @@ static inline void hvm_mmio_access(struc } case IOREQ_TYPE_AND: - tmp1 = read_handler(v, p->addr, p->size); - if ( p->dir == IOREQ_WRITE ) { + rc = read_handler(v, p->addr, p->size, &tmp1); + if ( rc && p->dir == IOREQ_WRITE ) { tmp2 = tmp1 & (unsigned long) p->data; - write_handler(v, p->addr, p->size, tmp2); + rc = write_handler(v, p->addr, p->size, tmp2); } p->data = tmp1; break; case IOREQ_TYPE_ADD: - tmp1 = read_handler(v, p->addr, p->size); - if (p->dir == IOREQ_WRITE) { + rc = read_handler(v, p->addr, p->size, &tmp1); + if ( rc && p->dir == IOREQ_WRITE) { tmp2 = tmp1 + (unsigned long) p->data; - write_handler(v, p->addr, p->size, tmp2); + rc = write_handler(v, p->addr, p->size, tmp2); } p->data = tmp1; break; case IOREQ_TYPE_OR: - tmp1 = read_handler(v, p->addr, p->size); - if ( p->dir == IOREQ_WRITE ) { + rc = read_handler(v, p->addr, p->size, &tmp1); + if ( rc && p->dir == IOREQ_WRITE ) { tmp2 = tmp1 | (unsigned long) p->data; - write_handler(v, p->addr, p->size, tmp2); + rc = write_handler(v, p->addr, p->size, tmp2); } p->data = tmp1; break; case IOREQ_TYPE_XOR: - tmp1 = read_handler(v, p->addr, p->size); - if ( p->dir == IOREQ_WRITE ) { + rc = read_handler(v, p->addr, p->size, &tmp1); + if ( rc && p->dir == IOREQ_WRITE ) { tmp2 = tmp1 ^ (unsigned long) p->data; - write_handler(v, p->addr, p->size, tmp2); + rc = write_handler(v, p->addr, p->size, tmp2); } p->data = tmp1; break; @@ -143,25 +148,29 @@ static inline void hvm_mmio_access(struc * Note that we don't need to be atomic here since VCPU is accessing * its own local APIC. 
*/ - tmp1 = read_handler(v, p->addr, p->size); - write_handler(v, p->addr, p->size, (unsigned long) p->data); + rc = read_handler(v, p->addr, p->size, &tmp1); + if ( rc ) + rc = write_handler(v, p->addr, p->size, (unsigned long) p->data); p->data = tmp1; break; case IOREQ_TYPE_SUB: - tmp1 = read_handler(v, p->addr, p->size); - if ( p->dir == IOREQ_WRITE ) { + rc = read_handler(v, p->addr, p->size, &tmp1); + if ( rc && p->dir == IOREQ_WRITE ) { tmp2 = tmp1 - (unsigned long) p->data; - write_handler(v, p->addr, p->size, tmp2); + rc = write_handler(v, p->addr, p->size, tmp2); } p->data = tmp1; break; default: + rc = 0; printk("hvm_mmio_access: error ioreq type %x\n", p->type); domain_crash_synchronous(); break; } + + return rc; } int hvm_buffered_io_send(ioreq_t *p) @@ -218,15 +227,11 @@ int hvm_mmio_intercept(ioreq_t *p) int i; for ( i = 0; i < HVM_MMIO_HANDLER_NR; i++ ) - { if ( hvm_mmio_handlers[i]->check_handler(v, p->addr) ) - { - hvm_mmio_access(v, p, - hvm_mmio_handlers[i]->read_handler, - hvm_mmio_handlers[i]->write_handler); - return 1; - } - } + return hvm_mmio_access( + v, p, + hvm_mmio_handlers[i]->read_handler, + hvm_mmio_handlers[i]->write_handler); return 0; } @@ -243,6 +248,9 @@ int hvm_io_intercept(ioreq_t *p, int typ int i; unsigned long addr, size; + if ( (type == HVM_PORTIO) && (dpci_ioport_intercept(p)) ) + return 1; + for (i = 0; i < handler->num_slot; i++) { if( type != handler->hdl_list[i].type) continue; diff -Naurp xen/arch/x86/hvm/io.c xen-redhat/arch/x86/hvm/io.c --- xen/arch/x86/hvm/io.c +++ xen-redhat/arch/x86/hvm/io.c @@ -43,6 +43,7 @@ #include <public/sched.h> #include <public/hvm/ioreq.h> +#include <xen/iocap.h> #if defined (__i386__) static void set_reg_value (int size, int index, int seg, struct cpu_user_regs *regs, long value) @@ -873,6 +874,108 @@ void hvm_io_assist(void) vcpu_end_shutdown_deferral(v); } +void dpci_ioport_read(uint32_t mport, ioreq_t *p) +{ + int i, sign = p->df ? -1 : 1; + uint32_t data = 0; + + for ( i = 0; i < p->count; i++ ) + { + switch ( p->size ) + { + case 1: + data = inb(mport); + break; + case 2: + data = inw(mport); + break; + case 4: + data = inl(mport); + break; + default: + BUG(); + } + + if ( p->data_is_ptr ) + (void)hvm_copy_to_guest_phys( + p->data + (sign * i * p->size), &data, p->size); + else + p->data = data; + } +} + +void dpci_ioport_write(uint32_t mport, ioreq_t *p) +{ + int i, sign = p->df ? 
-1 : 1; + uint32_t data; + + for ( i = 0; i < p->count; i++ ) + { + data = p->data; + if ( p->data_is_ptr ) + (void)hvm_copy_from_guest_phys( + &data, p->data + (sign * i * p->size), p->size); + + switch ( p->size ) + { + case 1: + outb(data, mport); + break; + case 2: + outw(data, mport); + break; + case 4: + outl(data, mport); + break; + default: + BUG(); + } + } +} + +int dpci_ioport_intercept(ioreq_t *p) +{ + struct domain *d = current->domain; + struct hvm_iommu *hd = domain_hvm_iommu(d); + struct g2m_ioport *g2m_ioport; + unsigned int mport, gport = p->addr; + unsigned int s = 0, e = 0; + + list_for_each_entry( g2m_ioport, &hd->g2m_ioport_list, list ) + { + s = g2m_ioport->gport; + e = s + g2m_ioport->np; + if ( (gport >= s) && (gport < e) ) + goto found; + } + + return 0; + + found: + mport = (gport - s) + g2m_ioport->mport; + + if ( !ioports_access_permitted(d, mport, mport + p->size - 1) ) + { + gdprintk(XENLOG_ERR, "Error: access to gport=0x%x denied!\n", + (uint32_t)p->addr); + return 0; + } + + switch ( p->dir ) + { + case IOREQ_READ: + dpci_ioport_read(mport, p); + break; + case IOREQ_WRITE: + dpci_ioport_write(mport, p); + break; + default: + gdprintk(XENLOG_ERR, "Error: couldn't handle p->dir = %d", p->dir); + } + + return 1; +} + /* * Local variables: * mode: C diff -Naurp xen/arch/x86/hvm/irq.c xen-redhat/arch/x86/hvm/irq.c --- xen/arch/x86/hvm/irq.c +++ xen-redhat/arch/x86/hvm/irq.c @@ -125,17 +125,13 @@ void hvm_isa_irq_deassert( spin_unlock(&d->arch.hvm_domain.irq_lock); } -void hvm_set_callback_irq_level(void) +static void hvm_set_callback_irq_level(struct vcpu *v) { - struct vcpu *v = current; struct domain *d = v->domain; struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; unsigned int gsi, pdev, pintx, asserted; - /* Fast lock-free tests. */ - if ( (v->vcpu_id != 0) || - (hvm_irq->callback_via_type == HVMIRQ_callback_none) ) - return; + ASSERT(v->vcpu_id == 0); spin_lock(&d->arch.hvm_domain.irq_lock); @@ -177,6 +173,22 @@ void hvm_set_callback_irq_level(void) spin_unlock(&d->arch.hvm_domain.irq_lock); } +void hvm_maybe_deassert_evtchn_irq(void) +{ + struct domain *d = current->domain; + struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; + + if ( hvm_irq->callback_via_asserted && + !vcpu_info(d->vcpu[0], evtchn_upcall_pending) ) + hvm_set_callback_irq_level(d->vcpu[0]); +} + +void hvm_assert_evtchn_irq(struct vcpu *v) +{ + if ( v->vcpu_id == 0 ) + hvm_set_callback_irq_level(v); +} + void hvm_set_pci_link_route(struct domain *d, u8 link, u8 isa_irq) { struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; @@ -285,71 +297,69 @@ void hvm_set_callback_via(struct domain } } -int cpu_has_pending_irq(struct vcpu *v) +enum hvm_intack hvm_vcpu_has_pending_irq(struct vcpu *v) { struct hvm_domain *plat = &v->domain->arch.hvm_domain; - /* APIC */ + if ( unlikely(v->arch.hvm_vcpu.nmi_pending) ) + return hvm_intack_nmi; + if ( vlapic_has_interrupt(v) != -1 ) - return 1; + return hvm_intack_lapic; - /* PIC */ if ( !vlapic_accept_pic_intr(v) ) - return 0; + return hvm_intack_none; - return plat->vpic[0].int_output; + return plat->vpic[0].int_output ? 
hvm_intack_pic : hvm_intack_none; } -int cpu_get_interrupt(struct vcpu *v, int *type) +int hvm_vcpu_ack_pending_irq(struct vcpu *v, enum hvm_intack type, int *vector) { - int vector; - - if ( (vector = cpu_get_apic_interrupt(v, type)) != -1 ) - return vector; - - if ( (v->vcpu_id == 0) && - ((vector = cpu_get_pic_interrupt(v, type)) != -1) ) - return vector; + switch ( type ) + { + case hvm_intack_nmi: + return test_and_clear_bool(v->arch.hvm_vcpu.nmi_pending); + case hvm_intack_lapic: + return ((*vector = cpu_get_apic_interrupt(v)) != -1); + case hvm_intack_pic: + ASSERT(v->vcpu_id == 0); + return ((*vector = cpu_get_pic_interrupt(v)) != -1); + default: + break; + } - return -1; + return 0; } -int get_isa_irq_vector(struct vcpu *v, int isa_irq, int type) +int get_isa_irq_vector(struct vcpu *v, int isa_irq, enum hvm_intack src) { unsigned int gsi = hvm_isa_irq_to_gsi(isa_irq); - if ( type == APIC_DM_EXTINT ) + if ( src == hvm_intack_pic ) return (v->domain->arch.hvm_domain.vpic[isa_irq >> 3].irq_base + (isa_irq & 7)); + ASSERT(src == hvm_intack_lapic); return domain_vioapic(v->domain)->redirtbl[gsi].fields.vector; } int is_isa_irq_masked(struct vcpu *v, int isa_irq) { unsigned int gsi = hvm_isa_irq_to_gsi(isa_irq); + uint8_t pic_imr = v->domain->arch.hvm_domain.vpic[isa_irq >> 3].imr; - if ( is_lvtt(v, isa_irq) ) - return !is_lvtt_enabled(v); - - return ((v->domain->arch.hvm_domain.vpic[isa_irq >> 3].imr & - (1 << (isa_irq & 7))) && + return (((pic_imr & (1 << (isa_irq & 7))) || !vlapic_accept_pic_intr(v)) && domain_vioapic(v->domain)->redirtbl[gsi].fields.mask); } -/* - * TODO: 1. Should not need special treatment of event-channel events. - * 2. Should take notice of interrupt shadows (or clear them). - */ int hvm_local_events_need_delivery(struct vcpu *v) { - int pending; - - pending = (vcpu_info(v, evtchn_upcall_pending) || cpu_has_pending_irq(v)); - if ( unlikely(pending) ) - pending = hvm_interrupts_enabled(v); + enum hvm_intack type = hvm_vcpu_has_pending_irq(v); + + if ( likely(type == hvm_intack_none) ) + return 0; - return pending; + return hvm_interrupts_enabled(v, type); } #if 0 /* Keep for debugging */ @@ -388,9 +398,33 @@ static void irq_dump(struct domain *d) static int irq_save_pci(struct domain *d, hvm_domain_context_t *h) { struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; + unsigned int asserted, pdev, pintx; + int rc; + + spin_lock(&d->arch.hvm_domain.irq_lock); + + pdev = hvm_irq->callback_via.pci.dev; + pintx = hvm_irq->callback_via.pci.intx; + asserted = (hvm_irq->callback_via_asserted && + (hvm_irq->callback_via_type == HVMIRQ_callback_pci_intx)); + + /* + * Deassert virtual interrupt via PCI INTx line. The virtual interrupt + * status is not save/restored, so the INTx line must be deasserted in + * the restore context. 
+ */ + if ( asserted ) + __hvm_pci_intx_deassert(d, pdev, pintx); /* Save PCI IRQ lines */ - return ( hvm_save_entry(PCI_IRQ, 0, h, &hvm_irq->pci_intx) ); + rc = hvm_save_entry(PCI_IRQ, 0, h, &hvm_irq->pci_intx); + + if ( asserted ) + __hvm_pci_intx_assert(d, pdev, pintx); + + spin_unlock(&d->arch.hvm_domain.irq_lock); + + return rc; } static int irq_save_isa(struct domain *d, hvm_domain_context_t *h) diff -Naurp xen/arch/x86/hvm/Makefile xen-redhat/arch/x86/hvm/Makefile --- xen/arch/x86/hvm/Makefile +++ xen-redhat/arch/x86/hvm/Makefile @@ -16,3 +16,4 @@ obj-y += vioapic.o obj-y += vlapic.o obj-y += vpic.o obj-y += save.o +obj-y += vmsi.o diff -Naurp xen/arch/x86/hvm/platform.c xen-redhat/arch/x86/hvm/platform.c --- xen/arch/x86/hvm/platform.c +++ xen-redhat/arch/x86/hvm/platform.c @@ -423,6 +423,17 @@ static int mmio_decode(int address_bytes GET_OP_SIZE_FOR_BYTE(size_reg); return reg_mem(size_reg, opcode, mmio_op, rex); + case 0x01: /* add r32/16, m32/16 */ + mmio_op->instr = INSTR_ADD; + GET_OP_SIZE_FOR_NONEBYTE(*op_size); + return reg_mem(*op_size, opcode, mmio_op, rex); + + case 0x02: /* add m8, r8 */ + mmio_op->instr = INSTR_ADD; + *op_size = BYTE; + GET_OP_SIZE_FOR_BYTE(size_reg); + return mem_reg(size_reg, opcode, mmio_op, rex); + case 0x03: /* add m32/16, r32/16 */ mmio_op->instr = INSTR_ADD; GET_OP_SIZE_FOR_NONEBYTE(*op_size); @@ -472,6 +483,23 @@ static int mmio_decode(int address_bytes GET_OP_SIZE_FOR_NONEBYTE(*op_size); return mem_reg(*op_size, opcode, mmio_op, rex); + case 0x28: /* sub r8, m8 */ + mmio_op->instr = INSTR_SUB; + *op_size = BYTE; + GET_OP_SIZE_FOR_BYTE(size_reg); + return reg_mem(size_reg, opcode, mmio_op, rex); + + case 0x29: /* sub r32/16, m32/16 */ + mmio_op->instr = INSTR_SUB; + GET_OP_SIZE_FOR_NONEBYTE(*op_size); + return reg_mem(*op_size, opcode, mmio_op, rex); + + case 0x2A: /* sub m8, r8 */ + mmio_op->instr = INSTR_SUB; + *op_size = BYTE; + GET_OP_SIZE_FOR_BYTE(size_reg); + return mem_reg(size_reg, opcode, mmio_op, rex); + case 0x2B: /* sub m32/16, r32/16 */ mmio_op->instr = INSTR_SUB; GET_OP_SIZE_FOR_NONEBYTE(*op_size); @@ -494,6 +522,11 @@ static int mmio_decode(int address_bytes GET_OP_SIZE_FOR_BYTE(size_reg); return mem_reg(size_reg, opcode, mmio_op, rex); + case 0x33: /* xor m16/32, r16/32 */ + mmio_op->instr = INSTR_XOR; + GET_OP_SIZE_FOR_NONEBYTE(*op_size); + return mem_reg(*op_size, opcode, mmio_op, rex); + case 0x38: /* cmp r8, m8 */ mmio_op->instr = INSTR_CMP; *op_size = BYTE; @@ -1057,7 +1090,9 @@ void handle_mmio(unsigned long gpa) for ( i = 0; i < inst_len; i++ ) printk(" %02x", inst[i] & 0xFF); printk("\n"); - domain_crash_synchronous(); + + hvm_inject_exception(TRAP_invalid_op, -1, 0); + return; } regs->eip += inst_len; /* advance %eip */ diff -Naurp xen/arch/x86/hvm/pmtimer.c xen-redhat/arch/x86/hvm/pmtimer.c --- xen/arch/x86/hvm/pmtimer.c +++ xen-redhat/arch/x86/hvm/pmtimer.c @@ -65,14 +65,16 @@ static void pmt_update_sci(PMTState *s) * since the last time we did that. 
*/ static void pmt_update_time(PMTState *s) { - uint64_t curr_gtime; + uint64_t curr_gtime, tmp; uint32_t msb = s->pm.tmr_val & TMR_VAL_MSB; ASSERT(spin_is_locked(&s->lock)); /* Update the timer */ curr_gtime = hvm_get_guest_time(s->vcpu); - s->pm.tmr_val += ((curr_gtime - s->last_gtime) * s->scale) >> 32; + tmp = ((curr_gtime - s->last_gtime) * s->scale) + s->not_accounted; + s->not_accounted = (uint32_t)tmp; + s->pm.tmr_val += tmp >> 32; s->pm.tmr_val &= TMR_VAL_MASK; s->last_gtime = curr_gtime; @@ -238,6 +240,7 @@ static int pmtimer_load(struct domain *d /* Calculate future counter values from now. */ s->last_gtime = hvm_get_guest_time(s->vcpu); + s->not_accounted = 0; /* Set the SCI state from the registers */ pmt_update_sci(s); @@ -256,7 +259,8 @@ void pmtimer_init(struct vcpu *v) spin_lock_init(&s->lock); - s->scale = ((uint64_t)FREQUENCE_PMTIMER << 32) / ticks_per_sec(v); + s->scale = ((uint64_t)FREQUENCE_PMTIMER << 32) / SYSTEM_TIME_HZ; + s->not_accounted = 0; s->vcpu = v; /* Intercept port I/O (need two handlers because PM1a_CNT is between diff -Naurp xen/arch/x86/hvm/rtc.c xen-redhat/arch/x86/hvm/rtc.c --- xen/arch/x86/hvm/rtc.c +++ xen-redhat/arch/x86/hvm/rtc.c @@ -32,7 +32,7 @@ #define vcpu_vrtc(vcpu) (domain_vrtc((vcpu)->domain)) #define vrtc_domain(rtc) (container_of((rtc), struct domain, \ arch.hvm_domain.pl_time.vrtc)) -#define vrtc_vcpu(rtc) (vrtc_domain(rtc)->vcpu[0]) +#define vrtc_vcpu(rtc) (pt_global_vcpu_target(vrtc_domain(rtc))) static void rtc_periodic_cb(struct vcpu *v, void *opaque) { @@ -42,14 +42,6 @@ static void rtc_periodic_cb(struct vcpu spin_unlock(&s->lock); } -int is_rtc_periodic_irq(void *opaque) -{ - RTCState *s = opaque; - - return !(s->hw.cmos_data[RTC_REG_C] & RTC_AF || - s->hw.cmos_data[RTC_REG_C] & RTC_UF); -} - /* Enable/configure/disable the periodic timer based on the RTC_PIE and * RTC_RATE_SELECT settings */ static void rtc_timer_update(RTCState *s) @@ -489,6 +481,8 @@ void rtc_init(struct vcpu *v, int base) spin_lock_init(&s->lock); + s->pt.source = PTSRC_isa; + s->hw.cmos_data[RTC_REG_A] = RTC_REF_CLCK_32KHZ | 6; /* ~1kHz */ s->hw.cmos_data[RTC_REG_B] = RTC_24H; s->hw.cmos_data[RTC_REG_C] = 0; diff -Naurp xen/arch/x86/hvm/save.c xen-redhat/arch/x86/hvm/save.c --- xen/arch/x86/hvm/save.c +++ xen-redhat/arch/x86/hvm/save.c @@ -23,6 +23,8 @@ #include <xen/version.h> #include <public/version.h> #include <xen/sched.h> +#include <xen/guest_access.h> + #include <asm/hvm/hvm.h> #include <asm/hvm/support.h> #include <asm/hvm/domain.h> @@ -74,6 +76,53 @@ size_t hvm_save_size(struct domain *d) return sz; } +/* Extract a single instance of a save record, by marshalling all + * records of that type and copying out the one we need. 
*/ +int hvm_save_one(struct domain *d, uint16_t typecode, uint16_t instance, + XEN_GUEST_HANDLE_64(uint8_t) handle) +{ + int rv = 0; + size_t sz = 0; + struct vcpu *v; + hvm_domain_context_t ctxt = { 0, }; + + if ( d->is_dying + || typecode > HVM_SAVE_CODE_MAX + || hvm_sr_handlers[typecode].size < sizeof(struct hvm_save_descriptor) + || hvm_sr_handlers[typecode].save == NULL ) + return -EINVAL; + + if ( hvm_sr_handlers[typecode].kind == HVMSR_PER_VCPU ) + for_each_vcpu(d, v) + sz += hvm_sr_handlers[typecode].size; + else + sz = hvm_sr_handlers[typecode].size; + + if ( (instance + 1) * hvm_sr_handlers[typecode].size > sz ) + return -EINVAL; + + ctxt.size = sz; + ctxt.data = xmalloc_bytes(sz); + if ( !ctxt.data ) + return -ENOMEM; + + if ( hvm_sr_handlers[typecode].save(d, &ctxt) != 0 ) + { + gdprintk(XENLOG_ERR, + "HVM save: failed to save type %"PRIu16"\n", typecode); + rv = -EFAULT; + } + else if ( copy_to_guest(handle, + ctxt.data + + (instance * hvm_sr_handlers[typecode].size) + + sizeof (struct hvm_save_descriptor), + hvm_sr_handlers[typecode].size + - sizeof (struct hvm_save_descriptor)) ) + rv = -EFAULT; + + xfree(ctxt.data); + return rv; +} int hvm_save(struct domain *d, hvm_domain_context_t *h) { @@ -91,6 +140,9 @@ int hvm_save(struct domain *d, hvm_domai cpuid(1, &eax, &ebx, &ecx, &edx); hdr.cpuid = eax; + cpuid(0, &eax, &ebx, &ecx, &edx); + hdr.pad0 = ecx; + /* Save xen changeset */ c = strrchr(xen_changeset(), ':'); if ( c ) @@ -98,8 +150,6 @@ int hvm_save(struct domain *d, hvm_domai else hdr.changeset = -1ULL; /* Unknown */ - hdr.pad0 = 0; - if ( hvm_save_entry(HEADER, 0, h, &hdr) != 0 ) { gdprintk(XENLOG_ERR, "HVM save: failed to write header\n"); @@ -161,6 +211,14 @@ int hvm_load(struct domain *d, hvm_domai return -1; } + cpuid(0, &eax, &ebx, &ecx, &edx); + if (hdr.pad0 != 0 && hdr.pad0 != ecx) { + gdprintk(XENLOG_ERR, + "HVM restore: unsupported cross-vendor migration (saved = " + "%#"PRIx32", host = %#"PRIx32")\n", hdr.pad0, ecx); + return -1; + } + cpuid(1, &eax, &ebx, &ecx, &edx); /*TODO: need to define how big a difference is acceptable */ if (hdr.cpuid != eax) diff -Naurp xen/arch/x86/hvm/svm/asid.c xen-redhat/arch/x86/hvm/svm/asid.c --- xen/arch/x86/hvm/svm/asid.c +++ xen-redhat/arch/x86/hvm/svm/asid.c @@ -78,26 +78,25 @@ static struct svm_asid_data *svm_asid_co */ void svm_asid_init(struct cpuinfo_x86 *c) { - int nasids; + int nasids = 0; struct svm_asid_data *data = svm_asid_core_data(); - /* Find #ASID. */ - nasids = cpuid_ebx(0x8000000A); - data->max_asid = nasids - 1; - /* Check if we can use ASIDs. */ data->erratum170 = - !((c->x86 == 0x10) || + !((c->x86 >= 0x10) || ((c->x86 == 0xf) && (c->x86_model >= 0x68) && (c->x86_mask >= 1))); - printk("AMD SVM: ASIDs %s \n", - (data->erratum170 ? "disabled." : "enabled.")); + if (!data->erratum170 ) + nasids = cpuid_ebx(0x8000000A); + + data->max_asid = nasids - 1; + printk("AMD SVM: ASIDS %s\n", (nasids ? "enabled." : "disabled.")); /* Initialize ASID assigment. */ - if ( data->erratum170 ) + if ( nasids == 0 ) { - /* On errata #170, VCPUs and phys processors should have same - generation. We set both to invalid. */ + /* In this case, VCPUs and phys processors should have same + * generation. We set both to invalid. 
*/ data->core_asid_generation = SVM_ASID_INVALID_GENERATION; } else diff -Naurp xen/arch/x86/hvm/svm/emulate.c xen-redhat/arch/x86/hvm/svm/emulate.c --- xen/arch/x86/hvm/svm/emulate.c +++ xen-redhat/arch/x86/hvm/svm/emulate.c @@ -412,13 +412,10 @@ static const u8 *opc_bytes[INSTR_MAX_COU /* * Intel has a vmcs entry to give the instruction length. AMD doesn't. So we * have to do a little bit of work to find out... - * - * The caller can either pass a NULL pointer to the guest_eip_buf, or a pointer - * to enough bytes to satisfy the instruction including prefix bytes. */ int __get_instruction_length_from_list(struct vcpu *v, enum instruction_index *list, unsigned int list_count, - u8 *guest_eip_buf, enum instruction_index *match) + enum instruction_index *match) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; unsigned int inst_len = 0; @@ -426,19 +423,13 @@ int __get_instruction_length_from_list(s unsigned int j; int found = 0; enum instruction_index instr = 0; - u8 buffer[MAX_INST_LEN]; - u8 *buf; + int valid_inst_len; + u8 buf[MAX_INST_LEN]; const u8 *opcode = NULL; - if (guest_eip_buf) - { - buf = guest_eip_buf; - } - else - { - inst_copy_from_guest(buffer, svm_rip2pointer(v), MAX_INST_LEN); - buf = buffer; - } + /* hvm_copy_from_guest_virt returns the number of *unread* bytes. */ + valid_inst_len = MAX_INST_LEN - + hvm_copy_from_guest_virt(buf, svm_rip2pointer(v), MAX_INST_LEN); for (j = 0; j < list_count; j++) { @@ -446,14 +437,16 @@ int __get_instruction_length_from_list(s opcode = opc_bytes[instr]; ASSERT(opcode); - while (inst_len < MAX_INST_LEN && + while (inst_len < valid_inst_len && is_prefix(buf[inst_len]) && !is_prefix(opcode[1])) inst_len++; ASSERT(opcode[0] <= 15); /* Make sure the table is correct. */ - found = 1; + if (inst_len + opcode[0] > valid_inst_len) + continue; + found = 1; for (i = 0; i < opcode[0]; i++) { /* If the last byte is zero, we just accept it without checking */ @@ -476,7 +469,7 @@ int __get_instruction_length_from_list(s { inst_len += opcode[0]; - ASSERT(inst_len <= MAX_INST_LEN); + ASSERT(inst_len <= valid_inst_len); if (match) *match = instr; @@ -484,8 +477,9 @@ int __get_instruction_length_from_list(s return inst_len; } - printk("%s: Mismatch between expected and actual instruction bytes: " - "eip = %lx\n", __func__, (unsigned long)vmcb->rip); + gdprintk(XENLOG_WARNING, + "%s: Mismatch between expected and actual instruction bytes: " + "eip = %lx\n", __func__, (unsigned long)vmcb->rip); return 0; } diff -Naurp xen/arch/x86/hvm/svm/intr.c xen-redhat/arch/x86/hvm/svm/intr.c --- xen/arch/x86/hvm/svm/intr.c +++ xen-redhat/arch/x86/hvm/svm/intr.c @@ -31,6 +31,7 @@ #include <asm/hvm/hvm.h> #include <asm/hvm/io.h> #include <asm/hvm/support.h> +#include <asm/hvm/vlapic.h> #include <asm/hvm/svm/svm.h> #include <asm/hvm/svm/intr.h> #include <xen/event.h> @@ -39,100 +40,144 @@ #include <xen/domain_page.h> #include <asm/hvm/trace.h> -/* - * Most of this code is copied from vmx_io.c and modified - * to be suitable for SVM. 
- */ - -static inline int svm_inject_extint(struct vcpu *v, int trap) +static void svm_inject_dummy_vintr(struct vcpu *v) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; vintr_t intr = vmcb->vintr; - /* Update only relevant fields */ intr.fields.irq = 1; intr.fields.intr_masking = 1; - intr.fields.vector = trap; + intr.fields.vector = 0; intr.fields.prio = 0xF; intr.fields.ign_tpr = 1; vmcb->vintr = intr; - - return 0; } -asmlinkage void svm_intr_assist(void) +static void svm_inject_nmi(struct vcpu *v) { - struct vcpu *v = current; struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; - int intr_type = APIC_DM_EXTINT; - int intr_vector = -1; + eventinj_t event; - /* - * Previous Interrupt delivery caused this intercept? - * This will happen if the injection is latched by the processor (hence - * clearing vintr.fields.irq) but then subsequently a fault occurs (e.g., - * due to lack of shadow mapping of guest IDT or guest-kernel stack). - * - * NB. Exceptions that fault during delivery are lost. This needs to be - * fixed but we'll usually get away with it since faults are usually - * idempotent. But this isn't the case for e.g. software interrupts! - */ - if ( vmcb->exitintinfo.fields.v && (vmcb->exitintinfo.fields.type == 0) ) - { - intr_vector = vmcb->exitintinfo.fields.vector; - vmcb->exitintinfo.bytes = 0; - HVMTRACE_1D(REINJ_VIRQ, v, intr_vector); - svm_inject_extint(v, intr_vector); + event.bytes = 0; + event.fields.v = 1; + event.fields.type = EVENTTYPE_NMI; + event.fields.vector = 2; + + ASSERT(vmcb->eventinj.fields.v == 0); + vmcb->eventinj = event; +} + +static void svm_inject_extint(struct vcpu *v, int vector) +{ + struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; + eventinj_t event; + + event.bytes = 0; + event.fields.v = 1; + event.fields.type = EVENTTYPE_INTR; + event.fields.vector = vector; + + ASSERT(vmcb->eventinj.fields.v == 0); + vmcb->eventinj = event; +} + +static void update_cr8_intercept( + struct vcpu *v) +{ + struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; + struct vlapic *vlapic = vcpu_vlapic(v); + int max_irr; + + vmcb->cr_intercepts &= ~CR_INTERCEPT_CR8_WRITE; + + /* Is there an interrupt pending at the LAPIC? Nothing to do if not. */ + if ( !vlapic_enabled(vlapic) || + ((max_irr = vlapic_find_highest_irr(vlapic)) == -1) ) return; - } + + /* Highest-priority pending interrupt is masked by the TPR? */ + if ( (vmcb->vintr.fields.tpr & 0xf) >= (max_irr >> 4) ) + vmcb->cr_intercepts |= CR_INTERCEPT_CR8_WRITE; +} + +static void enable_intr_window(struct vcpu *v, enum hvm_intack intr_source) +{ + struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; + + ASSERT(intr_source != hvm_intack_none); /* - * Previous interrupt still pending? This occurs if we return from VMRUN - * very early in the entry-to-guest process. Usually this is because an - * external physical interrupt was pending when we executed VMRUN. + * Create a dummy virtual interrupt to intercept as soon as the + * guest can accept the real interrupt. + * + * TODO: Better NMI handling. We need a way to skip a MOV SS interrupt + * shadow. This is hard to do without hardware support. We should also + * track 'NMI blocking' from NMI injection until IRET. This can be done + * quite easily in software by intercepting the unblocking IRET. 
*/ - if ( vmcb->vintr.fields.irq ) - return; + vmcb->general1_intercepts |= GENERAL1_INTERCEPT_VINTR; + HVMTRACE_2D(INJ_VIRQ, v, 0x0, /*fake=*/ 1); + svm_inject_dummy_vintr(v); +} + +asmlinkage void svm_intr_assist(void) +{ + struct vcpu *v = current; + struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; + enum hvm_intack intr_source; + int intr_vector; /* Crank the handle on interrupt state and check for new interrrupts. */ pt_update_irq(v); - hvm_set_callback_irq_level(); - if ( !cpu_has_pending_irq(v) ) - return; + hvm_dirq_assist(v); - /* - * If the guest can't take an interrupt right now, create a 'fake' - * virtual interrupt on to intercept as soon as the guest _can_ take - * interrupts. Do not obtain the next interrupt from the vlapic/pic - * if unable to inject. - * - * Also do this if there is an exception pending. This is because - * the delivery of the exception can arbitrarily delay the injection - * of the vintr (for example, if the exception is handled via an - * interrupt gate, hence zeroing RFLAGS.IF). In the meantime: - * - the vTPR could be modified upwards, so we need to wait until the - * exception is delivered before we can safely decide that an - * interrupt is deliverable; and - * - the guest might look at the APIC/PIC state, so we ought not to have - * cleared the interrupt out of the IRR. - */ - if ( irq_masked(vmcb->rflags) || vmcb->interrupt_shadow - || vmcb->eventinj.fields.v ) + do { + intr_source = hvm_vcpu_has_pending_irq(v); + if ( likely(intr_source == hvm_intack_none) ) + goto out; + + /* + * Pending IRQs must be delayed if: + * 1. An event is already pending. This is despite the fact that SVM + * provides a VINTR delivery method quite separate from the EVENTINJ + * mechanism. The event delivery can arbitrarily delay the injection + * of the vintr (for example, if the exception is handled via an + * interrupt gate, hence zeroing RFLAGS.IF). In the meantime: + * - the vTPR could be modified upwards, so we need to wait until + * the exception is delivered before we can safely decide that an + * interrupt is deliverable; and + * - the guest might look at the APIC/PIC state, so we ought not to + * have cleared the interrupt out of the IRR. + * 2. The IRQ is masked. + */ + if ( unlikely(vmcb->eventinj.fields.v) || + !hvm_interrupts_enabled(v, intr_source) ) + { + enable_intr_window(v, intr_source); + return; + } + } while ( !hvm_vcpu_ack_pending_irq(v, intr_source, &intr_vector) ); + + if ( intr_source == hvm_intack_nmi ) { - vmcb->general1_intercepts |= GENERAL1_INTERCEPT_VINTR; - HVMTRACE_2D(INJ_VIRQ, v, 0x0, /*fake=*/ 1); - svm_inject_extint(v, 0x0); /* actual vector doesn't matter */ - return; + svm_inject_nmi(v); + } + else + { + HVMTRACE_2D(INJ_VIRQ, v, intr_vector, /*fake=*/ 0); + svm_inject_extint(v, intr_vector); + pt_intr_post(v, intr_vector, intr_source); } - /* Okay, we can deliver the interrupt: grab it and update PIC state. */ - intr_vector = cpu_get_interrupt(v, &intr_type); - BUG_ON(intr_vector < 0); - - HVMTRACE_2D(INJ_VIRQ, v, intr_vector, /*fake=*/ 0); - svm_inject_extint(v, intr_vector); + /* Is there another IRQ to queue up behind this one? 
*/ + intr_source = hvm_vcpu_has_pending_irq(v); + if ( unlikely(intr_source != hvm_intack_none) ) { + enable_intr_window(v, intr_source); + return; + } - pt_intr_post(v, intr_vector, intr_type); + out: + update_cr8_intercept(v); } /* diff -Naurp xen/arch/x86/hvm/svm/svm.c xen-redhat/arch/x86/hvm/svm/svm.c --- xen/arch/x86/hvm/svm/svm.c +++ xen-redhat/arch/x86/hvm/svm/svm.c @@ -49,12 +49,11 @@ #include <asm/hvm/vpt.h> #include <asm/hvm/trace.h> #include <asm/hap.h> +#include <asm/debugger.h> #define set_segment_register(name, value) \ asm volatile ( "movw %%ax ,%%" STR(name) "" : : "a" (value) ) -int inst_copy_from_guest(unsigned char *buf, unsigned long guest_eip, - int inst_len); asmlinkage void do_IRQ(struct cpu_user_regs *); static int svm_reset_to_realmode(struct vcpu *v, @@ -66,9 +65,6 @@ static void *hsa[NR_CPUS] __read_mostly; /* vmcb used for extended host state */ static void *root_vmcb[NR_CPUS] __read_mostly; -/* hardware assisted paging bits */ -extern int opt_hap_enabled; - static void svm_inject_exception(struct vcpu *v, int trap, int ev, int error_code) { @@ -87,8 +83,6 @@ static void svm_inject_exception(struct event.fields.ev = ev; event.fields.errorcode = error_code; - ASSERT(vmcb->eventinj.fields.v == 0); - vmcb->eventinj = event; } @@ -374,40 +368,15 @@ int svm_vmcb_save(struct vcpu *v, struct c->sysenter_esp = vmcb->sysenter_esp; c->sysenter_eip = vmcb->sysenter_eip; - /* Save any event/interrupt that was being injected when we last - * exited. Although there are three(!) VMCB fields that can contain - * active events, we only need to save at most one: because the - * intr_assist logic never delivers an IRQ when any other event is - * active, we know that the only possible collision is if we inject - * a fault while exitintinfo contains a valid event (the delivery of - * which caused the last exit). In that case replaying just the - * first event should cause the same behaviour when we restore. 
*/ - if ( vmcb->vintr.fields.irq - && /* Check it's not a fake interrupt (see svm_intr_assist()) */ - !(vmcb->general1_intercepts & GENERAL1_INTERCEPT_VINTR) ) - { - c->pending_vector = vmcb->vintr.fields.vector; - c->pending_type = 0; /* External interrupt */ - c->pending_error_valid = 0; - c->pending_reserved = 0; - c->pending_valid = 1; - c->error_code = 0; - } - else if ( vmcb->exitintinfo.fields.v ) + c->pending_event = 0; + c->error_code = 0; + if ( vmcb->eventinj.fields.v && + svm_event_needs_reinjection(vmcb->eventinj.fields.type, + vmcb->eventinj.fields.vector) ) { - c->pending_event = vmcb->exitintinfo.bytes & 0xffffffff; - c->error_code = vmcb->exitintinfo.fields.errorcode; - } - else if ( vmcb->eventinj.fields.v ) - { - c->pending_event = vmcb->eventinj.bytes & 0xffffffff; + c->pending_event = (uint32_t)vmcb->eventinj.bytes; c->error_code = vmcb->eventinj.fields.errorcode; } - else - { - c->pending_event = 0; - c->error_code = 0; - } return 1; } @@ -541,26 +510,23 @@ int svm_vmcb_restore(struct vcpu *v, str gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n", c->pending_event, c->error_code); - /* VMX uses a different type for #OF and #BP; fold into "Exception" */ - if ( c->pending_type == 6 ) - c->pending_type = 3; - /* Sanity check */ - if ( c->pending_type == 1 || c->pending_type > 4 - || c->pending_reserved != 0 ) + if ( (c->pending_type == 1) || (c->pending_type > 6) || + (c->pending_reserved != 0) ) { gdprintk(XENLOG_ERR, "Invalid pending event 0x%"PRIx32"\n", c->pending_event); return -EINVAL; } - /* Put this pending event in exitintinfo and svm_intr_assist() - * will reinject it when we return to the guest. */ - vmcb->exitintinfo.bytes = c->pending_event; - vmcb->exitintinfo.fields.errorcode = c->error_code; + + if ( svm_event_needs_reinjection(c->pending_type, c->pending_vector) ) + { + vmcb->eventinj.bytes = c->pending_event; + vmcb->eventinj.fields.errorcode = c->error_code; + } } paging_update_paging_modes(v); - /* signal paging update to ASID handler */ - svm_asid_g_update_paging (v); + svm_asid_g_update_paging(v); return 0; @@ -582,7 +548,7 @@ static void svm_save_cpu_state(struct vc data->msr_efer = v->arch.hvm_svm.cpu_shadow_efer; data->msr_flags = -1ULL; - data->tsc = hvm_get_guest_time(v); + data->tsc = hvm_get_guest_tsc(v); } @@ -602,7 +568,7 @@ static void svm_load_cpu_state(struct vc if ( !(vmcb->efer & EFER_LMA) ) vmcb->efer &= ~EFER_LME; - hvm_set_guest_time(v, data->tsc); + hvm_set_guest_tsc(v, data->tsc); } static void svm_save_vmcb_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt) @@ -623,10 +589,15 @@ static int svm_load_vmcb_ctxt(struct vcp return 0; } -static int svm_interrupts_enabled(struct vcpu *v) +static int svm_interrupts_enabled(struct vcpu *v, enum hvm_intack type) { - unsigned long eflags = v->arch.hvm_svm.vmcb->rflags; - return !irq_masked(eflags); + struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; + + if ( type == hvm_intack_nmi ) + return !vmcb->interrupt_shadow; + + ASSERT((type == hvm_intack_pic) || (type == hvm_intack_lapic)); + return (vmcb->rflags & X86_EFLAGS_IF) && !vmcb->interrupt_shadow; } static int svm_guest_x86_mode(struct vcpu *v) @@ -694,9 +665,7 @@ static void svm_sync_vmcb(struct vcpu *v arch_svm->vmcb_in_sync = 1; - asm volatile ( - ".byte 0x0f,0x01,0xdb" /* vmsave */ - : : "a" (__pa(arch_svm->vmcb)) ); + svm_vmsave(arch_svm->vmcb); } static unsigned long svm_get_segment_base(struct vcpu *v, enum x86_segment seg) @@ -725,6 +694,9 @@ static void svm_get_segment_register(str struct segment_register *reg) { struct 
vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; + + ASSERT(v == current); + switch ( seg ) { case x86_seg_cs: @@ -761,8 +733,124 @@ static void svm_get_segment_register(str svm_sync_vmcb(v); memcpy(reg, &vmcb->ldtr, sizeof(*reg)); break; - default: BUG(); + default: + BUG(); + } +} + +static void svm_set_segment_register(struct vcpu *v, enum x86_segment seg, + struct segment_register *reg) +{ + struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; + + ASSERT(v == current); + + switch ( seg ) + { + case x86_seg_cs: + memcpy(&vmcb->cs, reg, sizeof(*reg)); + guest_cpu_user_regs()->cs = reg->sel; + break; + case x86_seg_ds: + memcpy(&vmcb->ds, reg, sizeof(*reg)); + break; + case x86_seg_es: + memcpy(&vmcb->es, reg, sizeof(*reg)); + break; + case x86_seg_fs: + svm_sync_vmcb(v); + memcpy(&vmcb->fs, reg, sizeof(*reg)); + svm_vmload(vmcb); + break; + case x86_seg_gs: + svm_sync_vmcb(v); + memcpy(&vmcb->gs, reg, sizeof(*reg)); + svm_vmload(vmcb); + break; + case x86_seg_ss: + memcpy(&vmcb->ss, reg, sizeof(*reg)); + guest_cpu_user_regs()->ss = reg->sel; + break; + case x86_seg_tr: + svm_sync_vmcb(v); + memcpy(&vmcb->tr, reg, sizeof(*reg)); + svm_vmload(vmcb); + break; + case x86_seg_gdtr: + memcpy(&vmcb->gdtr, reg, sizeof(*reg)); + break; + case x86_seg_idtr: + memcpy(&vmcb->idtr, reg, sizeof(*reg)); + break; + case x86_seg_ldtr: + svm_sync_vmcb(v); + memcpy(&vmcb->ldtr, reg, sizeof(*reg)); + svm_vmload(vmcb); + break; + default: + BUG(); + } +} + +static int svm_set_cr3(unsigned long value) +{ + struct vcpu *v = current; + struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; + unsigned long old_base_mfn, mfn; + + if ( paging_mode_hap(v->domain) ) + { + vmcb->cr3 = v->arch.hvm_svm.cpu_cr3 = value; + return X86EMUL_OKAY; + } + + /* If paging is not enabled yet, simply copy the value to CR3. */ + if ( !svm_paging_enabled(v) ) + { + v->arch.hvm_svm.cpu_cr3 = value; + return X86EMUL_OKAY; + } + + /* We make a new one if the shadow does not exist. */ + if ( value == v->arch.hvm_svm.cpu_cr3 ) + { + /* + * This is simple TLB flush, implying the guest has + * removed some translation or changed page attributes. + * We simply invalidate the shadow. + */ + mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT); + if ( mfn != pagetable_get_pfn(v->arch.guest_table) ) + return X86EMUL_UNHANDLEABLE; + paging_update_cr3(v); + /* signal paging update to ASID handler */ + svm_asid_g_mov_to_cr3 (v); } + else + { + /* + * If different, make a shadow. Check if the PDBR is valid + * first. + */ + HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value); + mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT); + if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) ) + return X86EMUL_UNHANDLEABLE; + + old_base_mfn = pagetable_get_pfn(v->arch.guest_table); + v->arch.guest_table = pagetable_from_pfn(mfn); + + if ( old_base_mfn ) + put_page(mfn_to_page(old_base_mfn)); + + v->arch.hvm_svm.cpu_cr3 = value; + update_cr3(v); + HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value); + /* signal paging update to ASID handler */ + svm_asid_g_mov_to_cr3 (v); + } + + return X86EMUL_OKAY; } /* Make sure that xen intercepts any FP accesses from current */ @@ -863,10 +951,7 @@ static void svm_ctxt_switch_from(struct svm_save_dr(v); svm_sync_vmcb(v); - - asm volatile ( - ".byte 0x0f,0x01,0xda" /* vmload */ - : : "a" (__pa(root_vmcb[cpu])) ); + svm_vmload(root_vmcb[cpu]); #ifdef __x86_64__ /* Resume use of ISTs now that the host TR is reinstated. 
*/ @@ -902,12 +987,8 @@ static void svm_ctxt_switch_to(struct vc svm_restore_dr(v); - asm volatile ( - ".byte 0x0f,0x01,0xdb" /* vmsave */ - : : "a" (__pa(root_vmcb[cpu])) ); - asm volatile ( - ".byte 0x0f,0x01,0xda" /* vmload */ - : : "a" (__pa(v->arch.hvm_svm.vmcb)) ); + svm_vmsave(root_vmcb[cpu]); + svm_vmload(v->arch.hvm_svm.vmcb); } static void svm_do_resume(struct vcpu *v) @@ -972,10 +1053,10 @@ static void svm_hvm_inject_exception( svm_inject_exception(v, trapnr, (errcode != -1), errcode); } -static int svm_event_injection_faulted(struct vcpu *v) +static int svm_event_pending(struct vcpu *v) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; - return vmcb->exitintinfo.fields.v; + return vmcb->eventinj.fields.v; } static struct hvm_function_table svm_function_table = { @@ -996,8 +1077,10 @@ static struct hvm_function_table svm_fun .get_guest_ctrl_reg = svm_get_ctrl_reg, .get_segment_base = svm_get_segment_base, .get_segment_register = svm_get_segment_register, + .set_segment_register = svm_set_segment_register, .update_host_cr3 = svm_update_host_cr3, .update_guest_cr3 = svm_update_guest_cr3, + .set_cr3 = svm_set_cr3, .flush_guest_tlbs = svm_flush_guest_tlbs, .update_vtpr = svm_update_vtpr, .stts = svm_stts, @@ -1005,23 +1088,9 @@ static struct hvm_function_table svm_fun .inject_exception = svm_hvm_inject_exception, .init_ap_context = svm_init_ap_context, .init_hypercall_page = svm_init_hypercall_page, - .event_injection_faulted = svm_event_injection_faulted + .event_pending = svm_event_pending }; -static void svm_npt_detect(void) -{ - u32 eax, ebx, ecx, edx; - - /* Check CPUID for nested paging support. */ - cpuid(0x8000000A, &eax, &ebx, &ecx, &edx); - - if ( !(edx & 1) && opt_hap_enabled ) - { - printk("SVM: Nested paging is not supported by this CPU.\n"); - opt_hap_enabled = 0; - } -} - int start_svm(struct cpuinfo_x86 *c) { u32 eax, ecx, edx; @@ -1033,7 +1102,7 @@ int start_svm(struct cpuinfo_x86 *c) ecx = cpuid_ecx(0x80000001); boot_cpu_data.x86_capability[5] = ecx; - if ( !(test_bit(X86_FEATURE_SVME, &boot_cpu_data.x86_capability)) ) + if ( !(test_bit(X86_FEATURE_SVM, &boot_cpu_data.x86_capability)) ) return 0; /* Check whether SVM feature is disabled in BIOS */ @@ -1050,8 +1119,6 @@ int start_svm(struct cpuinfo_x86 *c) write_efer(read_efer() | EFER_SVME); - svm_npt_detect(); - /* Initialize the HSA for this core. */ phys_hsa = (u64) virt_to_maddr(hsa[cpu]); phys_hsa_lo = (u32) phys_hsa; @@ -1065,12 +1132,12 @@ int start_svm(struct cpuinfo_x86 *c) return 1; setup_vmcb_dump(); + svm_function_table.hap_supported = (cpuid_edx(0x8000000A) & 1); + svm_function_table.hap_1gb_pgtb = + (CONFIG_PAGING_LEVELS == 4) ? 
(cpuid_edx(0x80000001) & 0x04000000) : 0; hvm_enable(&svm_function_table); - if ( opt_hap_enabled ) - printk("SVM: Nested paging enabled.\n"); - return 1; } @@ -1096,8 +1163,8 @@ static void svm_do_no_device_fault(struc vmcb->cr0 &= ~X86_CR0_TS; } -/* Reserved bits ECX: [31:14], [12:4], [2:1]*/ -#define SVM_VCPU_CPUID_L1_ECX_RESERVED 0xffffdff6 +/* Reserved bits ECX: [30:29], [24], [18:14], [11:10], [8:4], [2] */ +#define SVM_VCPU_CPUID_L1_ECX_RESERVED 0x6107cdf4 /* Reserved bits EDX: [31:29], [27], [22:20], [18], [10] */ #define SVM_VCPU_CPUID_L1_EDX_RESERVED 0xe8740400 @@ -1109,6 +1176,9 @@ static void svm_vmexit_do_cpuid(struct v struct vcpu *v = current; int inst_len; + if ( (inst_len = __get_instruction_length(v, INSTR_CPUID)) == 0 ) + return; + hvm_cpuid(input, &eax, &ebx, &ecx, &edx); if ( input == 0x00000001 ) @@ -1117,6 +1187,22 @@ static void svm_vmexit_do_cpuid(struct v ecx &= ~SVM_VCPU_CPUID_L1_ECX_RESERVED; edx &= ~SVM_VCPU_CPUID_L1_EDX_RESERVED; + /* Clear FMA instruction support. */ + clear_bit(X86_FEATURE_FMA & 31, &ecx); + + /* Clear x2APIC capability. */ + clear_bit(X86_FEATURE_X2APIC & 31, &ecx); + + /* Clear MOVBE instruction support */ + clear_bit(X86_FEATURE_MOVBE & 31, &ecx); + + /* Clear XSAVE and OSXSAVE bits. */ + clear_bit(X86_FEATURE_XSAVE & 31, &ecx); + clear_bit(X86_FEATURE_OSXSAVE & 31, &ecx); + + /* Clear AVX instruction support. */ + clear_bit(X86_FEATURE_AVX & 31, &ecx); + /* Guest should only see one logical processor. * See details on page 23 of AMD CPUID Specification. */ @@ -1129,12 +1215,15 @@ static void svm_vmexit_do_cpuid(struct v if ( vlapic_hw_disabled(vcpu_vlapic(v)) ) clear_bit(X86_FEATURE_APIC & 31, &edx); + clear_bit(X86_FEATURE_EXTAPIC & 31, &ecx); + #if CONFIG_PAGING_LEVELS >= 3 if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] ) #endif clear_bit(X86_FEATURE_PAE & 31, &edx); clear_bit(X86_FEATURE_PSE36 & 31, &edx); + clear_bit(X86_FEATURE_PAGE1GB & 31, &edx); /* Clear the Cmp_Legacy bit * This bit is supposed to be zero when HTT = 0. @@ -1143,17 +1232,49 @@ static void svm_vmexit_do_cpuid(struct v clear_bit(X86_FEATURE_CMP_LEGACY & 31, &ecx); /* Make SVM feature invisible to the guest. */ - clear_bit(X86_FEATURE_SVME & 31, &ecx); + clear_bit(X86_FEATURE_SVM & 31, &ecx); /* So far, we do not support 3DNow for the guest. */ clear_bit(X86_FEATURE_3DNOW & 31, &edx); clear_bit(X86_FEATURE_3DNOWEXT & 31, &edx); + /* no FFXSR instructions feature. */ clear_bit(X86_FEATURE_FFXSR & 31, &edx); + + /* no RDTSCP instruction support */ + clear_bit(X86_FEATURE_RDTSCP & 31, &edx); + + /* no topology extensions */ + clear_bit(X86_FEATURE_NODEID_MSR & 31, &ecx); + clear_bit(X86_FEATURE_TOPOEXT & 31, &ecx); + + /* no OS Visible Workaround support */ + clear_bit(X86_FEATURE_OSVW & 31, &ecx); + + /* no Instruction Based Sampling */ + clear_bit(X86_FEATURE_IBS & 31, &ecx); + + /* no SKINIT and STGI support */ + clear_bit(X86_FEATURE_SKINIT & 31, &ecx); + + /* no Watchdog Timer */ + clear_bit(X86_FEATURE_WDT & 31, &ecx); + + /* no Lightweight Profiling support */ + clear_bit(X86_FEATURE_LWP & 31, &ecx); + + /* no Performance Counter Extensions */ + clear_bit(X86_FEATURE_PERFCTR_CORE & 31, &ecx); + clear_bit(X86_FEATURE_PERFCTR_NB & 31, &ecx); } - else if ( input == 0x80000007 || input == 0x8000000A ) + else if ( input == 0x80000007 || input == 0x8000000A || + input == 0x8000001B || input == 0x8000001C || + input == 0x8000001E ) { - /* Mask out features of power management and SVM extension. 
*/ + /* Mask out features of power management, SVM extension, + * Instruction Based Sampling, Lightweight Profiling, and + * extended topology reporting + */ eax = ebx = ecx = edx = 0; } else if ( input == 0x80000008 ) @@ -1170,8 +1291,6 @@ static void svm_vmexit_do_cpuid(struct v HVMTRACE_3D(CPUID, v, input, ((uint64_t)eax << 32) | ebx, ((uint64_t)ecx << 32) | edx); - inst_len = __get_instruction_length(v, INSTR_CPUID, NULL); - ASSERT(inst_len > 0); __update_guest_eip(vmcb, inst_len); } @@ -1272,18 +1391,13 @@ static void svm_get_prefix_info(struct v { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; unsigned char inst[MAX_INST_LEN]; + int valid_inst_len; int i; - memset(inst, 0, MAX_INST_LEN); - if (inst_copy_from_guest(inst, svm_rip2pointer(v), sizeof(inst)) - != MAX_INST_LEN) - { - gdprintk(XENLOG_ERR, "get guest instruction failed\n"); - domain_crash(current->domain); - return; - } + valid_inst_len = MAX_INST_LEN - + hvm_copy_from_guest_virt(inst, svm_rip2pointer(v), MAX_INST_LEN); - for (i = 0; i < MAX_INST_LEN; i++) + for (i = 0; i < valid_inst_len; i++) { switch (inst[i]) { @@ -1796,7 +1910,7 @@ static void mov_from_cr(int cr, int gp, */ static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs) { - unsigned long value, old_cr, old_base_mfn, mfn; + unsigned long value, old_cr; struct vcpu *v = current; struct vlapic *vlapic = vcpu_vlapic(v); struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; @@ -1814,57 +1928,8 @@ static int mov_to_cr(int gpreg, int cr, return svm_set_cr0(value); case 3: - if ( paging_mode_hap(v->domain) ) - { - vmcb->cr3 = v->arch.hvm_svm.cpu_cr3 = value; - break; - } - - /* If paging is not enabled yet, simply copy the value to CR3. */ - if ( !svm_paging_enabled(v) ) - { - v->arch.hvm_svm.cpu_cr3 = value; - break; - } - - /* We make a new one if the shadow does not exist. */ - if ( value == v->arch.hvm_svm.cpu_cr3 ) - { - /* - * This is simple TLB flush, implying the guest has - * removed some translation or changed page attributes. - * We simply invalidate the shadow. - */ - mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT); - if ( mfn != pagetable_get_pfn(v->arch.guest_table) ) - goto bad_cr3; - paging_update_cr3(v); - /* signal paging update to ASID handler */ - svm_asid_g_mov_to_cr3 (v); - } - else - { - /* - * If different, make a shadow. Check if the PDBR is valid - * first. 
- */ - HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value); - mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT); - if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) ) - goto bad_cr3; - - old_base_mfn = pagetable_get_pfn(v->arch.guest_table); - v->arch.guest_table = pagetable_from_pfn(mfn); - - if ( old_base_mfn ) - put_page(mfn_to_page(old_base_mfn)); - - v->arch.hvm_svm.cpu_cr3 = value; - update_cr3(v); - HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value); - /* signal paging update to ASID handler */ - svm_asid_g_mov_to_cr3 (v); - } + if (svm_set_cr3(value) != X86EMUL_OKAY) + goto bad_cr3; break; case 4: /* CR4 */ @@ -1984,26 +2049,26 @@ static int svm_cr_access(struct vcpu *v, enum instruction_index list_b[] = {INSTR_MOVCR2, INSTR_SMSW}; enum instruction_index match; - inst_copy_from_guest(buffer, svm_rip2pointer(v), sizeof(buffer)); - - /* get index to first actual instruction byte - as we will need to know - where the prefix lives later on */ - index = skip_prefix_bytes(buffer, sizeof(buffer)); - if ( type == TYPE_MOV_TO_CR ) { inst_len = __get_instruction_length_from_list( - v, list_a, ARR_SIZE(list_a), &buffer[index], &match); + v, list_a, ARR_SIZE(list_a), &match); } else /* type == TYPE_MOV_FROM_CR */ { inst_len = __get_instruction_length_from_list( - v, list_b, ARR_SIZE(list_b), &buffer[index], &match); + v, list_b, ARR_SIZE(list_b), &match); } - ASSERT(inst_len > 0); + if ( inst_len == 0 ) + return 0; - inst_len += index; + memset(buffer, 0, MAX_INST_LEN); + hvm_copy_from_guest_virt(buffer, svm_rip2pointer(v), MAX_INST_LEN); + + /* get index to first actual instruction byte - as we will need to know + where the prefix lives later on */ + index = skip_prefix_bytes(buffer, sizeof(buffer)); /* Check for REX prefix - it's ALWAYS the last byte of any prefix bytes */ if (index > 0 && (buffer[index-1] & 0xF0) == 0x40) @@ -2078,7 +2143,7 @@ static int svm_cr_access(struct vcpu *v, gdprintk(XENLOG_ERR, "SMSW emulation at guest address: " "%lx failed due to unhandled addressing mode." "ModRM byte was: %x \n", svm_rip2pointer(v), modrm); - domain_crash(v->domain); + hvm_inject_exception(TRAP_gp_fault, 0, 0); } inst_len += addr_size; offset = *(( unsigned int *) ( void *) &buffer[index + 3]); @@ -2092,7 +2157,7 @@ static int svm_cr_access(struct vcpu *v, gdprintk(XENLOG_ERR, "SMSW emulation at guest address: %lx " "failed due to unhandled addressing mode!" 
"ModRM byte was: %x \n", svm_rip2pointer(v), modrm); - domain_crash(v->domain); + hvm_inject_exception(TRAP_gp_fault, 0, 0); } break; @@ -2100,8 +2165,6 @@ static int svm_cr_access(struct vcpu *v, BUG(); } - ASSERT(inst_len); - __update_guest_eip(vmcb, inst_len); return result; @@ -2124,7 +2187,7 @@ static void svm_do_msr_access( { switch (ecx) { case MSR_IA32_TIME_STAMP_COUNTER: - msr_content = hvm_get_guest_time(v); + msr_content = hvm_get_guest_tsc(v); break; case MSR_IA32_APICBASE: @@ -2187,7 +2250,8 @@ static void svm_do_msr_access( HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx", ecx, (unsigned long)regs->eax, (unsigned long)regs->edx); - inst_len = __get_instruction_length(v, INSTR_RDMSR, NULL); + if ( (inst_len = __get_instruction_length(v, INSTR_RDMSR)) == 0 ) + return; } else { @@ -2198,7 +2262,7 @@ static void svm_do_msr_access( switch (ecx) { case MSR_IA32_TIME_STAMP_COUNTER: - hvm_set_guest_time(v, msr_content); + hvm_set_guest_tsc(v, msr_content); pt_reset(v); break; @@ -2216,7 +2280,8 @@ static void svm_do_msr_access( break; } - inst_len = __get_instruction_length(v, INSTR_WRMSR, NULL); + if ( (inst_len = __get_instruction_length(v, INSTR_WRMSR)) == 0 ) + return; } __update_guest_eip(vmcb, inst_len); @@ -2224,11 +2289,14 @@ static void svm_do_msr_access( static void svm_vmexit_do_hlt(struct vmcb_struct *vmcb) { + enum hvm_intack type = hvm_vcpu_has_pending_irq(current); + __update_guest_eip(vmcb, 1); /* Check for interrupt not handled or new interrupt. */ - if ( (vmcb->rflags & X86_EFLAGS_IF) && - (vmcb->vintr.fields.irq || cpu_has_pending_irq(current)) ) { + if ( vmcb->eventinj.fields.v || + ((type != hvm_intack_none) && hvm_interrupts_enabled(current, type)) ) + { HVMTRACE_1D(HLT, current, /*int pending=*/ 1); return; } @@ -2252,7 +2320,8 @@ static void svm_vmexit_do_invd(struct vc */ gdprintk(XENLOG_WARNING, "INVD instruction intercepted - ignored\n"); - inst_len = __get_instruction_length(v, INSTR_INVD, NULL); + if ( (inst_len = __get_instruction_length(v, INSTR_INVD)) == 0 ) + return; __update_guest_eip(vmcb, inst_len); } @@ -2264,21 +2333,11 @@ void svm_handle_invlpg(const short invlp int inst_len; struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; - /* - * Unknown how many bytes the invlpg instruction will take. Use the - * maximum instruction length here - */ - if (inst_copy_from_guest(opcode, svm_rip2pointer(v), length) < length) - { - gdprintk(XENLOG_ERR, "Error reading memory %d bytes\n", length); - domain_crash(v->domain); - return; - } - if (invlpga) { - inst_len = __get_instruction_length(v, INSTR_INVLPGA, opcode); - ASSERT(inst_len > 0); + if ( (inst_len = + __get_instruction_length(v, INSTR_INVLPGA)) == 0 ) + return; __update_guest_eip(vmcb, inst_len); /* @@ -2289,10 +2348,15 @@ void svm_handle_invlpg(const short invlp } else { + if ( (inst_len = + __get_instruction_length(v, INSTR_INVLPG)) == 0 ) + return; + + memset(opcode, 0, MAX_INST_LEN); + hvm_copy_from_guest_virt(opcode, svm_rip2pointer(v), MAX_INST_LEN); + /* What about multiple prefix codes? */ prefix = (is_prefix(opcode[0])?opcode[0]:0); - inst_len = __get_instruction_length(v, INSTR_INVLPG, opcode); - ASSERT(inst_len > 0); inst_len--; length -= inst_len; @@ -2413,8 +2477,19 @@ asmlinkage void svm_vmexit_handler(struc unsigned long eip; struct vcpu *v = current; struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; + eventinj_t eventinj; int inst_len, rc; + /* + * Before doing anything else, we need to sync up the VLAPIC's TPR with + * SVM's vTPR if CR8 writes are currently disabled. 
It's OK if the + * guest doesn't touch the CR8 (e.g. 32-bit Windows) because we update + * the vTPR on MMIO writes to the TPR + */ + if ( !(vmcb->cr_intercepts & CR_INTERCEPT_CR8_WRITE) ) + vlapic_set_reg(vcpu_vlapic(v), APIC_TASKPRI, + (vmcb->vintr.fields.tpr & 0x0F) << 4); + exit_reason = vmcb->exitcode; HVMTRACE_2D(VMEXIT, v, vmcb->rip, exit_reason); @@ -2428,6 +2503,15 @@ asmlinkage void svm_vmexit_handler(struc perfc_incra(svmexits, exit_reason); eip = vmcb->rip; + /* Event delivery caused this intercept? Queue for redelivery. */ + eventinj = vmcb->exitintinfo; + if ( unlikely(eventinj.fields.v) && + svm_event_needs_reinjection(eventinj.fields.type, + eventinj.fields.vector) ) + vmcb->eventinj = eventinj; + + hvm_maybe_deassert_evtchn_irq(); + switch ( exit_reason ) { case VMEXIT_INTR: @@ -2455,8 +2539,10 @@ asmlinkage void svm_vmexit_handler(struc if ( !v->domain->debugger_attached ) goto exit_and_crash; /* AMD Vol2, 15.11: INT3, INTO, BOUND intercepts do not update RIP. */ - inst_len = __get_instruction_length(v, INSTR_INT3, NULL); + if ( (inst_len = __get_instruction_length(v, INSTR_INT3)) == 0 ) + break; __update_guest_eip(vmcb, inst_len); + current->arch.gdbsx_vcpu_event = TRAP_int3; domain_pause_for_debugger(); break; @@ -2499,12 +2585,20 @@ asmlinkage void svm_vmexit_handler(struc svm_vmexit_do_invd(v); break; - case VMEXIT_GDTR_WRITE: - printk("WRITE to GDTR\n"); + case VMEXIT_TASK_SWITCH: { + enum hvm_task_switch_reason reason; + int32_t errcode = -1; + if ( (vmcb->exitinfo2 >> 36) & 1 ) + reason = TSW_iret; + else if ( (vmcb->exitinfo2 >> 38) & 1 ) + reason = TSW_jmp; + else + reason = TSW_call_or_int; + if ( (vmcb->exitinfo2 >> 44) & 1 ) + errcode = (uint32_t)vmcb->exitinfo2; + hvm_task_switch((uint16_t)vmcb->exitinfo1, reason, errcode); break; - - case VMEXIT_TASK_SWITCH: - goto exit_and_crash; + } case VMEXIT_CPUID: svm_vmexit_do_cpuid(vmcb, regs); @@ -2523,8 +2617,8 @@ asmlinkage void svm_vmexit_handler(struc break; case VMEXIT_VMMCALL: - inst_len = __get_instruction_length(v, INSTR_VMCALL, NULL); - ASSERT(inst_len > 0); + if ( (inst_len = __get_instruction_length(v, INSTR_VMCALL)) == 0 ) + break; HVMTRACE_1D(VMMCALL, v, regs->eax); rc = hvm_do_hypercall(regs); if ( rc != HVM_HCALL_preempted ) diff -Naurp xen/arch/x86/hvm/svm/vmcb.c xen-redhat/arch/x86/hvm/svm/vmcb.c --- xen/arch/x86/hvm/svm/vmcb.c +++ xen-redhat/arch/x86/hvm/svm/vmcb.c @@ -129,8 +129,14 @@ static int construct_vmcb(struct vcpu *v /* Intercept all debug-register writes. */ vmcb->dr_intercepts = ~0u; - /* Intercept all control-register accesses, except to CR2. */ - vmcb->cr_intercepts = ~(CR_INTERCEPT_CR2_READ | CR_INTERCEPT_CR2_WRITE); + /* + * Intercept all control-register accesses except for CR2 reads/writes + * and CR8 reads (and actually CR8 writes, but that's a special case + * that's handled in svm/intr.c). + */ + vmcb->cr_intercepts = ~(CR_INTERCEPT_CR2_READ | + CR_INTERCEPT_CR2_WRITE | + CR_INTERCEPT_CR8_READ); /* I/O and MSR permission bitmaps. 
*/ arch_svm->msrpm = alloc_xenheap_pages(get_order_from_bytes(MSRPM_SIZE)); diff -Naurp xen/arch/x86/hvm/vioapic.c xen-redhat/arch/x86/hvm/vioapic.c --- xen/arch/x86/hvm/vioapic.c +++ xen-redhat/arch/x86/hvm/vioapic.c @@ -92,9 +92,9 @@ static unsigned long vioapic_read_indire return result; } -static unsigned long vioapic_read(struct vcpu *v, - unsigned long addr, - unsigned long length) +static int vioapic_read( + struct vcpu *v, unsigned long addr, + unsigned long length, unsigned long *pval) { struct hvm_hw_vioapic *vioapic = domain_vioapic(v->domain); uint32_t result; @@ -118,11 +118,13 @@ static unsigned long vioapic_read(struct break; } - return result; + *pval = result; + return 1; } static void vioapic_write_redirent( - struct hvm_hw_vioapic *vioapic, unsigned int idx, int top_word, uint32_t val) + struct hvm_hw_vioapic *vioapic, unsigned int idx, + int top_word, uint32_t val) { struct domain *d = vioapic_domain(vioapic); struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; @@ -200,10 +202,9 @@ static void vioapic_write_indirect( } } -static void vioapic_write(struct vcpu *v, - unsigned long addr, - unsigned long length, - unsigned long val) +static int vioapic_write( + struct vcpu *v, unsigned long addr, + unsigned long length, unsigned long val) { struct hvm_hw_vioapic *vioapic = domain_vioapic(v->domain); @@ -228,6 +229,8 @@ static void vioapic_write(struct vcpu *v default: break; } + + return 1; } static int vioapic_range(struct vcpu *v, unsigned long addr) @@ -254,17 +257,11 @@ static void ioapic_inj_irq( HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "irq %d trig %d deliv %d", vector, trig_mode, delivery_mode); - switch ( delivery_mode ) - { - case dest_Fixed: - case dest_LowestPrio: - if ( vlapic_set_irq(target, vector, trig_mode) ) - vcpu_kick(vlapic_vcpu(target)); - break; - default: - gdprintk(XENLOG_WARNING, "error delivery mode %d\n", delivery_mode); - break; - } + ASSERT((delivery_mode == dest_Fixed) || + (delivery_mode == dest_LowestPrio)); + + if ( vlapic_set_irq(target, vector, trig_mode) ) + vcpu_kick(vlapic_vcpu(target)); } static uint32_t ioapic_get_delivery_bitmask( @@ -311,7 +308,7 @@ static inline int pit_channel0_enabled(v { PITState *pit = ¤t->domain->arch.hvm_domain.pl_time.vpit; struct periodic_time *pt = &pit->pt[0]; - return pt->enabled; + return pt_active(pt); } static void vioapic_deliver(struct hvm_hw_vioapic *vioapic, int irq) @@ -393,10 +390,21 @@ static void vioapic_deliver(struct hvm_h break; } - case dest_SMI: case dest_NMI: - case dest_INIT: - case dest__reserved_2: + { + uint8_t bit; + for ( bit = 0; deliver_bitmask != 0; bit++ ) + { + if ( !(deliver_bitmask & (1 << bit)) ) + continue; + deliver_bitmask &= ~(1 << bit); + if ( ((v = vioapic_domain(vioapic)->vcpu[bit]) != NULL) && + !test_and_set_bool(v->arch.hvm_vcpu.nmi_pending) ) + vcpu_kick(v); + } + break; + } + default: gdprintk(XENLOG_WARNING, "Unsupported delivery mode %d\n", delivery_mode); @@ -458,6 +466,14 @@ void vioapic_update_EOI(struct domain *d ent = &vioapic->redirtbl[gsi]; ent->fields.remote_irr = 0; + + if ( iommu_enabled ) + { + spin_unlock(&d->arch.hvm_domain.irq_lock); + hvm_dpci_eoi(current->domain, gsi, ent); + spin_lock(&d->arch.hvm_domain.irq_lock); + } + if ( (ent->fields.trig_mode == VIOAPIC_LEVEL_TRIG) && !ent->fields.mask && hvm_irq->gsi_assert_count[gsi] ) diff -Naurp xen/arch/x86/hvm/vlapic.c xen-redhat/arch/x86/hvm/vlapic.c --- xen/arch/x86/hvm/vlapic.c +++ xen-redhat/arch/x86/hvm/vlapic.c @@ -67,9 +67,6 @@ static unsigned int vlapic_lvt_mask[VLAP #define APIC_DEST_NOSHORT 0x0 
#define APIC_DEST_MASK 0x800 -#define vlapic_lvt_enabled(vlapic, lvt_type) \ - (!(vlapic_get_reg(vlapic, lvt_type) & APIC_LVT_MASKED)) - #define vlapic_lvt_vector(vlapic, lvt_type) \ (vlapic_get_reg(vlapic, lvt_type) & APIC_VECTOR_MASK) @@ -293,7 +290,8 @@ static int vlapic_accept_irq(struct vcpu break; case APIC_DM_NMI: - gdprintk(XENLOG_WARNING, "Ignoring guest NMI\n"); + if ( !test_and_set_bool(v->arch.hvm_vcpu.nmi_pending) ) + vcpu_kick(v); break; case APIC_DM_INIT: @@ -376,6 +374,8 @@ void vlapic_EOI_set(struct vlapic *vlapi if ( vlapic_test_and_clear_vector(vector, &vlapic->regs->data[APIC_TMR]) ) vioapic_update_EOI(vlapic_domain(vlapic), vector); + + hvm_dpci_msi_eoi(current->domain, vector); } static void vlapic_ipi(struct vlapic *vlapic) @@ -428,8 +428,7 @@ static uint32_t vlapic_get_tmcct(struct uint32_t tmcct, tmict = vlapic_get_reg(vlapic, APIC_TMICT); uint64_t counter_passed; - counter_passed = (hvm_get_guest_time(v) - vlapic->pt.last_plt_gtime) // TSC - * 1000000000ULL / ticks_per_sec(v) // NS + counter_passed = (hvm_get_guest_time(v) - vlapic->timer_last_update) / APIC_BUS_CYCLE_NS / vlapic->hw.timer_divisor; tmcct = tmict - counter_passed; @@ -476,17 +475,18 @@ static void vlapic_read_aligned(struct v } } -static unsigned long vlapic_read(struct vcpu *v, unsigned long address, - unsigned long len) +static int vlapic_read( + struct vcpu *v, unsigned long address, + unsigned long len, unsigned long *pval) { unsigned int alignment; unsigned int tmp; - unsigned long result; + unsigned long result = 0; struct vlapic *vlapic = vcpu_vlapic(v); unsigned int offset = address - vlapic_base_address(vlapic); if ( offset > APIC_TDCR ) - return 0; + goto out; /* some bugs on kernel cause read this with byte*/ if ( len != 4 ) @@ -522,15 +522,22 @@ static unsigned long vlapic_read(struct HVM_DBG_LOG(DBG_LEVEL_VLAPIC, "offset 0x%x with length 0x%lx, " "and the result is 0x%lx", offset, len, result); - return result; + goto out; exit_and_crash: domain_crash(v->domain); - return 0; + out: + *pval = result; + return 1; } -static void vlapic_write(struct vcpu *v, unsigned long address, - unsigned long len, unsigned long val) +void vlapic_pt_cb(struct vcpu *v, void *data) +{ + *(s_time_t *)data = hvm_get_guest_time(v); +} + +static int vlapic_write(struct vcpu *v, unsigned long address, + unsigned long len, unsigned long val) { struct vlapic *vlapic = vcpu_vlapic(v); unsigned int offset = address - vlapic_base_address(vlapic); @@ -547,13 +554,13 @@ static void vlapic_write(struct vcpu *v, val &= 0xffffffff; if ( len != 4 ) { - unsigned int tmp; + unsigned long tmp; unsigned char alignment; gdprintk(XENLOG_INFO, "Notice: Local APIC write with len = %lx\n",len); alignment = offset & 0x3; - tmp = vlapic_read(v, offset & ~0x3, 4); + (void)vlapic_read(v, offset & ~0x3, 4, &tmp); switch ( len ) { @@ -579,7 +586,7 @@ static void vlapic_write(struct vcpu *v, "should be 4 instead\n", len); exit_and_crash: domain_crash(v->domain); - return; + return 0; } } @@ -650,6 +657,8 @@ static void vlapic_write(struct vcpu *v, val |= APIC_LVT_MASKED; val &= vlapic_lvt_mask[(offset - APIC_LVTT) >> 4]; vlapic_set_reg(vlapic, offset, val); + if ( offset == APIC_LVT0 ) + vlapic_adjust_i8259_target(v->domain); break; case APIC_TMICT: @@ -658,7 +667,9 @@ static void vlapic_write(struct vcpu *v, vlapic_set_reg(vlapic, APIC_TMICT, val); create_periodic_time(current, &vlapic->pt, period, vlapic->pt.irq, - !vlapic_lvtt_period(vlapic), NULL, vlapic); + !vlapic_lvtt_period(vlapic), vlapic_pt_cb, + &vlapic->timer_last_update); + 
vlapic->timer_last_update = vlapic->pt.last_plt_gtime; HVM_DBG_LOG(DBG_LEVEL_VLAPIC, "bus cycle is %uns, " @@ -678,6 +689,8 @@ static void vlapic_write(struct vcpu *v, "Local APIC Write to read-only register 0x%x\n", offset); break; } + + return 1; } static int vlapic_range(struct vcpu *v, unsigned long addr) @@ -714,18 +727,44 @@ void vlapic_msr_set(struct vlapic *vlapi "apic base msr is 0x%016"PRIx64, vlapic->hw.apic_base_msr); } -int vlapic_accept_pic_intr(struct vcpu *v) +static int __vlapic_accept_pic_intr(struct vcpu *v) { struct vlapic *vlapic = vcpu_vlapic(v); uint32_t lvt0 = vlapic_get_reg(vlapic, APIC_LVT0); /* - * Only CPU0 is wired to the 8259A. INTA cycles occur if LINT0 is set up - * accept ExtInts, or if the LAPIC is disabled (so LINT0 behaves as INTR). + * INTA cycles occur if LINT0 is set up to accept ExtInts, of it + * the LAPIC is disabled (so LINT0 behaves as INTR). */ - return ((v->vcpu_id == 0) && - (((lvt0 & (APIC_MODE_MASK|APIC_LVT_MASKED)) == APIC_DM_EXTINT) || - vlapic_hw_disabled(vlapic))); + return ((lvt0 & (APIC_MODE_MASK|APIC_LVT_MASKED)) == APIC_DM_EXTINT || + /* LAPIC is fully disabled? */ + vlapic_hw_disabled(vlapic)); +} + +int vlapic_accept_pic_intr(struct vcpu *v) +{ + /* By default, deliver 8259A interrupts to CPU0. */ + return ((v->domain->arch.hvm_domain.i8259_target + ? v == v->domain->arch.hvm_domain.i8259_target + : v->vcpu_id == 0) && + __vlapic_accept_pic_intr(v)); +} + +void vlapic_adjust_i8259_target(struct domain *d) +{ + struct vcpu *v; + + for_each_vcpu ( d, v ) + if ( __vlapic_accept_pic_intr(v) ) + goto found; + + v = d->vcpu ? d->vcpu[0] : NULL; + + found: + if ( d->arch.hvm_domain.i8259_target == v ) + return; + d->arch.hvm_domain.i8259_target = v; + pt_adjust_global_vcpu_target(v); } int vlapic_has_interrupt(struct vcpu *v) @@ -744,7 +783,7 @@ int vlapic_has_interrupt(struct vcpu *v) return highest_irr; } -int cpu_get_apic_interrupt(struct vcpu *v, int *mode) +int cpu_get_apic_interrupt(struct vcpu *v) { int vector = vlapic_has_interrupt(v); struct vlapic *vlapic = vcpu_vlapic(v); @@ -754,8 +793,6 @@ int cpu_get_apic_interrupt(struct vcpu * vlapic_set_vector(vector, &vlapic->regs->data[APIC_ISR]); vlapic_clear_irr(vector, vlapic); - - *mode = APIC_DM_FIXED; return vector; } @@ -817,7 +854,9 @@ static void lapic_rearm(struct vlapic *s s->pt.irq = lvtt & APIC_VECTOR_MASK; create_periodic_time(vlapic_vcpu(s), &s->pt, period, s->pt.irq, - !vlapic_lvtt_period(s), NULL, s); + !vlapic_lvtt_period(s), vlapic_pt_cb, + &s->timer_last_update); + s->timer_last_update = s->pt.last_plt_gtime; printk("lapic_load to rearm the actimer:" "bus cycle is %uns, " @@ -898,6 +937,7 @@ static int lapic_load_regs(struct domain if ( hvm_load_entry(LAPIC_REGS, h, s->regs) != 0 ) return -EINVAL; + vlapic_adjust_i8259_target(d); lapic_rearm(s); return 0; } @@ -913,6 +953,8 @@ int vlapic_init(struct vcpu *v) HVM_DBG_LOG(DBG_LEVEL_VLAPIC, "%d", v->vcpu_id); + vlapic->pt.source = PTSRC_lapic; + vlapic->regs_page = alloc_domheap_page(NULL); if ( vlapic->regs_page == NULL ) { @@ -949,18 +991,3 @@ void vlapic_destroy(struct vcpu *v) unmap_domain_page_global(vlapic->regs); free_domheap_page(vlapic->regs_page); } - -int is_lvtt(struct vcpu *v, int vector) -{ - return vcpu_vlapic(v)->pt.enabled && - vector == vlapic_lvt_vector(vcpu_vlapic(v), APIC_LVTT); -} - -int is_lvtt_enabled(struct vcpu *v) -{ - if ( unlikely(!vlapic_enabled(vcpu_vlapic(v))) || - !vlapic_lvt_enabled(vcpu_vlapic(v), APIC_LVTT)) - return 0; - - return 1; -} diff -Naurp xen/arch/x86/hvm/vmsi.c 
xen-redhat/arch/x86/hvm/vmsi.c --- xen/arch/x86/hvm/vmsi.c +++ xen-redhat/arch/x86/hvm/vmsi.c @@ -0,0 +1,493 @@ +/* + * Copyright (C) 2001 MandrakeSoft S.A. + * + * MandrakeSoft S.A. + * 43, rue d'Aboukir + * 75002 Paris - France + * http://www.linux-mandrake.com/ + * http://www.mandrakesoft.com/ + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Support for virtual MSI logic + * Will be merged it with virtual IOAPIC logic, since most is the same +*/ + +#include <xen/config.h> +#include <xen/types.h> +#include <xen/mm.h> +#include <xen/xmalloc.h> +#include <xen/lib.h> +#include <xen/errno.h> +#include <xen/sched.h> +#include <public/hvm/ioreq.h> +#include <asm/hvm/io.h> +#include <asm/hvm/vpic.h> +#include <asm/hvm/vlapic.h> +#include <asm/hvm/support.h> +#include <asm/current.h> +#include <asm/event.h> + +static uint32_t vmsi_get_delivery_bitmask( + struct domain *d, uint16_t dest, uint8_t dest_mode) +{ + uint32_t mask = 0; + struct vcpu *v; + + HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "ioapic_get_delivery_bitmask " + "dest %d dest_mode %d\n", dest, dest_mode); + + if ( dest_mode == 0 ) /* Physical mode. */ + { + if ( dest == 0xFF ) /* Broadcast. */ + { + for_each_vcpu ( d, v ) + mask |= 1 << v->vcpu_id; + goto out; + } + + for_each_vcpu ( d, v ) + { + if ( VLAPIC_ID(vcpu_vlapic(v)) == dest ) + { + mask = 1 << v->vcpu_id; + break; + } + } + } + else if ( dest != 0 ) /* Logical mode, MDA non-zero. 
*/ + { + for_each_vcpu ( d, v ) + if ( vlapic_match_logical_addr(vcpu_vlapic(v), dest) ) + mask |= 1 << v->vcpu_id; + } + + out: + HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "ioapic_get_delivery_bitmask mask %x\n", + mask); + return mask; +} + +static void vmsi_inj_irq( + struct domain *d, + struct vlapic *target, + uint8_t vector, + uint8_t trig_mode, + uint8_t delivery_mode) +{ + HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "ioapic_inj_irq " + "irq %d trig %d delive mode %d\n", + vector, trig_mode, delivery_mode); + + switch ( delivery_mode ) + { + case dest_Fixed: + case dest_LowestPrio: + if ( vlapic_set_irq(target, vector, trig_mode) ) + vcpu_kick(vlapic_vcpu(target)); + break; + default: + gdprintk(XENLOG_WARNING, "error delivery mode %d\n", delivery_mode); + break; + } +} + +#define VMSI_DEST_ID_MASK 0xff +#define VMSI_RH_MASK 0x100 +#define VMSI_DM_MASK 0x200 +#define VMSI_DELIV_MASK 0x7000 +#define VMSI_TRIG_MODE 0x8000 + +#define GFLAGS_SHIFT_DEST_ID 0 +#define GFLAGS_SHIFT_RH 8 +#define GFLAGS_SHIFT_DM 9 +#define GLFAGS_SHIFT_DELIV_MODE 12 +#define GLFAGS_SHIFT_TRG_MODE 15 + +int vmsi_deliver(struct domain *d, int pirq) +{ + struct hvm_irq_dpci *hvm_irq_dpci = d->arch.hvm_domain.irq.dpci; + uint32_t flags = hvm_irq_dpci->mirq[pirq].gmsi.gflags; + int vector = hvm_irq_dpci->mirq[pirq].gmsi.gvec; + uint16_t dest = (flags & VMSI_DEST_ID_MASK) >> GFLAGS_SHIFT_DEST_ID; + uint8_t dest_mode = (flags & VMSI_DM_MASK) >> GFLAGS_SHIFT_DM; + uint8_t delivery_mode = (flags & VMSI_DELIV_MASK) >> GLFAGS_SHIFT_DELIV_MODE; + uint8_t trig_mode = (flags & VMSI_TRIG_MODE) >> GLFAGS_SHIFT_TRG_MODE; + uint32_t deliver_bitmask; + struct vlapic *target; + struct vcpu *v; + + HVM_DBG_LOG(DBG_LEVEL_IOAPIC, + "msi: dest=%x dest_mode=%x delivery_mode=%x " + "vector=%x trig_mode=%x\n", + dest, dest_mode, delivery_mode, vector, trig_mode); + + if ( !test_bit(_HVM_IRQ_DPCI_MSI, &hvm_irq_dpci->mirq[pirq].flags) ) + { + gdprintk(XENLOG_WARNING, "pirq %x not msi \n", pirq); + return 0; + } + + deliver_bitmask = vmsi_get_delivery_bitmask(d, dest, dest_mode); + if ( !deliver_bitmask ) + { + HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "ioapic deliver " + "no target on destination\n"); + return 0; + } + + switch ( delivery_mode ) + { + case dest_LowestPrio: + { + /* N.B. 
backport, from apic_lowest_prio, vector is not used */ + target = apic_round_robin(d, 0, deliver_bitmask); + if ( target != NULL ) + vmsi_inj_irq(d, target, vector, trig_mode, delivery_mode); + else + HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "null round robin: " + "mask=%x vector=%x delivery_mode=%x\n", + deliver_bitmask, vector, dest_LowestPrio); + break; + } + + case dest_Fixed: + case dest_ExtINT: + { + uint8_t bit; + for ( bit = 0; deliver_bitmask != 0; bit++ ) + { + if ( !(deliver_bitmask & (1 << bit)) ) + continue; + deliver_bitmask &= ~(1 << bit); + v = d->vcpu[bit]; + if ( v != NULL ) + { + target = vcpu_vlapic(v); + vmsi_inj_irq(d, target, vector, trig_mode, delivery_mode); + } + } + break; + } + + case dest_SMI: + case dest_NMI: + case dest_INIT: + case dest__reserved_2: + default: + gdprintk(XENLOG_WARNING, "Unsupported delivery mode %d\n", + delivery_mode); + break; + } + return 1; +} + +/* MSI-X mask bit hypervisor interception */ +struct msixtbl_entry +{ + struct list_head list; + atomic_t refcnt; /* how many bind_pt_irq called for the device */ + + /* TODO: resolve the potential race by destruction of pdev */ + struct pci_dev *pdev; + unsigned long gtable; /* gpa of msix table */ + unsigned long table_len; + unsigned long table_flags[MAX_MSIX_TABLE_ENTRIES / BITS_PER_LONG + 1]; + + struct rcu_head rcu; +}; + +static struct msixtbl_entry *msixtbl_find_entry( + struct vcpu *v, unsigned long addr) +{ + struct msixtbl_entry *entry; + struct domain *d = v->domain; + + list_for_each_entry( entry, &d->arch.hvm_domain.msixtbl_list, list ) + if ( addr >= entry->gtable && + addr < entry->gtable + entry->table_len ) + return entry; + + return NULL; +} + +static void __iomem *msixtbl_addr_to_virt( + struct msixtbl_entry *entry, unsigned long addr) +{ + int idx, nr_page; + + if ( !entry ) + return NULL; + + nr_page = (addr >> PAGE_SHIFT) - + (entry->gtable >> PAGE_SHIFT); + + if ( !entry->pdev ) + return NULL; + + idx = entry->pdev->msix_table_idx[nr_page]; + if ( !idx ) + return NULL; + + return (void *)(fix_to_virt(idx) + + (addr & ((1UL << PAGE_SHIFT) - 1))); +} + +static int msixtbl_read( + struct vcpu *v, unsigned long address, + unsigned long len, unsigned long *pval) +{ + unsigned long offset; + struct msixtbl_entry *entry; + void *virt; + int r = 0; + + rcu_read_lock(); + + if ( len != 4 ) + goto out; + + offset = address & (PCI_MSIX_ENTRY_SIZE - 1); + if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) + goto out; + + entry = msixtbl_find_entry(v, address); + virt = msixtbl_addr_to_virt(entry, address); + if ( !virt ) + goto out; + + *pval = readl(virt); + r = 1; + +out: + rcu_read_unlock(); + return r; +} + +static int msixtbl_write(struct vcpu *v, unsigned long address, + unsigned long len, unsigned long val) +{ + unsigned long offset; + struct msixtbl_entry *entry; + void *virt; + int nr_entry; + int r = 0; + + rcu_read_lock(); + + if ( len != 4 ) + goto out; + + entry = msixtbl_find_entry(v, address); + nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; + + offset = address & (PCI_MSIX_ENTRY_SIZE - 1); + if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) + { + set_bit(nr_entry, &entry->table_flags); + goto out; + } + + /* exit to device model if address/data has been modified */ + if ( test_and_clear_bit(nr_entry, &entry->table_flags) ) + goto out; + + virt = msixtbl_addr_to_virt(entry, address); + if ( !virt ) + goto out; + + writel(val, virt); + r = 1; + +out: + rcu_read_unlock(); + return r; +} + +static int msixtbl_range(struct vcpu *v, unsigned long addr) +{ + struct 
msixtbl_entry *entry; + void *virt; + + rcu_read_lock(); + + entry = msixtbl_find_entry(v, addr); + virt = msixtbl_addr_to_virt(entry, addr); + + rcu_read_unlock(); + + return !!virt; +} + +struct hvm_mmio_handler msixtbl_mmio_handler = { + .check_handler = msixtbl_range, + .read_handler = msixtbl_read, + .write_handler = msixtbl_write +}; + +static void add_msixtbl_entry(struct domain *d, + struct pci_dev *pdev, + uint64_t gtable, + struct msixtbl_entry *entry) +{ + u32 len; + + memset(entry, 0, sizeof(struct msixtbl_entry)); + + INIT_LIST_HEAD(&entry->list); + INIT_RCU_HEAD(&entry->rcu); + atomic_set(&entry->refcnt, 0); + + len = pci_msix_get_table_len(pdev); + entry->table_len = len; + entry->pdev = pdev; + entry->gtable = (unsigned long) gtable; + + list_add_rcu(&entry->list, &d->arch.hvm_domain.msixtbl_list); +} + +static void free_msixtbl_entry(struct rcu_head *rcu) +{ + struct msixtbl_entry *entry; + + entry = container_of (rcu, struct msixtbl_entry, rcu); + + xfree(entry); +} + +static void del_msixtbl_entry(struct msixtbl_entry *entry) +{ + list_del_rcu(&entry->list); + call_rcu(&entry->rcu, free_msixtbl_entry); +} + +int msixtbl_pt_register(struct domain *d, int pirq) +{ + irq_desc_t *irq_desc; + struct msi_desc *msi_desc; + struct pci_dev *pdev; + struct msixtbl_entry *entry, *new_entry; + int r = -EINVAL; + + ASSERT(spin_is_locked(&pcidevs_lock)); + + /* + * xmalloc() with irq_disabled causes the failure of check_lock() + * for xenpool->lock. So we allocate an entry beforehand. + */ + new_entry = xmalloc(struct msixtbl_entry); + if ( !new_entry ) + return -ENOMEM; + + irq_desc = domain_spin_lock_irq_desc(d, pirq, NULL); + if ( !irq_desc ) + { + xfree(new_entry); + return r; + } + + if ( irq_desc->handler != &pci_msi_type ) + goto out; + + msi_desc = irq_desc->msi_desc; + if ( !msi_desc ) + goto out; + + pdev = msi_desc->dev; + if ( !pdev->msix_table ) + { + r = 0; /* msix_table is not mandatory */ + goto out; + } + + spin_lock(&d->arch.hvm_domain.msixtbl_list_lock); + + list_for_each_entry( entry, &d->arch.hvm_domain.msixtbl_list, list ) + if ( pdev == entry->pdev ) + goto found; + + entry = new_entry; + new_entry = NULL; + add_msixtbl_entry(d, pdev, pdev->msix_table, entry); + +found: + atomic_inc(&entry->refcnt); + spin_unlock(&d->arch.hvm_domain.msixtbl_list_lock); + r = 0; + +out: + spin_unlock_irq(&irq_desc->lock); + xfree(new_entry); + return r; +} + +void msixtbl_pt_unregister(struct domain *d, int pirq) +{ + irq_desc_t *irq_desc; + struct msi_desc *msi_desc; + struct pci_dev *pdev; + struct msixtbl_entry *entry; + + ASSERT(spin_is_locked(&pcidevs_lock)); + + irq_desc = domain_spin_lock_irq_desc(d, pirq, NULL); + if ( !irq_desc ) + return; + + if ( irq_desc->handler != &pci_msi_type ) + goto out; + + msi_desc = irq_desc->msi_desc; + if ( !msi_desc ) + goto out; + + pdev = msi_desc->dev; + + spin_lock(&d->arch.hvm_domain.msixtbl_list_lock); + + list_for_each_entry( entry, &d->arch.hvm_domain.msixtbl_list, list ) + if ( pdev == entry->pdev ) + goto found; + + spin_unlock(&d->arch.hvm_domain.msixtbl_list_lock); + + +out: + spin_unlock_irq(&irq_desc->lock); + return; + +found: + if ( !atomic_dec_and_test(&entry->refcnt) ) + del_msixtbl_entry(entry); + + spin_unlock(&d->arch.hvm_domain.msixtbl_list_lock); + spin_unlock_irq(&irq_desc->lock); +} +void msixtbl_pt_cleanup(struct domain *d, int pirq) +{ + struct msixtbl_entry *entry, *temp; + unsigned long flags; + + /* msixtbl_list_lock must be acquired with irq_disabled for check_lock() */ + local_irq_save(flags); + 
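+ /* Annotation: with interrupts off, the loop below unlinks every remaining entry; del_msixtbl_entry() defers the actual free to an RCU callback via free_msixtbl_entry(). */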
spin_lock(&d->arch.hvm_domain.msixtbl_list_lock); + + list_for_each_entry_safe( entry, temp, + &d->arch.hvm_domain.msixtbl_list, list ) + del_msixtbl_entry(entry); + + spin_unlock(&d->arch.hvm_domain.msixtbl_list_lock); + local_irq_restore(flags); +} diff -Naurp xen/arch/x86/hvm/vmx/intr.c xen-redhat/arch/x86/hvm/vmx/intr.c --- xen/arch/x86/hvm/vmx/intr.c +++ xen-redhat/arch/x86/hvm/vmx/intr.c @@ -71,13 +71,38 @@ * the effect is cleared. (i.e., MOV-SS-blocking 'dominates' STI-blocking). */ -static void enable_irq_window(struct vcpu *v) +static void enable_intr_window(struct vcpu *v, enum hvm_intack intr_source) { - u32 *cpu_exec_control = &v->arch.hvm_vmx.exec_control; - - if ( !(*cpu_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) ) + u32 *cpu_exec_control = &v->arch.hvm_vcpu.u.vmx.exec_control; + u32 ctl = CPU_BASED_VIRTUAL_INTR_PENDING; + + ASSERT(intr_source != hvm_intack_none); + + if ( (intr_source == hvm_intack_nmi) && cpu_has_vmx_vnmi ) + { + /* + * We set MOV-SS blocking in lieu of STI blocking when delivering an + * NMI. This is because it is processor-specific whether STI-blocking + * blocks NMIs. Hence we *must* check for STI-blocking on NMI delivery + * (otherwise vmentry will fail on processors that check for STI- + * blocking) but if the processor does not check for STI-blocking then + * we may immediately vmexit and hance make no progress! + * (see SDM 3B 21.3, "Other Causes of VM Exits"). + */ + u32 intr_shadow = __vmread(GUEST_INTERRUPTIBILITY_INFO); + if ( intr_shadow & VMX_INTR_SHADOW_STI ) + { + /* Having both STI-blocking and MOV-SS-blocking fails vmentry. */ + intr_shadow &= ~VMX_INTR_SHADOW_STI; + intr_shadow |= VMX_INTR_SHADOW_MOV_SS; + __vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_shadow); + } + ctl = CPU_BASED_VIRTUAL_NMI_PENDING; + } + + if ( !(*cpu_exec_control & ctl) ) { - *cpu_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; + *cpu_exec_control |= ctl; __vmwrite(CPU_BASED_VM_EXEC_CONTROL, *cpu_exec_control); } } @@ -107,77 +132,51 @@ static void update_tpr_threshold(struct asmlinkage void vmx_intr_assist(void) { - int has_ext_irq, intr_vector, intr_type = 0; - unsigned long eflags, intr_shadow; + int intr_vector; + enum hvm_intack intr_source; struct vcpu *v = current; - unsigned int idtv_info_field; - unsigned long inst_len; + unsigned int intr_info; + /* Crank the handle on interrupt state. */ pt_update_irq(v); + hvm_dirq_assist(v); - hvm_set_callback_irq_level(); - - update_tpr_threshold(vcpu_vlapic(v)); - - has_ext_irq = cpu_has_pending_irq(v); - - if ( unlikely(v->arch.hvm_vmx.vector_injected) ) - { - v->arch.hvm_vmx.vector_injected = 0; - if ( unlikely(has_ext_irq) ) - enable_irq_window(v); - return; - } - - /* This could be moved earlier in the VMX resume sequence. */ - idtv_info_field = __vmread(IDT_VECTORING_INFO_FIELD); - if ( unlikely(idtv_info_field & INTR_INFO_VALID_MASK) ) - { - __vmwrite(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); + do { + intr_source = hvm_vcpu_has_pending_irq(v); + if ( likely(intr_source == hvm_intack_none) ) + goto out; /* - * Safe: the length will only be interpreted for software exceptions - * and interrupts. If we get here then delivery of some event caused a - * fault, and this always results in defined VM_EXIT_INSTRUCTION_LEN. + * An event is already pending or the pending interrupt is masked? + * Then the pending interrupt must be delayed. 
*/ - inst_len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe */ - __vmwrite(VM_ENTRY_INSTRUCTION_LEN, inst_len); + intr_info = __vmread(VM_ENTRY_INTR_INFO_FIELD); + if ( unlikely(intr_info & INTR_INFO_VALID_MASK) || + !hvm_interrupts_enabled(v, intr_source) ) + { + enable_intr_window(v, intr_source); + goto out; + } + } while ( !hvm_vcpu_ack_pending_irq(v, intr_source, &intr_vector) ); - if ( unlikely(idtv_info_field & 0x800) ) /* valid error code */ - __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, - __vmread(IDT_VECTORING_ERROR_CODE)); - if ( unlikely(has_ext_irq) ) - enable_irq_window(v); - - HVM_DBG_LOG(DBG_LEVEL_1, "idtv_info_field=%x", idtv_info_field); - return; - } - - if ( likely(!has_ext_irq) ) - return; - - intr_shadow = __vmread(GUEST_INTERRUPTIBILITY_INFO); - if ( unlikely(intr_shadow & (VMX_INTR_SHADOW_STI|VMX_INTR_SHADOW_MOV_SS)) ) + if ( intr_source == hvm_intack_nmi ) { - enable_irq_window(v); - HVM_DBG_LOG(DBG_LEVEL_1, "interruptibility"); - return; + vmx_inject_nmi(v); } - - eflags = __vmread(GUEST_RFLAGS); - if ( irq_masked(eflags) ) + else { - enable_irq_window(v); - return; + HVMTRACE_2D(INJ_VIRQ, v, intr_vector, /*fake=*/ 0); + vmx_inject_extint(v, intr_vector); + pt_intr_post(v, intr_vector, intr_source); } - intr_vector = cpu_get_interrupt(v, &intr_type); - BUG_ON(intr_vector < 0); - - HVMTRACE_2D(INJ_VIRQ, v, intr_vector, /*fake=*/ 0); - vmx_inject_extint(v, intr_vector, VMX_DELIVER_NO_ERROR_CODE); + /* Is there another IRQ to queue up behind this one? */ + intr_source = hvm_vcpu_has_pending_irq(v); + if ( unlikely(intr_source != hvm_intack_none) ) + enable_intr_window(v, intr_source); - pt_intr_post(v, intr_vector, intr_type); + out: + update_tpr_threshold(vcpu_vlapic(v)); } /* diff -Naurp xen/arch/x86/hvm/vmx/vmcs.c xen-redhat/arch/x86/hvm/vmx/vmcs.c --- xen/arch/x86/hvm/vmx/vmcs.c +++ xen-redhat/arch/x86/hvm/vmx/vmcs.c @@ -37,6 +37,9 @@ #include <xen/keyhandler.h> #include <asm/shadow.h> +static int opt_vpid_enabled = 1; +boolean_param("vpid", opt_vpid_enabled); + /* Dynamic (run-time adjusted) execution control flags. 
*/ u32 vmx_pin_based_exec_control __read_mostly; u32 vmx_cpu_based_exec_control __read_mostly; @@ -64,7 +67,7 @@ static u32 adjust_vmx_controls(u32 ctl_m } #define vmx_has_secondary_exec_ctls \ - (_vmx_cpu_based_exec_control & ACTIVATE_SECONDARY_CONTROLS) + (_vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) void vmx_init_vmcs_config(void) { @@ -75,21 +78,25 @@ void vmx_init_vmcs_config(void) u32 _vmx_vmexit_control; u32 _vmx_vmentry_control; + rdmsr(MSR_IA32_VMX_BASIC_MSR, vmx_msr_low, vmx_msr_high); + min = (PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING); - opt = 0; /*PIN_BASED_VIRTUAL_NMIS*/ + opt = PIN_BASED_VIRTUAL_NMIS; _vmx_pin_based_exec_control = adjust_vmx_controls( min, opt, MSR_IA32_VMX_PINBASED_CTLS_MSR); min = (CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING | CPU_BASED_MWAIT_EXITING | CPU_BASED_MOV_DR_EXITING | CPU_BASED_ACTIVATE_IO_BITMAP | CPU_BASED_USE_TSC_OFFSETING); - opt = CPU_BASED_ACTIVATE_MSR_BITMAP; - opt |= CPU_BASED_TPR_SHADOW; - opt |= ACTIVATE_SECONDARY_CONTROLS; + opt = (CPU_BASED_ACTIVATE_MSR_BITMAP | + CPU_BASED_TPR_SHADOW | + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); _vmx_cpu_based_exec_control = adjust_vmx_controls( min, opt, MSR_IA32_VMX_PROCBASED_CTLS_MSR); #ifdef __x86_64__ @@ -107,24 +114,44 @@ void vmx_init_vmcs_config(void) if ( vmx_has_secondary_exec_ctls ) { min = 0; - opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + + opt = (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_ENABLE_EPT); + if ( opt_vpid_enabled ) + opt |= SECONDARY_EXEC_ENABLE_VPID; _vmx_secondary_exec_control = adjust_vmx_controls( min, opt, MSR_IA32_VMX_PROCBASED_CTLS2); } + if ( _vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT ) + { + /* + * To use EPT we expect to be able to clear certain intercepts. + * We check VMX_BASIC_MSR[55] to correctly handle default1 controls. + */ + uint32_t must_be_one, must_be_zero, msr = MSR_IA32_VMX_PROCBASED_CTLS_MSR; + if ( vmx_msr_high & (1u << 23) ) + msr = MSR_IA32_VMX_TRUE_PROCBASED_CTLS; + rdmsr(msr, must_be_one, must_be_zero); + if ( must_be_one & (CPU_BASED_INVLPG_EXITING | + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING) ) + _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; + } + min = VM_EXIT_ACK_INTR_ON_EXIT; - opt = 0; + opt = VM_EXIT_SAVE_GUEST_PAT | VM_EXIT_LOAD_HOST_PAT; #ifdef __x86_64__ min |= VM_EXIT_IA32E_MODE; #endif _vmx_vmexit_control = adjust_vmx_controls( min, opt, MSR_IA32_VMX_EXIT_CTLS_MSR); - min = opt = 0; + min = 0; + opt = VM_ENTRY_LOAD_GUEST_PAT; _vmx_vmentry_control = adjust_vmx_controls( min, opt, MSR_IA32_VMX_ENTRY_CTLS_MSR); - rdmsr(MSR_IA32_VMX_BASIC_MSR, vmx_msr_low, vmx_msr_high); if ( smp_processor_id() == 0 ) { @@ -205,34 +232,69 @@ static void vmx_load_vmcs(struct vcpu *v this_cpu(current_vmcs) = v->arch.hvm_vmx.vmcs; } +struct foreign_vmcs { + struct vcpu *v; + unsigned int count; +}; +static DEFINE_PER_CPU(struct foreign_vmcs, foreign_vmcs); + void vmx_vmcs_enter(struct vcpu *v) { + struct foreign_vmcs *fv; + /* * NB. We must *always* run an HVM VCPU on its own VMCS, except for * vmx_vmcs_enter/exit critical regions. 
*/ - if ( v == current ) + if ( likely(v == current) ) return; - vcpu_pause(v); - spin_lock(&v->arch.hvm_vmx.vmcs_lock); + fv = &this_cpu(foreign_vmcs); - vmx_clear_vmcs(v); - vmx_load_vmcs(v); + if ( fv->v == v ) + { + BUG_ON(fv->count == 0); + } + else + { + BUG_ON(fv->v != NULL); + BUG_ON(fv->count != 0); + + vcpu_pause(v); + spin_lock(&v->arch.hvm_vmx.vmcs_lock); + + vmx_clear_vmcs(v); + vmx_load_vmcs(v); + + fv->v = v; + } + + fv->count++; } void vmx_vmcs_exit(struct vcpu *v) { - if ( v == current ) + struct foreign_vmcs *fv; + + if ( likely(v == current) ) return; - /* Don't confuse vmx_do_resume (for @v or @current!) */ - vmx_clear_vmcs(v); - if ( is_hvm_vcpu(current) ) - vmx_load_vmcs(current); + fv = &this_cpu(foreign_vmcs); + BUG_ON(fv->v != v); + BUG_ON(fv->count == 0); - spin_unlock(&v->arch.hvm_vmx.vmcs_lock); - vcpu_unpause(v); + if ( --fv->count == 0 ) + { + /* Don't confuse vmx_do_resume (for @v or @current!) */ + vmx_clear_vmcs(v); + if ( is_hvm_vcpu(current) ) + vmx_load_vmcs(current); + + spin_unlock(&v->arch.hvm_vmx.vmcs_lock); + vcpu_unpause(v); + + fv->v = NULL; + } } struct vmcs_struct *vmx_alloc_host_vmcs(void) @@ -273,27 +335,14 @@ struct host_execution_env { static void vmx_set_host_env(struct vcpu *v) { - unsigned int tr, cpu; - struct host_execution_env host_env; - struct Xgt_desc_struct desc; - - cpu = smp_processor_id(); - __asm__ __volatile__ ("sidt (%0) \n" :: "a"(&desc) : "memory"); - host_env.idtr_limit = desc.size; - host_env.idtr_base = desc.address; - __vmwrite(HOST_IDTR_BASE, host_env.idtr_base); - - __asm__ __volatile__ ("sgdt (%0) \n" :: "a"(&desc) : "memory"); - host_env.gdtr_limit = desc.size; - host_env.gdtr_base = desc.address; - __vmwrite(HOST_GDTR_BASE, host_env.gdtr_base); - - __asm__ __volatile__ ("str (%0) \n" :: "a"(&tr) : "memory"); - host_env.tr_selector = tr; - host_env.tr_limit = sizeof(struct tss_struct); - host_env.tr_base = (unsigned long) &init_tss[cpu]; - __vmwrite(HOST_TR_SELECTOR, host_env.tr_selector); - __vmwrite(HOST_TR_BASE, host_env.tr_base); + unsigned int cpu = smp_processor_id(); + + __vmwrite(HOST_GDTR_BASE, + (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY)); + __vmwrite(HOST_IDTR_BASE, (unsigned long)idt_tables[cpu]); + + __vmwrite(HOST_TR_SELECTOR, TSS_ENTRY << 3); + __vmwrite(HOST_TR_BASE, (unsigned long)&init_tss[cpu]); /* * Skip end of cpu_user_regs when entering the hypervisor because the @@ -306,6 +355,7 @@ static void vmx_set_host_env(struct vcpu static void construct_vmcs(struct vcpu *v) { + struct domain *d = v->domain; unsigned long cr0, cr4; union vmcs_arbytes arbytes; @@ -313,12 +363,31 @@ static void construct_vmcs(struct vcpu * /* VMCS controls. 
*/ __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control); + + v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control; + v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control; + + if ( paging_mode_hap(d) ) + { + v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING | + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING); + } + else + { + v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; + vmx_vmexit_control &= ~(VM_EXIT_SAVE_GUEST_PAT | + VM_EXIT_LOAD_HOST_PAT); + vmx_vmentry_control &= ~VM_ENTRY_LOAD_GUEST_PAT; + } + + __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control); __vmwrite(VM_EXIT_CONTROLS, vmx_vmexit_control); __vmwrite(VM_ENTRY_CONTROLS, vmx_vmentry_control); - __vmwrite(CPU_BASED_VM_EXEC_CONTROL, vmx_cpu_based_exec_control); - v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control; - if ( vmx_cpu_based_exec_control & ACTIVATE_SECONDARY_CONTROLS ) - __vmwrite(SECONDARY_VM_EXEC_CONTROL, vmx_secondary_exec_control); + + if ( cpu_has_vmx_secondary_exec_control ) + __vmwrite(SECONDARY_VM_EXEC_CONTROL, + v->arch.hvm_vmx.secondary_exec_control); if ( cpu_has_vmx_msr_bitmap ) __vmwrite(MSR_BITMAP, virt_to_maddr(vmx_msr_bitmap)); @@ -346,7 +415,7 @@ static void construct_vmcs(struct vcpu * /* Host control registers. */ __vmwrite(HOST_CR0, read_cr0() | X86_CR0_TS); - __vmwrite(HOST_CR4, read_cr4()); + __vmwrite(HOST_CR4, mmu_cr4_features); /* Host CS:RIP. */ __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS); @@ -428,7 +497,10 @@ static void construct_vmcs(struct vcpu * __vmwrite(VMCS_LINK_POINTER_HIGH, ~0UL); #endif - __vmwrite(EXCEPTION_BITMAP, HVM_TRAP_MASK | (1U << TRAP_page_fault)); + if ( paging_mode_hap(d) ) + __vmwrite(EXCEPTION_BITMAP, HVM_TRAP_MASK); + else + __vmwrite(EXCEPTION_BITMAP, HVM_TRAP_MASK | (1U << TRAP_page_fault)); /* Guest CR0. */ cr0 = read_cr0(); @@ -439,7 +511,14 @@ static void construct_vmcs(struct vcpu * /* Guest CR4. */ cr4 = read_cr4(); - __vmwrite(GUEST_CR4, cr4 & ~X86_CR4_PSE); + if ( paging_mode_hap(v->domain) ) + { + hvm_update_guest_cr(v, 0); + hvm_update_guest_cr(v, 4); + } + else + __vmwrite(GUEST_CR4, cr4 & ~X86_CR4_PSE); + v->arch.hvm_vmx.cpu_shadow_cr4 = cr4 & ~(X86_CR4_PGE | X86_CR4_VMXE | X86_CR4_PAE); __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4); @@ -454,11 +533,34 @@ static void construct_vmcs(struct vcpu * } #endif + if ( paging_mode_hap(d) ) + { + v->arch.hvm_vmx.ept_control.etmt = EPT_DEFAULT_MT; + v->arch.hvm_vmx.ept_control.gaw = EPT_DEFAULT_GAW; + v->arch.hvm_vmx.ept_control.asr = + pagetable_get_pfn(d->arch.phys_table); + __vmwrite(EPT_POINTER, v->arch.hvm_vmx.ept_control.eptp); +#ifdef CONFIG_X86_PAE + __vmwrite(EPT_POINTER_HIGH, v->arch.hvm_vmx.ept_control.eptp >> 32); +#endif + } + + if ( cpu_has_vmx_vpid ) + { + v->arch.hvm_vmx.vpid = v->vcpu_id + + v->domain->arch.hvm_domain.vmx_vpid_base; + __vmwrite(VIRTUAL_PROCESSOR_ID, v->arch.hvm_vmx.vpid); + } + /* Memory-mapped based VLAPIC TPR optimization. 
*/ if ( cpu_has_vmx_mmap_vtpr_optimization ) { __vmwrite(VIRTUAL_APIC_PAGE_ADDR, page_to_maddr(vcpu_vlapic(v)->regs_page)); +#if defined (CONFIG_X86_PAE) + __vmwrite(VIRTUAL_APIC_PAGE_ADDR_HIGH, + page_to_maddr(vcpu_vlapic(v)->regs_page) >> 32); +#endif __vmwrite(TPR_THRESHOLD, 0); vcpu_vlapic(v)->mmap_vtpr_enabled = 1; @@ -471,6 +573,21 @@ static void construct_vmcs(struct vcpu * __vmwrite(GUEST_TR_BASE, 0); __vmwrite(GUEST_TR_LIMIT, 0xff); + if ( cpu_has_vmx_pat && paging_mode_hap(d) ) + { + u64 host_pat, guest_pat; + + rdmsrl(MSR_IA32_CR_PAT, host_pat); + guest_pat = 0x7040600070406ULL; + + __vmwrite(HOST_PAT, host_pat); + __vmwrite(GUEST_PAT, guest_pat); +#ifdef __i386__ + __vmwrite(HOST_PAT_HIGH, host_pat >> 32); + __vmwrite(GUEST_PAT_HIGH, guest_pat >> 32); +#endif + } + vmx_vmcs_exit(v); paging_update_paging_modes(v); /* will update HOST & GUEST_CR3 as reqd */ @@ -533,6 +650,7 @@ void vmx_do_resume(struct vcpu *v) vmx_load_vmcs(v); hvm_migrate_timers(v); vmx_set_host_env(v); + vpid_sync_vcpu_all(v); } if ( !v->arch.hvm_vmx.launched && vcpu_vlapic(v)->mmap_vtpr_enabled ) @@ -545,6 +663,9 @@ void vmx_do_resume(struct vcpu *v) domain_crash_synchronous(); } __vmwrite(APIC_ACCESS_ADDR, page_to_maddr(pg)); +#if defined (CONFIG_X86_PAE) + __vmwrite(APIC_ACCESS_ADDR_HIGH, page_to_maddr(pg) >> 32); +#endif } debug_state = v->domain->debugger_attached; @@ -564,55 +685,142 @@ void vmx_do_resume(struct vcpu *v) reset_stack_and_jump(vmx_asm_do_vmentry); } -/* Dump a section of VMCS */ -static void print_section(char *header, uint32_t start, - uint32_t end, int incr) +static void vmx_dump_sel(char *name, enum x86_segment seg) { - uint32_t addr, j; - unsigned long val; - int code, rc; - char *fmt[4] = {"0x%04lx ", "0x%016lx ", "0x%08lx ", "0x%016lx "}; - char *err[4] = {"------ ", "------------------ ", - "---------- ", "------------------ "}; - - /* Find width of the field (encoded in bits 14:13 of address) */ - code = (start>>13)&3; - - if (header) - printk("\t %s", header); - - for (addr=start, j=0; addr<=end; addr+=incr, j++) { - - if (!(j&3)) - printk("\n\t\t0x%08x: ", addr); - - val = __vmread_safe(addr, &rc); - if (rc == 0) - printk(fmt[code], val); - else - printk("%s", err[code]); - } + struct segment_register sreg; + hvm_get_segment_register(current, seg, &sreg); + printk("%s: sel=0x%04x, attr=0x%04x, limit=0x%08x, base=0x%016llx\n", + name, sreg.sel, sreg.attr.bytes, sreg.limit, + (unsigned long long)sreg.base); +} - printk("\n"); +static unsigned long vmr(unsigned long field) +{ + int rc; + unsigned long val; + val = __vmread_safe(field, &rc); + return rc ? 
0 : val; } -/* Dump current VMCS */ void vmcs_dump_vcpu(void) { - print_section("16-bit Guest-State Fields", 0x800, 0x80e, 2); - print_section("16-bit Host-State Fields", 0xc00, 0xc0c, 2); - print_section("64-bit Control Fields", 0x2000, 0x2013, 1); - print_section("64-bit Guest-State Fields", 0x2800, 0x2803, 1); - print_section("32-bit Control Fields", 0x4000, 0x401c, 2); - print_section("32-bit RO Data Fields", 0x4400, 0x440e, 2); - print_section("32-bit Guest-State Fields", 0x4800, 0x482a, 2); - print_section("32-bit Host-State Fields", 0x4c00, 0x4c00, 2); - print_section("Natural 64-bit Control Fields", 0x6000, 0x600e, 2); - print_section("64-bit RO Data Fields", 0x6400, 0x640A, 2); - print_section("Natural 64-bit Guest-State Fields", 0x6800, 0x6826, 2); - print_section("Natural 64-bit Host-State Fields", 0x6c00, 0x6c16, 2); -} + unsigned long long x; + printk("*** Guest State ***\n"); + printk("CR0: actual=0x%016llx, shadow=0x%016llx, gh_mask=%016llx\n", + (unsigned long long)vmr(GUEST_CR0), + (unsigned long long)vmr(CR0_READ_SHADOW), + (unsigned long long)vmr(CR0_GUEST_HOST_MASK)); + printk("CR4: actual=0x%016llx, shadow=0x%016llx, gh_mask=%016llx\n", + (unsigned long long)vmr(GUEST_CR4), + (unsigned long long)vmr(CR4_READ_SHADOW), + (unsigned long long)vmr(CR4_GUEST_HOST_MASK)); + printk("CR3: actual=0x%016llx, target_count=%d\n", + (unsigned long long)vmr(GUEST_CR3), + (int)vmr(CR3_TARGET_COUNT)); + printk(" target0=%016llx, target1=%016llx\n", + (unsigned long long)vmr(CR3_TARGET_VALUE0), + (unsigned long long)vmr(CR3_TARGET_VALUE1)); + printk(" target2=%016llx, target3=%016llx\n", + (unsigned long long)vmr(CR3_TARGET_VALUE2), + (unsigned long long)vmr(CR3_TARGET_VALUE3)); + printk("RSP = 0x%016llx RIP = 0x%016llx\n", + (unsigned long long)vmr(GUEST_RSP), + (unsigned long long)vmr(GUEST_RIP)); + printk("RFLAGS=0x%016llx DR7 = 0x%016llx\n", + (unsigned long long)vmr(GUEST_DR7), + (unsigned long long)vmr(GUEST_RFLAGS)); + printk("Sysenter RSP=%016llx CS:RIP=%04x:%016llx\n", + (unsigned long long)vmr(GUEST_SYSENTER_ESP), + (int)vmr(GUEST_SYSENTER_CS), + (unsigned long long)vmr(GUEST_SYSENTER_EIP)); + vmx_dump_sel("CS", x86_seg_cs); + vmx_dump_sel("DS", x86_seg_ds); + vmx_dump_sel("SS", x86_seg_ss); + vmx_dump_sel("ES", x86_seg_es); + vmx_dump_sel("FS", x86_seg_fs); + vmx_dump_sel("GS", x86_seg_gs); + vmx_dump_sel("GDTR", x86_seg_gdtr); + vmx_dump_sel("LDTR", x86_seg_ldtr); + vmx_dump_sel("IDTR", x86_seg_idtr); + vmx_dump_sel("TR", x86_seg_tr); + x = (unsigned long long)vmr(TSC_OFFSET_HIGH) << 32; + x |= (uint32_t)vmr(TSC_OFFSET); + printk("TSC Offset = %016llx\n", x); + x = (unsigned long long)vmr(GUEST_IA32_DEBUGCTL) << 32; + x |= (uint32_t)vmr(GUEST_IA32_DEBUGCTL); + printk("DebugCtl=%016llx DebugExceptions=%016llx\n", x, + (unsigned long long)vmr(GUEST_PENDING_DBG_EXCEPTIONS)); + printk("Interruptibility=%04x ActivityState=%04x\n", + (int)vmr(GUEST_INTERRUPTIBILITY_INFO), + (int)vmr(GUEST_ACTIVITY_STATE)); + + printk("*** Host State ***\n"); + printk("RSP = 0x%016llx RIP = 0x%016llx\n", + (unsigned long long)vmr(HOST_RSP), + (unsigned long long)vmr(HOST_RIP)); + printk("CS=%04x DS=%04x ES=%04x FS=%04x GS=%04x SS=%04x TR=%04x\n", + (uint16_t)vmr(HOST_CS_SELECTOR), + (uint16_t)vmr(HOST_DS_SELECTOR), + (uint16_t)vmr(HOST_ES_SELECTOR), + (uint16_t)vmr(HOST_FS_SELECTOR), + (uint16_t)vmr(HOST_GS_SELECTOR), + (uint16_t)vmr(HOST_SS_SELECTOR), + (uint16_t)vmr(HOST_TR_SELECTOR)); + printk("FSBase=%016llx GSBase=%016llx TRBase=%016llx\n", + (unsigned long long)vmr(HOST_FS_BASE), + (unsigned 
long long)vmr(HOST_GS_BASE), + (unsigned long long)vmr(HOST_TR_BASE)); + printk("GDTBase=%016llx IDTBase=%016llx\n", + (unsigned long long)vmr(HOST_GDTR_BASE), + (unsigned long long)vmr(HOST_IDTR_BASE)); + printk("CR0=%016llx CR3=%016llx CR4=%016llx\n", + (unsigned long long)vmr(HOST_CR0), + (unsigned long long)vmr(HOST_CR3), + (unsigned long long)vmr(HOST_CR4)); + printk("Sysenter RSP=%016llx CS:RIP=%04x:%016llx\n", + (unsigned long long)vmr(HOST_IA32_SYSENTER_ESP), + (int)vmr(HOST_IA32_SYSENTER_CS), + (unsigned long long)vmr(HOST_IA32_SYSENTER_EIP)); + + printk("*** Control State ***\n"); + printk("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", + (uint32_t)vmr(PIN_BASED_VM_EXEC_CONTROL), + (uint32_t)vmr(CPU_BASED_VM_EXEC_CONTROL), + (uint32_t)vmr(SECONDARY_VM_EXEC_CONTROL)); + printk("EntryControls=%08x ExitControls=%08x\n", + (uint32_t)vmr(VM_ENTRY_CONTROLS), + (uint32_t)vmr(VM_EXIT_CONTROLS)); + printk("ExceptionBitmap=%08x\n", + (uint32_t)vmr(EXCEPTION_BITMAP)); + printk("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", + (uint32_t)vmr(VM_ENTRY_INTR_INFO_FIELD), + (uint32_t)vmr(VM_ENTRY_EXCEPTION_ERROR_CODE), + (uint32_t)vmr(VM_ENTRY_INSTRUCTION_LEN)); + printk("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", + (uint32_t)vmr(VM_EXIT_INTR_INFO), + (uint32_t)vmr(VM_EXIT_INTR_ERROR_CODE), + (uint32_t)vmr(VM_ENTRY_INSTRUCTION_LEN)); + printk(" reason=%08x qualification=%08x\n", + (uint32_t)vmr(VM_EXIT_REASON), + (uint32_t)vmr(EXIT_QUALIFICATION)); + printk("IDTVectoring: info=%08x errcode=%08x\n", + (uint32_t)vmr(IDT_VECTORING_INFO_FIELD), + (uint32_t)vmr(IDT_VECTORING_ERROR_CODE)); + printk("TPR Threshold = 0x%02x\n", + (uint32_t)vmr(TPR_THRESHOLD)); + printk("secondary exec control = 0x%08x\n", + (uint32_t)vmr(SECONDARY_VM_EXEC_CONTROL)); + printk("Guest PAT = 0x%08x%08x\n", + (uint32_t)vmr(GUEST_PAT_HIGH), (uint32_t)vmr(GUEST_PAT)); + printk("Host PAT = 0x%08x%08x\n", + (uint32_t)vmr(HOST_PAT_HIGH), (uint32_t)vmr(HOST_PAT)); + printk("EPT pointer = 0x%08x%08x\n", + (uint32_t)vmr(EPT_POINTER_HIGH), (uint32_t)vmr(EPT_POINTER)); + printk("virtual processor ID = 0x%04x\n", + (uint32_t)vmr(VIRTUAL_PROCESSOR_ID)); + +} static void vmcs_dump(unsigned char ch) { diff -Naurp xen/arch/x86/hvm/vmx/vmx.c xen-redhat/arch/x86/hvm/vmx/vmx.c --- xen/arch/x86/hvm/vmx/vmx.c +++ xen-redhat/arch/x86/hvm/vmx/vmx.c @@ -50,11 +50,25 @@ #include <asm/hvm/vpt.h> #include <public/hvm/save.h> #include <asm/hvm/trace.h> +#include <asm/paging.h> +#include <asm/debugger.h> char *vmx_msr_bitmap; static void vmx_ctxt_switch_from(struct vcpu *v); static void vmx_ctxt_switch_to(struct vcpu *v); +static int vmx_alloc_vpid(struct domain *d); +static void vmx_free_vpid(struct domain *d); + +static int vmx_domain_initialise(struct domain *d) +{ + return vmx_alloc_vpid(d); +} + +static void vmx_domain_destroy(struct domain *d) +{ + vmx_free_vpid(d); +} static int vmx_vcpu_initialise(struct vcpu *v) { @@ -79,6 +93,7 @@ static int vmx_vcpu_initialise(struct vc static void vmx_vcpu_destroy(struct vcpu *v) { + ept_sync_all(); vmx_destroy_vmcs(v); } @@ -176,20 +191,14 @@ static int long_mode_do_msr_read(struct case MSR_FS_BASE: msr_content = __vmread(GUEST_FS_BASE); - goto check_long_mode; + break; case MSR_GS_BASE: msr_content = __vmread(GUEST_GS_BASE); - goto check_long_mode; + break; case MSR_SHADOW_GS_BASE: - msr_content = v->arch.hvm_vmx.shadow_gs; - check_long_mode: - if ( !(vmx_long_mode_enabled(v)) ) - { - vmx_inject_hw_exception(v, TRAP_gp_fault, 0); - return 0; - } + rdmsrl(MSR_SHADOW_GS_BASE, msr_content); 
break; case MSR_STAR: @@ -274,9 +283,6 @@ static int long_mode_do_msr_write(struct case MSR_FS_BASE: case MSR_GS_BASE: case MSR_SHADOW_GS_BASE: - if ( !vmx_long_mode_enabled(v) ) - goto gp_fault; - if ( !is_canonical_address(msr_content) ) goto uncanonical_address; @@ -285,10 +291,7 @@ static int long_mode_do_msr_write(struct else if ( ecx == MSR_GS_BASE ) __vmwrite(GUEST_GS_BASE, msr_content); else - { - v->arch.hvm_vmx.shadow_gs = msr_content; wrmsrl(MSR_SHADOW_GS_BASE, msr_content); - } break; @@ -346,7 +349,10 @@ static void vmx_restore_host_msrs(void) static void vmx_save_guest_msrs(struct vcpu *v) { - /* MSR_SHADOW_GS_BASE may have been changed by swapgs instruction. */ + /* + * We cannot cache SHADOW_GS_BASE while the VCPU runs, as it can + * be updated at any time via SWAPGS, which we cannot trap. + */ rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs); } @@ -607,28 +613,109 @@ void vmx_vmcs_save(struct vcpu *v, struc c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP); c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP); - /* - * Save any event/interrupt that was being injected when we last - * exited. IDT_VECTORING_INFO_FIELD has priority, as anything in - * VM_ENTRY_INTR_INFO_FIELD is either a fault caused by the first - * event, which will happen the next time, or an interrupt, which we - * never inject when IDT_VECTORING_INFO_FIELD is valid. - */ - if ( (ev = __vmread(IDT_VECTORING_INFO_FIELD)) & INTR_INFO_VALID_MASK ) - { - c->pending_event = ev; - c->error_code = __vmread(IDT_VECTORING_ERROR_CODE); - } - else if ( (ev = __vmread(VM_ENTRY_INTR_INFO_FIELD)) & - INTR_INFO_VALID_MASK ) + c->pending_event = 0; + c->error_code = 0; + if ( ((ev = __vmread(VM_ENTRY_INTR_INFO_FIELD)) & INTR_INFO_VALID_MASK) && + vmx_event_needs_reinjection((ev >> 8) & 7, ev & 0xff) ) { c->pending_event = ev; c->error_code = __vmread(VM_ENTRY_EXCEPTION_ERROR_CODE); } - else + + vmx_vmcs_exit(v); +} + +/* the caller needs to check if the guest is switching to PAE mode */ +static void vmx_load_pdptrs(struct vcpu *v) +{ + uint64_t *guest_pdptrs; + unsigned long cr3 = v->arch.hvm_vmx.cpu_cr3, mfn; + char *p; + + if ( cr3 & 0x1fUL ) + goto crash; + + mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT)); + p = map_domain_page(mfn); + guest_pdptrs = (uint64_t *)(p + (cr3 & ~PAGE_MASK)); + + vmx_vmcs_enter(v); + + __vmwrite(GUEST_PDPTR0, guest_pdptrs[0]); + __vmwrite(GUEST_PDPTR1, guest_pdptrs[1]); + __vmwrite(GUEST_PDPTR2, guest_pdptrs[2]); + __vmwrite(GUEST_PDPTR3, guest_pdptrs[3]); +#ifdef __i386__ + __vmwrite(GUEST_PDPTR0_HIGH, guest_pdptrs[0] >> 32); + __vmwrite(GUEST_PDPTR1_HIGH, guest_pdptrs[1] >> 32); + __vmwrite(GUEST_PDPTR2_HIGH, guest_pdptrs[2] >> 32); + __vmwrite(GUEST_PDPTR3_HIGH, guest_pdptrs[3] >> 32); +#endif + + vmx_vmcs_exit(v); + unmap_domain_page(p); + return; + +crash: + domain_crash(v->domain); +} + +static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr) +{ + unsigned long cr4; + + if ( !hap_enabled(v->domain) ) + return; + + ASSERT((v == current) || !vcpu_runnable(v)); + + vmx_vmcs_enter(v); + + switch (cr) { - c->pending_event = 0; - c->error_code = 0; + case 0: + if ( vmx_paging_enabled(v) ) + v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING); + else + v->arch.hvm_vmx.exec_control |= (CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING); + + __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control); + break; + + case 3: + if ( vmx_paging_enabled(v) ) + { + if ( vmx_pae_enabled(v) && !vmx_long_mode_enabled(v) ) + 
vmx_load_pdptrs(v); + __vmwrite(GUEST_CR3, v->arch.hvm_vmx.cpu_cr3); + } + else + __vmwrite(GUEST_CR3, HVM_IDENT_PT_PAGE); + vpid_sync_vcpu_all(v); + break; + + case 4: + if ( vmx_paging_enabled(v) ) + { + cr4 = HVM_CR4_HOST_MASK & ~X86_CR4_PAE; + cr4 |= v->arch.hvm_vmx.cpu_shadow_cr4; + if ( vmx_pae_enabled(v) && !vmx_long_mode_enabled(v) ) + vmx_load_pdptrs(v); + } + else + { + cr4 = __vmread(GUEST_CR4) | HVM_CR4_HOST_MASK; + cr4 |= X86_CR4_PSE; + cr4 &= ~X86_CR4_PAE; + } + + __vmwrite(GUEST_CR4, cr4); + break; + + default: + BUG(); } vmx_vmcs_exit(v); @@ -636,7 +723,7 @@ void vmx_vmcs_save(struct vcpu *v, struc int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c) { - unsigned long mfn, old_base_mfn; + unsigned long mfn = 0, old_base_mfn; vmx_vmcs_enter(v); @@ -645,8 +732,13 @@ int vmx_vmcs_restore(struct vcpu *v, str __vmwrite(GUEST_RFLAGS, c->rflags); v->arch.hvm_vmx.cpu_cr0 = (c->cr0 | X86_CR0_PE | X86_CR0_PG | - X86_CR0_NE | X86_CR0_WP | X86_CR0_ET); + X86_CR0_NE | X86_CR0_ET); + + if ( paging_mode_shadow(v->domain) ) + v->arch.hvm_vmx.cpu_cr0 |= X86_CR0_WP; + __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0); + v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0; __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0); @@ -659,7 +751,7 @@ int vmx_vmcs_restore(struct vcpu *v, str __func__, c->cr3, c->cr0, c->cr4); #endif - if ( !vmx_paging_enabled(v) ) + if ( !vmx_paging_enabled(v) || paging_mode_hap(v->domain) ) { HVM_DBG_LOG(DBG_LEVEL_VMMU, "%s: paging not enabled.", __func__); goto skip_cr3; @@ -686,10 +778,14 @@ int vmx_vmcs_restore(struct vcpu *v, str if ( vmx_long_mode_enabled(v) ) vmx_enable_long_mode(v); - __vmwrite(GUEST_CR4, (c->cr4 | HVM_CR4_HOST_MASK)); v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4; __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4); + if ( paging_mode_shadow(v->domain) ) + __vmwrite(GUEST_CR4, (c->cr4 | HVM_CR4_HOST_MASK)); + else + v->arch.hvm_vmx.cpu_cr3 = c->cr3; + __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit); __vmwrite(GUEST_IDTR_BASE, c->idtr_base); @@ -746,36 +842,18 @@ int vmx_vmcs_restore(struct vcpu *v, str paging_update_paging_modes(v); - if ( c->pending_valid ) + if ( paging_mode_hap(v->domain) ) { - vmx_vmcs_enter(v); + vmx_update_guest_cr(v, 0); + vmx_update_guest_cr(v, 3); + vmx_update_guest_cr(v, 4); + } + if ( c->pending_valid ) + { gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n", c->pending_event, c->error_code); - /* SVM uses type 3 ("Exception") for #OF and #BP; VMX uses type 6 */ - if ( (c->pending_type == 3) && - ((c->pending_vector == 3) || (c->pending_vector == 4)) ) - c->pending_type = 6; - - /* For software exceptions, we need to tell the hardware the - * instruction length as well (hmmm). 
*/ - if ( c->pending_type > 4 ) - { - int addrbytes, ilen; - if ( (c->cs_arbytes & X86_SEG_AR_CS_LM_ACTIVE) && - (c->msr_efer & EFER_LMA) ) - addrbytes = 8; - else if ( c->cs_arbytes & X86_SEG_AR_DEF_OP_SIZE ) - addrbytes = 4; - else - addrbytes = 2; - - ilen = hvm_instruction_fetch(c->rip, addrbytes, NULL); - __vmwrite(VM_ENTRY_INSTRUCTION_LEN, ilen); - } - - /* Sanity check */ if ( (c->pending_type == 1) || (c->pending_type > 6) || (c->pending_reserved != 0) ) { @@ -784,12 +862,13 @@ int vmx_vmcs_restore(struct vcpu *v, str return -EINVAL; } - /* Re-inject the exception */ - __vmwrite(VM_ENTRY_INTR_INFO_FIELD, c->pending_event); - __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, c->error_code); - v->arch.hvm_vmx.vector_injected = 1; - - vmx_vmcs_exit(v); + if ( vmx_event_needs_reinjection(c->pending_type, c->pending_vector) ) + { + vmx_vmcs_enter(v); + __vmwrite(VM_ENTRY_INTR_INFO_FIELD, c->pending_event); + __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, c->error_code); + vmx_vmcs_exit(v); + } } return 0; @@ -825,7 +904,7 @@ static void vmx_save_cpu_state(struct vc data->msr_syscall_mask = guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK]; #endif - data->tsc = hvm_get_guest_time(v); + data->tsc = hvm_get_guest_tsc(v); dump_msr_state(guest_state); } @@ -847,7 +926,7 @@ static void vmx_load_cpu_state(struct vc v->arch.hvm_vmx.vmxassist_enabled = !(data->cr0 & X86_CR0_PE); - hvm_set_guest_time(v, data->tsc); + hvm_set_guest_tsc(v, data->tsc); dump_msr_state(guest_state); } @@ -882,6 +961,10 @@ static void vmx_ctxt_switch_from(struct static void vmx_ctxt_switch_to(struct vcpu *v) { + /* HOST_CR4 in VMCS is always mmu_cr4_features. Sync CR4 now. */ + if ( unlikely(read_cr4() != mmu_cr4_features) ) + write_cr4(mmu_cr4_features); + vmx_restore_guest_msrs(v); vmx_restore_dr(v); } @@ -1005,7 +1088,7 @@ static unsigned long vmx_get_segment_bas static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg, struct segment_register *reg) { - u16 attr = 0; + uint32_t attr = 0; ASSERT(v == current); @@ -1074,6 +1157,133 @@ static void vmx_get_segment_register(str reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00); } +static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg, + struct segment_register *reg) +{ + uint32_t attr; + + ASSERT(v == current); + + attr = reg->attr.bytes; + attr = ((attr & 0xf00) << 4) | (attr & 0xff); + + /* Not-present must mean unusable. 
*/ + if ( !reg->attr.fields.p ) + attr |= (1u << 16); + + switch ( seg ) + { + case x86_seg_cs: + __vmwrite(GUEST_CS_SELECTOR, reg->sel); + __vmwrite(GUEST_CS_LIMIT, reg->limit); + __vmwrite(GUEST_CS_BASE, reg->base); + __vmwrite(GUEST_CS_AR_BYTES, attr); + guest_cpu_user_regs()->cs = reg->sel; + break; + case x86_seg_ds: + __vmwrite(GUEST_DS_SELECTOR, reg->sel); + __vmwrite(GUEST_DS_LIMIT, reg->limit); + __vmwrite(GUEST_DS_BASE, reg->base); + __vmwrite(GUEST_DS_AR_BYTES, attr); + break; + case x86_seg_es: + __vmwrite(GUEST_ES_SELECTOR, reg->sel); + __vmwrite(GUEST_ES_LIMIT, reg->limit); + __vmwrite(GUEST_ES_BASE, reg->base); + __vmwrite(GUEST_ES_AR_BYTES, attr); + break; + case x86_seg_fs: + __vmwrite(GUEST_FS_SELECTOR, reg->sel); + __vmwrite(GUEST_FS_LIMIT, reg->limit); + __vmwrite(GUEST_FS_BASE, reg->base); + __vmwrite(GUEST_FS_AR_BYTES, attr); + break; + case x86_seg_gs: + __vmwrite(GUEST_GS_SELECTOR, reg->sel); + __vmwrite(GUEST_GS_LIMIT, reg->limit); + __vmwrite(GUEST_GS_BASE, reg->base); + __vmwrite(GUEST_GS_AR_BYTES, attr); + break; + case x86_seg_ss: + __vmwrite(GUEST_SS_SELECTOR, reg->sel); + __vmwrite(GUEST_SS_LIMIT, reg->limit); + __vmwrite(GUEST_SS_BASE, reg->base); + __vmwrite(GUEST_SS_AR_BYTES, attr); + guest_cpu_user_regs()->ss = reg->sel; + break; + case x86_seg_tr: + __vmwrite(GUEST_TR_SELECTOR, reg->sel); + __vmwrite(GUEST_TR_LIMIT, reg->limit); + __vmwrite(GUEST_TR_BASE, reg->base); + __vmwrite(GUEST_TR_AR_BYTES, attr); + break; + case x86_seg_gdtr: + __vmwrite(GUEST_GDTR_LIMIT, reg->limit); + __vmwrite(GUEST_GDTR_BASE, reg->base); + break; + case x86_seg_idtr: + __vmwrite(GUEST_IDTR_LIMIT, reg->limit); + __vmwrite(GUEST_IDTR_BASE, reg->base); + break; + case x86_seg_ldtr: + __vmwrite(GUEST_LDTR_SELECTOR, reg->sel); + __vmwrite(GUEST_LDTR_LIMIT, reg->limit); + __vmwrite(GUEST_LDTR_BASE, reg->base); + __vmwrite(GUEST_LDTR_AR_BYTES, attr); + break; + default: + BUG(); + } +} + +static int vmx_set_cr3(unsigned long value) +{ + struct vcpu *v = current; + unsigned long mfn, old_base_mfn; + + /* + * If paging is not enabled yet, simply copy the value to CR3. + */ + if ( !vmx_paging_enabled(v) || paging_mode_hap(v->domain) ) + { + v->arch.hvm_vmx.cpu_cr3 = value; + return X86EMUL_OKAY; + } + + /* + * We make a new one if the shadow does not exist. + */ + if ( value == v->arch.hvm_vmx.cpu_cr3 ) { + /* + * This is simple TLB flush, implying the guest has + * removed some translation or changed page attributes. + * We simply invalidate the shadow. + */ + mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT); + if ( mfn != pagetable_get_pfn(v->arch.guest_table) ) + return X86EMUL_UNHANDLEABLE; + paging_update_cr3(v); + } else { + /* + * If different, make a shadow. Check if the PDBR is valid + * first. 
+ */ + HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value); + mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT); + if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) ) + return X86EMUL_UNHANDLEABLE; + old_base_mfn = pagetable_get_pfn(v->arch.guest_table); + v->arch.guest_table = pagetable_from_pfn(mfn); + if ( old_base_mfn ) + put_page(mfn_to_page(old_base_mfn)); + v->arch.hvm_vmx.cpu_cr3 = value; + update_cr3(v); + HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value); + } + + return X86EMUL_OKAY; +} + /* Make sure that xen intercepts any FP accesses from current */ static void vmx_stts(struct vcpu *v) { @@ -1135,16 +1345,29 @@ static void vmx_init_hypercall_page(stru *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */ } -static int vmx_interrupts_enabled(struct vcpu *v) +static int vmx_interrupts_enabled(struct vcpu *v, enum hvm_intack type) { - unsigned long eflags = __vmread(GUEST_RFLAGS); - return !irq_masked(eflags); + unsigned long intr_shadow, eflags; + + ASSERT(v == current); + + intr_shadow = __vmread(GUEST_INTERRUPTIBILITY_INFO); + + if ( type == hvm_intack_nmi ) + return !(intr_shadow & (VMX_INTR_SHADOW_STI| + VMX_INTR_SHADOW_MOV_SS| + VMX_INTR_SHADOW_NMI)); + + ASSERT((type == hvm_intack_pic) || (type == hvm_intack_lapic)); + eflags = __vmread(GUEST_RFLAGS); + return ((eflags & X86_EFLAGS_IF) && + !(intr_shadow & (VMX_INTR_SHADOW_STI|VMX_INTR_SHADOW_MOV_SS))); } static void vmx_update_host_cr3(struct vcpu *v) { - ASSERT( (v == current) || !vcpu_runnable(v) ); + ASSERT((v == current) || !vcpu_runnable(v)); vmx_vmcs_enter(v); __vmwrite(HOST_CR3, v->arch.cr3); vmx_vmcs_exit(v); @@ -1152,17 +1375,22 @@ static void vmx_update_host_cr3(struct v static void vmx_update_guest_cr3(struct vcpu *v) { - ASSERT( (v == current) || !vcpu_runnable(v) ); + ASSERT((v == current) || !vcpu_runnable(v)); vmx_vmcs_enter(v); __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3); + vpid_sync_vcpu_all(v); vmx_vmcs_exit(v); } static void vmx_flush_guest_tlbs(void) { - /* No tagged TLB support on VMX yet. The fact that we're in Xen - * at all means any guest will have a clean TLB when it's next run, - * because VMRESUME will flush it for us. */ + /* If VPID (i.e. tagged TLB support) is not enabled, the fact that + * we're in Xen at all means any guest will have a clean TLB when + * it's next run, because VMRESUME will flush it for us. 
+ * + * If enabled, we invalidate all translations associated with all + * VPID values */ + vpid_sync_all(); } static void vmx_inject_exception( @@ -1179,14 +1407,10 @@ static void vmx_update_vtpr(struct vcpu /* VMX doesn't have a V_TPR field */ } -static int vmx_event_injection_faulted(struct vcpu *v) +static int vmx_event_pending(struct vcpu *v) { - unsigned int idtv_info_field; - ASSERT(v == current); - - idtv_info_field = __vmread(IDT_VECTORING_INFO_FIELD); - return (idtv_info_field & INTR_INFO_VALID_MASK); + return (__vmread(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK); } static void disable_intercept_for_msr(u32 msr) @@ -1212,6 +1436,8 @@ static void disable_intercept_for_msr(u3 static struct hvm_function_table vmx_function_table = { .name = "VMX", .disable = stop_vmx, + .domain_initialise = vmx_domain_initialise, + .domain_destroy = vmx_domain_destroy, .vcpu_initialise = vmx_vcpu_initialise, .vcpu_destroy = vmx_vcpu_destroy, .store_cpu_guest_regs = vmx_store_cpu_guest_regs, @@ -1227,8 +1453,10 @@ static struct hvm_function_table vmx_fun .get_guest_ctrl_reg = vmx_get_ctrl_reg, .get_segment_base = vmx_get_segment_base, .get_segment_register = vmx_get_segment_register, + .set_segment_register = vmx_set_segment_register, .update_host_cr3 = vmx_update_host_cr3, .update_guest_cr3 = vmx_update_guest_cr3, + .set_cr3 = vmx_set_cr3, .flush_guest_tlbs = vmx_flush_guest_tlbs, .update_vtpr = vmx_update_vtpr, .stts = vmx_stts, @@ -1236,9 +1464,13 @@ static struct hvm_function_table vmx_fun .inject_exception = vmx_inject_exception, .init_ap_context = vmx_init_ap_context, .init_hypercall_page = vmx_init_hypercall_page, - .event_injection_faulted = vmx_event_injection_faulted + .event_pending = vmx_event_pending, + .update_guest_cr = vmx_update_guest_cr }; +static unsigned long *vpid_bitmap; +#define VPID_BITMAP_SIZE ((1u << VMCS_VPID_WIDTH) / MAX_VIRT_CPUS) + int start_vmx(void) { u32 eax, edx; @@ -1291,6 +1523,26 @@ int start_vmx(void) return 0; } + vmx_function_table.hap_supported = cpu_has_vmx_ept; + vmx_function_table.hap_1gb_pgtb = 0; + + ept_sync_all(); + + vpid_sync_all(); + + if ( cpu_has_vmx_vpid ) + { + printk("VMX: VPID is available.\n"); + + vpid_bitmap = xmalloc_array( + unsigned long, BITS_TO_LONGS(VPID_BITMAP_SIZE)); + BUG_ON(vpid_bitmap == NULL); + memset(vpid_bitmap, 0, BITS_TO_LONGS(VPID_BITMAP_SIZE) * sizeof(long)); + + /* VPID 0 is used by VMX root mode (the hypervisor). */ + __set_bit(0, vpid_bitmap); + } + vmx_save_host_msrs(); if ( smp_processor_id() != 0 ) @@ -1311,11 +1563,44 @@ int start_vmx(void) disable_intercept_for_msr(MSR_IA32_SYSENTER_CS); disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP); disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP); + if ( cpu_has_vmx_pat && + vmx_function_table.hap_supported ) + disable_intercept_for_msr(MSR_IA32_CR_PAT); } return 1; } +static int vmx_alloc_vpid(struct domain *d) +{ + int idx; + + if ( !cpu_has_vmx_vpid ) + return 0; + + do { + idx = find_first_zero_bit(vpid_bitmap, VPID_BITMAP_SIZE); + if ( idx >= VPID_BITMAP_SIZE ) + { + dprintk(XENLOG_WARNING, "VMX VPID space exhausted.\n"); + return -EBUSY; + } + } + while ( test_and_set_bit(idx, vpid_bitmap) ); + + d->arch.hvm_domain.vmx_vpid_base = idx * MAX_VIRT_CPUS; + return 0; +} + +static void vmx_free_vpid(struct domain *d) +{ + if ( !cpu_has_vmx_vpid ) + return; + + clear_bit(d->arch.hvm_domain.vmx_vpid_base / MAX_VIRT_CPUS, vpid_bitmap); +} + + /* * Not all cases receive valid value in the VM-exit instruction length field. * Callers must know what they're doing! 
@@ -1391,7 +1676,7 @@ static void vmx_do_cpuid(struct cpu_user if ( (value & 7) || (mfn == INVALID_MFN) || !v->arch.hvm_vmx.vmxassist_enabled ) { - domain_crash(v->domain); + hvm_inject_exception(TRAP_gp_fault, 0, 0); return; } @@ -1405,6 +1690,10 @@ static void vmx_do_cpuid(struct cpu_user } else { hvm_cpuid(input, &eax, &ebx, &ecx, &edx); + /* don't support features > 0xa */ + if (unlikely(input == 0x0) && eax > 0xa) + eax = 0xa; + if ( input == 0x00000001 ) { /* Mask off reserved bits. */ @@ -1416,14 +1705,17 @@ static void vmx_do_cpuid(struct cpu_user ecx &= ~(bitmaskof(X86_FEATURE_VMXE) | bitmaskof(X86_FEATURE_EST) | bitmaskof(X86_FEATURE_TM2) | - bitmaskof(X86_FEATURE_CID)); + bitmaskof(X86_FEATURE_CID) | + bitmaskof(X86_FEATURE_XSAVE)| + bitmaskof(X86_FEATURE_OSXSAVE)); edx &= ~(bitmaskof(X86_FEATURE_HT) | bitmaskof(X86_FEATURE_ACPI) | bitmaskof(X86_FEATURE_ACC)); } - if ( input == 0x00000006 || input == 0x00000009 || input == 0x0000000A ) + if ( input == 0x00000006 || input == 0x00000009 || + input == 0x0000000A || input == 0x0000000B ) eax = ebx = ecx = edx = 0x0; } @@ -1488,7 +1780,8 @@ static void vmx_do_invlpg(unsigned long * We do the safest things first, then try to update the shadow * copying from guest */ - paging_invlpg(v, va); + if ( paging_invlpg(v, va) ) + vpid_sync_vcpu_gva(v, va); } /* @@ -1974,7 +2267,7 @@ static int vmx_world_restore(struct vcpu v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0; __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0); - if ( !vmx_paging_enabled(v) ) + if ( !vmx_paging_enabled(v) || paging_mode_hap(v->domain) ) goto skip_cr3; if ( c->cr3 == v->arch.hvm_vmx.cpu_cr3 ) @@ -2011,10 +2304,18 @@ static int vmx_world_restore(struct vcpu else HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3); - __vmwrite(GUEST_CR4, (c->cr4 | HVM_CR4_HOST_MASK)); v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4; __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4); + if ( paging_mode_shadow(v->domain) ) + __vmwrite(GUEST_CR4, (c->cr4 | HVM_CR4_HOST_MASK)); + else + { + v->arch.hvm_vmx.cpu_cr3 = c->cr3; + vmx_update_guest_cr(v, 3); + vmx_update_guest_cr(v, 4); + } + __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit); __vmwrite(GUEST_IDTR_BASE, c->idtr_base); @@ -2153,10 +2454,11 @@ static int vmx_assist(struct vcpu *v, in static int vmx_set_cr0(unsigned long value) { struct vcpu *v = current; - unsigned long mfn; + struct domain *d = v->domain; unsigned long eip; int paging_enabled; unsigned long old_cr0; + unsigned long mfn; unsigned long old_base_mfn; HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value); @@ -2181,12 +2483,23 @@ static int vmx_set_cr0(unsigned long val paging_enabled = old_cr0 & X86_CR0_PG; v->arch.hvm_vmx.cpu_cr0 = (value | X86_CR0_PE | X86_CR0_PG - | X86_CR0_NE | X86_CR0_WP); + | X86_CR0_NE); + + if ( paging_mode_shadow(d) ) + v->arch.hvm_vmx.cpu_cr0 |= X86_CR0_WP; + __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0); v->arch.hvm_vmx.cpu_shadow_cr0 = value; __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0); + if ( paging_mode_hap(d) ) + { + vmx_update_guest_cr(v, 0); + vmx_update_guest_cr(v, 3); + vmx_update_guest_cr(v, 4); + } + /* Trying to enable paging. */ if ( (value & X86_CR0_PE) && (value & X86_CR0_PG) && !paging_enabled ) { @@ -2207,37 +2520,40 @@ static int vmx_set_cr0(unsigned long val /* * The guest CR3 must be pointing to the guest physical. 
*/ - mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT); - if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) ) + if ( paging_mode_shadow(v->domain) ) { - gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n", - v->arch.hvm_vmx.cpu_cr3, mfn); - domain_crash(v->domain); - return 0; - } + mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT); + if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) ) + { + gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n", + v->arch.hvm_vmx.cpu_cr3, mfn); + domain_crash(v->domain); + return 0; + } - /* - * Now arch.guest_table points to machine physical. - */ - old_base_mfn = pagetable_get_pfn(v->arch.guest_table); - v->arch.guest_table = pagetable_from_pfn(mfn); - if ( old_base_mfn ) - put_page(mfn_to_page(old_base_mfn)); + /* + * Now arch.guest_table points to machine physical. + */ + old_base_mfn = pagetable_get_pfn(v->arch.guest_table); + v->arch.guest_table = pagetable_from_pfn(mfn); + if ( old_base_mfn ) + put_page(mfn_to_page(old_base_mfn)); - paging_update_paging_modes(v); + HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", + (unsigned long) (mfn << PAGE_SHIFT)); - HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", - (unsigned long) (mfn << PAGE_SHIFT)); + HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx", + v->arch.hvm_vmx.cpu_cr3, mfn); + } - HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx", - v->arch.hvm_vmx.cpu_cr3, mfn); + paging_update_paging_modes(v); } /* Trying to disable paging. */ if ( ((value & (X86_CR0_PE | X86_CR0_PG)) != (X86_CR0_PE | X86_CR0_PG)) && paging_enabled ) { - if ( v->arch.hvm_vmx.cpu_cr3 ) + if ( v->arch.hvm_vmx.cpu_cr3 && paging_mode_shadow(v->domain) ) { put_page(mfn_to_page(get_mfn_from_gpfn( v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT))); @@ -2316,7 +2632,7 @@ static int vmx_set_cr0(unsigned long val */ static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs) { - unsigned long value, old_cr, old_base_mfn, mfn; + unsigned long value, old_cr; struct vcpu *v = current; struct vlapic *vlapic = vcpu_vlapic(v); @@ -2348,45 +2664,8 @@ static int mov_to_cr(int gp, int cr, str return vmx_set_cr0(value); case 3: - /* - * If paging is not enabled yet, simply copy the value to CR3. - */ - if ( !vmx_paging_enabled(v) ) - { - v->arch.hvm_vmx.cpu_cr3 = value; - break; - } - - /* - * We make a new one if the shadow does not exist. - */ - if ( value == v->arch.hvm_vmx.cpu_cr3 ) { - /* - * This is simple TLB flush, implying the guest has - * removed some translation or changed page attributes. - * We simply invalidate the shadow. - */ - mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT); - if ( mfn != pagetable_get_pfn(v->arch.guest_table) ) - goto bad_cr3; - paging_update_cr3(v); - } else { - /* - * If different, make a shadow. Check if the PDBR is valid - * first. 
- */ - HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value); - mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT); - if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) ) - goto bad_cr3; - old_base_mfn = pagetable_get_pfn(v->arch.guest_table); - v->arch.guest_table = pagetable_from_pfn(mfn); - if ( old_base_mfn ) - put_page(mfn_to_page(old_base_mfn)); - v->arch.hvm_vmx.cpu_cr3 = value; - update_cr3(v); - HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value); - } + if (vmx_set_cr3(value) != X86EMUL_OKAY) + goto bad_cr3; break; case 4: /* CR4 */ @@ -2403,7 +2682,7 @@ static int mov_to_cr(int gp, int cr, str if ( (value & X86_CR4_PAE) && !(old_cr & X86_CR4_PAE) ) { - if ( vmx_pgbit_test(v) ) + if ( vmx_pgbit_test(v) && paging_mode_shadow(v->domain) ) { /* The guest is a 32-bit PAE guest. */ #if CONFIG_PAGING_LEVELS >= 3 @@ -2441,10 +2720,17 @@ static int mov_to_cr(int gp, int cr, str } } - __vmwrite(GUEST_CR4, value | HVM_CR4_HOST_MASK); v->arch.hvm_vmx.cpu_shadow_cr4 = value; __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4); + if ( paging_mode_shadow(v->domain) ) + __vmwrite(GUEST_CR4, (value | HVM_CR4_HOST_MASK)); + else + { + vmx_update_guest_cr(v, 3); + vmx_update_guest_cr(v, 4); + } + /* * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates * all TLB entries except global entries. @@ -2572,7 +2858,7 @@ static int vmx_do_msr_read(struct cpu_us switch ( ecx ) { case MSR_IA32_TIME_STAMP_COUNTER: - msr_content = hvm_get_guest_time(v); + msr_content = hvm_get_guest_tsc(v); break; case MSR_IA32_SYSENTER_CS: msr_content = (u32)__vmread(GUEST_SYSENTER_CS); @@ -2655,8 +2941,9 @@ struct page_info * change_guest_physmap_ mfn = page_to_mfn(pg); d->arch.hvm_domain.apic_access_page = pg; + d->arch.hvm_domain.vmx_apic_access_mfn = mfn; - guest_physmap_add_page(d, pfn, mfn); + guest_physmap_add_page(d, pfn, mfn, 0); d->arch.hvm_domain.physmap_changed_for_vlapic_access = 1; @@ -2667,7 +2954,7 @@ struct page_info * change_guest_physmap_ if ( d->arch.hvm_domain.physmap_changed_for_vlapic_access ) { mfn = page_to_mfn(pg); - guest_physmap_remove_page(d, pfn, mfn); + guest_physmap_remove_page(d, pfn, mfn, 0); flush_tlb_mask(d->domain_dirty_cpumask); d->arch.hvm_domain.physmap_changed_for_vlapic_access = 0; @@ -2716,7 +3003,7 @@ static void check_vlapic_msr_for_vtpr(st vcpu_vlapic(v)->mmap_vtpr_enabled = 1; v->arch.hvm_vcpu.u.vmx.exec_control |= - ( ACTIVATE_SECONDARY_CONTROLS | CPU_BASED_TPR_SHADOW ); + ( CPU_BASED_ACTIVATE_SECONDARY_CONTROLS | CPU_BASED_TPR_SHADOW ); __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vcpu.u.vmx.exec_control); tmp = __vmread(SECONDARY_VM_EXEC_CONTROL); @@ -2752,7 +3039,7 @@ static int vmx_do_msr_write(struct cpu_u switch ( ecx ) { case MSR_IA32_TIME_STAMP_COUNTER: - hvm_set_guest_time(v, msr_content); + hvm_set_guest_tsc(v, msr_content); pt_reset(v); break; case MSR_IA32_SYSENTER_CS: @@ -2843,45 +3130,60 @@ static void vmx_do_extint(struct cpu_use } } -static void vmx_reflect_exception(struct vcpu *v) +static void ept_handle_violation(unsigned long qualification, paddr_t gpa) { - int error_code, intr_info, vector; - - intr_info = __vmread(VM_EXIT_INTR_INFO); - vector = intr_info & 0xff; - if ( intr_info & INTR_INFO_DELIVER_CODE_MASK ) - error_code = __vmread(VM_EXIT_INTR_ERROR_CODE); - else - error_code = VMX_DELIVER_NO_ERROR_CODE; + unsigned long gla_validity = qualification & EPT_GLA_VALIDITY_MASK; + struct domain *d = current->domain; + u64 gfn = gpa >> PAGE_SHIFT; + mfn_t mfn; + p2m_type_t t; + + /* GPA exceeds GAW. 
*/ + if ( unlikely(qualification & EPT_GAW_VIOLATION) ) + { + printk("EPT violation: guest physical address %"PRIpaddr" exceeded " + "its width limit.\n", gpa); + domain_crash(d); + } + + /* The validity of the guest-linear address field has 4 values: + * 00 - EPT_GLA_VALIDITY_PDPTR_LOAD + * 01 - EPT_GLA_VALIDITY_GPT_WALK + * 10 - EPT_GLA_VALIDITY_RSVD + * 11 - EPT_GLA_VALIDITY_MATCH + * + * 11 is the normal case, and 01 also covers the situation where a + * no-write EPT page is encountered while trying to set an A or D + * bit. This may occur when we are in log-dirty mode. + */ -#ifndef NDEBUG + if ( gla_validity == EPT_GLA_VALIDITY_RSVD || + gla_validity == EPT_GLA_VALIDITY_PDPTR_LOAD ) { - unsigned long rip; - - rip = __vmread(GUEST_RIP); - HVM_DBG_LOG(DBG_LEVEL_1, "rip = %lx, error_code = %x", - rip, error_code); + printk("ept violation: reserved bit or pdptr load violation.\n"); + domain_crash(d); } -#endif /* NDEBUG */ - /* - * According to Intel Virtualization Technology Specification for - * the IA-32 Intel Architecture (C97063-002 April 2005), section - * 2.8.3, SW_EXCEPTION should be used for #BP and #OV, and - * HW_EXCEPTION used for everything else. The main difference - * appears to be that for SW_EXCEPTION, the EIP/RIP is incremented - * by VM_ENTER_INSTRUCTION_LEN bytes, whereas for HW_EXCEPTION, - * it is not. - */ - if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_SW_EXCEPTION ) + mfn = ept_get_entry(d, gfn, &t); + + if ( unlikely( gla_validity != EPT_GLA_VALIDITY_MATCH) ) { - int ilen = __get_instruction_length(); /* Safe: software exception */ - vmx_inject_sw_exception(v, vector, ilen); + if ( !p2m_is_ram(t) || !paging_mode_log_dirty(d) ) + { + domain_crash(d); + return; + } } - else + + if ( p2m_is_ram(t) && paging_mode_log_dirty(d) ) { - vmx_inject_hw_exception(v, vector, error_code); + paging_mark_dirty(d, mfn_x(mfn)); + p2m_set_flags(d, gpa, __PAGE_HYPERVISOR|_PAGE_PSE); + flush_tlb_mask(d->domain_dirty_cpumask); + return; } + /* must be MMIO */ + handle_mmio(gpa); } static void vmx_failed_vmentry(unsigned int exit_reason, @@ -2920,10 +3222,19 @@ static void vmx_failed_vmentry(unsigned asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs) { - unsigned int exit_reason; + unsigned int exit_reason, idtv_info; unsigned long exit_qualification, inst_len = 0; struct vcpu *v = current; + if ( paging_mode_hap(v->domain) && hvm_paging_enabled(v) ) + { + __asm__ __volatile__ ("mov"__OS" %%cr2, %0" + : "=r"(v->arch.hvm_vmx.cpu_cr2)); + + /* __hvm_copy() needs this when paging is enabled. */ + v->arch.hvm_vmx.cpu_cr3 = __vmread(GUEST_CR3); + } + exit_reason = __vmread(VM_EXIT_REASON); HVMTRACE_2D(VMEXIT, v, __vmread(GUEST_RIP), exit_reason); @@ -2936,6 +3247,33 @@ asmlinkage void vmx_vmexit_handler(struc if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) ) return vmx_failed_vmentry(exit_reason, regs); + /* Event delivery caused this intercept? Queue for redelivery. */ + idtv_info = __vmread(IDT_VECTORING_INFO_FIELD); + if ( unlikely(idtv_info & INTR_INFO_VALID_MASK) + && (exit_reason != EXIT_REASON_TASK_SWITCH) ) + { + if ( vmx_event_needs_reinjection((idtv_info>>8)&7, idtv_info&0xff) ) + { + /* See SDM 3B 25.7.1.1 and .2 for info about masking resvd bits. */ + __vmwrite(VM_ENTRY_INTR_INFO_FIELD, + idtv_info & ~INTR_INFO_RESVD_BITS_MASK); + if ( idtv_info & INTR_INFO_DELIVER_CODE_MASK ) + __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, + __vmread(IDT_VECTORING_ERROR_CODE)); + } + + /* + * Clear NMI-blocking interruptibility info if an NMI delivery faulted.
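The ept_handle_violation() path above is how log-dirty tracking works under hardware-assisted paging: guest frames are mapped read-only in the p2m, the first write faults into the hypervisor, which records the frame as dirty, restores a writable mapping and flushes TLBs; anything that is not RAM falls through to MMIO emulation. A rough sketch of that write-fault flow follows; the struct and helper names are placeholders, not the real p2m interfaces.

/* Sketch only: bitmap and page-table helpers below are illustrative. */
struct dirty_log {
    unsigned long *bitmap;     /* one bit per guest frame */
    int            enabled;
};

/* Called on a write fault against a read-only p2m entry. */
int handle_write_fault(struct dirty_log *log, unsigned long gfn,
                       void (*remap_writable)(unsigned long gfn),
                       void (*flush_tlbs)(void))
{
    if (!log->enabled)
        return 0;                       /* not a log-dirty fault */

    log->bitmap[gfn / (8 * sizeof(unsigned long))] |=
        1UL << (gfn % (8 * sizeof(unsigned long)));

    remap_writable(gfn);                /* let subsequent writes proceed */
    flush_tlbs();                       /* drop stale read-only mappings */
    return 1;
}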
+ * Re-delivery will re-set it (see SDM 3B 25.7.1.2). + */ + if ( (idtv_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI ) + __vmwrite(GUEST_INTERRUPTIBILITY_INFO, + __vmread(GUEST_INTERRUPTIBILITY_INFO) & + ~VMX_INTR_SHADOW_NMI); + } + + hvm_maybe_deassert_evtchn_irq(); + switch ( exit_reason ) { case EXIT_REASON_EXCEPTION_NMI: @@ -2952,14 +3290,38 @@ asmlinkage void vmx_vmexit_handler(struc vector = intr_info & INTR_INFO_VECTOR_MASK; + /* + * Re-set the NMI shadow if vmexit caused by a guest IRET fault (see 3B + * 25.7.1.2, "Resuming Guest Software after Handling an Exception"). + * (NB. If we emulate this IRET for any reason, we should re-clear!) + */ + if ( unlikely(intr_info & INTR_INFO_NMI_UNBLOCKED_BY_IRET) && + !(__vmread(IDT_VECTORING_INFO_FIELD) & INTR_INFO_VALID_MASK) && + (vector != TRAP_double_fault) ) + __vmwrite(GUEST_INTERRUPTIBILITY_INFO, + __vmread(GUEST_INTERRUPTIBILITY_INFO)|VMX_INTR_SHADOW_NMI); + perfc_incra(cause_vector, vector); switch ( vector ) { case TRAP_debug: + /* + * Updates DR6 where debugger can peek (See 3B 23.2.1, + * Table 23-1, "Exit Qualification for Debug Exceptions"). + */ + exit_qualification = __vmread(EXIT_QUALIFICATION); + write_debugreg(6, exit_qualification | 0xffff0ff0); + if ( !v->domain->debugger_attached ) + goto exit_and_crash; + domain_pause_for_debugger(); + break; case TRAP_int3: if ( !v->domain->debugger_attached ) goto exit_and_crash; + inst_len = __get_instruction_length(); /* Safe: INT3 */ + __update_guest_eip(inst_len); + current->arch.gdbsx_vcpu_event = TRAP_int3; domain_pause_for_debugger(); break; case TRAP_no_device: @@ -2985,14 +3347,11 @@ asmlinkage void vmx_vmexit_handler(struc vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code); break; case TRAP_nmi: - if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI ) - { - HVMTRACE_0D(NMI, v); - vmx_store_cpu_guest_regs(v, regs, NULL); - do_nmi(regs); /* Real NMI, vector 2: normal processing. */ - } - else - vmx_reflect_exception(v); + if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) != INTR_TYPE_NMI ) + goto exit_and_crash; + HVMTRACE_0D(NMI, v); + vmx_store_cpu_guest_regs(v, regs, NULL); + do_nmi(regs); /* Real NMI, vector 2: normal processing. 
*/ break; case TRAP_machine_check: HVMTRACE_0D(MCE, v); @@ -3022,8 +3381,21 @@ asmlinkage void vmx_vmexit_handler(struc __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control); break; - case EXIT_REASON_TASK_SWITCH: - goto exit_and_crash; + case EXIT_REASON_TASK_SWITCH: { + const enum hvm_task_switch_reason reasons[] = { + TSW_call_or_int, TSW_iret, TSW_jmp, TSW_call_or_int }; + int32_t errcode = -1; + unsigned int idtv_info; + exit_qualification = __vmread(EXIT_QUALIFICATION); + idtv_info = __vmread(IDT_VECTORING_INFO_FIELD); + if ( (idtv_info & INTR_INFO_VALID_MASK) && + (idtv_info & INTR_INFO_DELIVER_CODE_MASK) ) + errcode = __vmread(IDT_VECTORING_ERROR_CODE); + hvm_task_switch((uint16_t)exit_qualification, + reasons[(exit_qualification >> 30) & 3], + errcode); + break; + } case EXIT_REASON_CPUID: inst_len = __get_instruction_length(); /* Safe: CPUID */ __update_guest_eip(inst_len); @@ -3113,6 +3485,21 @@ asmlinkage void vmx_vmexit_handler(struc break; } + case EXIT_REASON_EPT_VIOLATION: + { + paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS); +#ifdef __i386__ + gpa += (unsigned long long)__vmread(GUEST_PHYSICAL_ADDRESS_HIGH) << 32; +#endif + exit_qualification = __vmread(EXIT_QUALIFICATION); + ept_handle_violation(exit_qualification, gpa); + break; + } + + case EXIT_REASON_EPT_MISCONFIG: + domain_crash(current->domain); + break; + default: exit_and_crash: gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason); @@ -3127,6 +3514,22 @@ asmlinkage void vmx_trace_vmentry(void) HVMTRACE_0D(VMENTRY, v); } +static void __ept_sync_domain(void *info) +{ + struct domain *d = info; + __invept(1, d->vcpu[0]->arch.hvm_vmx.ept_control.eptp, 0); +} + +void ept_sync_domain(struct domain *d) +{ + /* Only if using EPT and this domain has some VCPUs to dirty. */ + if ( hap_enabled(d) && d->vcpu[0] ) + { + ASSERT(local_irq_is_enabled()); + on_each_cpu(__ept_sync_domain, d, 1, 1); + } +} + /* * Local variables: * mode: C diff -Naurp xen/arch/x86/hvm/vpic.c xen-redhat/arch/x86/hvm/vpic.c --- xen/arch/x86/hvm/vpic.c +++ xen-redhat/arch/x86/hvm/vpic.c @@ -56,7 +56,7 @@ static int vpic_get_priority(struct hvm_ /* prio = ffs(mask ROR vpic->priority_add); */ asm ( "ror %%cl,%b1 ; bsf %1,%0" - : "=r" (prio) : "r" ((uint32_t)mask), "c" (vpic->priority_add) ); + : "=r" (prio) : "q" ((uint32_t)mask), "c" (vpic->priority_add) ); return prio; } @@ -109,9 +109,12 @@ static void vpic_update_int_output(struc { if ( vpic->is_master ) { - /* Master INT line is connected to VCPU0's VLAPIC LVT0. */ - struct vcpu *v = vpic_domain(vpic)->vcpu[0]; - if ( (v != NULL) && vlapic_accept_pic_intr(v) ) + /* + * Master INT line is connected to whatever VCPU has its LAPIC + * LVT0 set up to receive ExtINT IRQs. + */ + struct vcpu *v = vpic_domain(vpic)->arch.hvm_domain.i8259_target; + if ( v != NULL ) vcpu_kick(v); } else @@ -182,8 +185,7 @@ static void vpic_ioport_write( vpic_lock(vpic); - addr &= 1; - if ( addr == 0 ) + if ( (addr & 1) == 0 ) { if ( val & 0x10 ) { @@ -250,7 +252,13 @@ static void vpic_ioport_write( vpic->isr &= ~(1 << irq); if ( cmd == 7 ) vpic->priority_add = (irq + 1) & 7; - break; + /* Release lock and EOI the physical interrupt (if any). */ + vpic_update_int_output(vpic); + vpic_unlock(vpic); + hvm_dpci_eoi(current->domain, + hvm_isa_irq_to_gsi((addr >> 7) ? 
(irq|8) : irq), + NULL); + return; /* bail immediately */ case 6: /* Set Priority */ vpic->priority_add = (val + 1) & 7; break; @@ -499,7 +507,7 @@ void vpic_irq_negative_edge(struct domai vpic_update_int_output(vpic); } -int cpu_get_pic_interrupt(struct vcpu *v, int *type) +int cpu_get_pic_interrupt(struct vcpu *v) { int irq, vector; struct hvm_hw_vpic *vpic = &v->domain->arch.hvm_domain.vpic[0]; @@ -512,6 +520,5 @@ int cpu_get_pic_interrupt(struct vcpu *v return -1; vector = vpic[irq >> 3].irq_base + (irq & 7); - *type = APIC_DM_EXTINT; return vector; } diff -Naurp xen/arch/x86/hvm/vpt.c xen-redhat/arch/x86/hvm/vpt.c --- xen/arch/x86/hvm/vpt.c +++ xen-redhat/arch/x86/hvm/vpt.c @@ -15,7 +15,6 @@ * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. - * */ #include <xen/time.h> @@ -23,6 +22,66 @@ #include <asm/hvm/vpt.h> #include <asm/event.h> +#define mode_is(d, name) \ + ((d)->arch.hvm_domain.params[HVM_PARAM_TIMER_MODE] == HVMPTM_##name) + +void hvm_init_guest_time(struct domain *d) +{ + struct pl_time *pl = &d->arch.hvm_domain.pl_time; + + spin_lock_init(&pl->pl_time_lock); + pl->stime_offset = -(u64)get_s_time(); + pl->last_guest_time = 0; +} + +u64 hvm_get_guest_time(struct vcpu *v) +{ + struct pl_time *pl = &v->domain->arch.hvm_domain.pl_time; + u64 now; + + /* Called from device models shared with PV guests. Be careful. */ + ASSERT(is_hvm_vcpu(v)); + + spin_lock(&pl->pl_time_lock); + now = get_s_time() + pl->stime_offset; + if ( (int64_t)(now - pl->last_guest_time) > 0 ) + pl->last_guest_time = now; + else + now = ++pl->last_guest_time; + spin_unlock(&pl->pl_time_lock); + + return now + v->arch.hvm_vcpu.stime_offset; +} + +void hvm_set_guest_time(struct vcpu *v, u64 guest_time) +{ + v->arch.hvm_vcpu.stime_offset += guest_time - hvm_get_guest_time(v); +} + +static int pt_irq_vector(struct periodic_time *pt, enum hvm_intack src) +{ + struct vcpu *v = pt->vcpu; + + if ( pt->source == PTSRC_lapic ) + return pt->irq; + + return get_isa_irq_vector(v, pt->irq, src); +} + +static int pt_irq_masked(struct periodic_time *pt) +{ + struct vcpu *v = pt->vcpu; + + if ( pt->source == PTSRC_lapic ) + { + struct vlapic *vlapic = vcpu_vlapic(v); + return (!vlapic_enabled(vlapic) || + (vlapic_get_reg(vlapic, APIC_LVTT) & APIC_LVT_MASKED)); + } + + return is_isa_irq_masked(v, pt->irq); +} + static void pt_lock(struct periodic_time *pt) { struct vcpu *v; @@ -42,29 +101,46 @@ static void pt_unlock(struct periodic_ti spin_unlock(&pt->vcpu->arch.hvm_vcpu.tm_lock); } -static void missed_ticks(struct periodic_time *pt) +static void pt_process_missed_ticks(struct periodic_time *pt) { - s_time_t missed_ticks; + s_time_t missed_ticks, now = NOW(); - missed_ticks = NOW() - pt->scheduled; + if ( pt->one_shot ) + return; + + missed_ticks = now - pt->scheduled; if ( missed_ticks <= 0 ) return; missed_ticks = missed_ticks / (s_time_t) pt->period + 1; - if ( missed_ticks > 1000 ) - { - /* TODO: Adjust guest time together */ - pt->pending_intr_nr++; - } + if ( mode_is(pt->vcpu->domain, no_missed_ticks_pending) ) + pt->do_not_freeze = !pt->pending_intr_nr; else - { pt->pending_intr_nr += missed_ticks; - } - pt->scheduled += missed_ticks * pt->period; } -void pt_freeze_time(struct vcpu *v) +static void pt_freeze_time(struct vcpu *v) +{ + if ( !mode_is(v->domain, delay_for_missed_ticks) ) + return; + + v->arch.hvm_vcpu.guest_time = hvm_get_guest_time(v); +} + +static 
void pt_thaw_time(struct vcpu *v) +{ + if ( !mode_is(v->domain, delay_for_missed_ticks) ) + return; + + if ( v->arch.hvm_vcpu.guest_time == 0 ) + return; + + hvm_set_guest_time(v, v->arch.hvm_vcpu.guest_time); + v->arch.hvm_vcpu.guest_time = 0; +} + +void pt_save_timer(struct vcpu *v) { struct list_head *head = &v->arch.hvm_vcpu.tm_list; struct periodic_time *pt; @@ -74,33 +150,30 @@ void pt_freeze_time(struct vcpu *v) spin_lock(&v->arch.hvm_vcpu.tm_lock); - v->arch.hvm_vcpu.guest_time = hvm_get_guest_time(v); - list_for_each_entry ( pt, head, list ) - stop_timer(&pt->timer); + if ( !pt->do_not_freeze ) + stop_timer(&pt->timer); + + pt_freeze_time(v); spin_unlock(&v->arch.hvm_vcpu.tm_lock); } -void pt_thaw_time(struct vcpu *v) +void pt_restore_timer(struct vcpu *v) { struct list_head *head = &v->arch.hvm_vcpu.tm_list; struct periodic_time *pt; spin_lock(&v->arch.hvm_vcpu.tm_lock); - if ( v->arch.hvm_vcpu.guest_time ) + list_for_each_entry ( pt, head, list ) { - hvm_set_guest_time(v, v->arch.hvm_vcpu.guest_time); - v->arch.hvm_vcpu.guest_time = 0; - - list_for_each_entry ( pt, head, list ) - { - missed_ticks(pt); - set_timer(&pt->timer, pt->scheduled); - } + pt_process_missed_ticks(pt); + set_timer(&pt->timer, pt->scheduled); } + pt_thaw_time(v); + spin_unlock(&v->arch.hvm_vcpu.tm_lock); } @@ -111,12 +184,14 @@ static void pt_timer_fn(void *data) pt_lock(pt); pt->pending_intr_nr++; - pt->scheduled += pt->period; - - missed_ticks(pt); + pt->do_not_freeze = 0; if ( !pt->one_shot ) + { + pt->scheduled += pt->period; + pt_process_missed_ticks(pt); set_timer(&pt->timer, pt->scheduled); + } vcpu_kick(pt->vcpu); @@ -126,67 +201,62 @@ static void pt_timer_fn(void *data) void pt_update_irq(struct vcpu *v) { struct list_head *head = &v->arch.hvm_vcpu.tm_list; - struct periodic_time *pt; + struct periodic_time *pt, *earliest_pt = NULL; uint64_t max_lag = -1ULL; - int irq = -1; + int irq, is_lapic; spin_lock(&v->arch.hvm_vcpu.tm_lock); list_for_each_entry ( pt, head, list ) { - if ( !is_isa_irq_masked(v, pt->irq) && pt->pending_intr_nr && + if ( !pt_irq_masked(pt) && pt->pending_intr_nr && ((pt->last_plt_gtime + pt->period_cycles) < max_lag) ) { max_lag = pt->last_plt_gtime + pt->period_cycles; - irq = pt->irq; + earliest_pt = pt; } } + if ( earliest_pt == NULL ) + { + spin_unlock(&v->arch.hvm_vcpu.tm_lock); + return; + } + + earliest_pt->irq_issued = 1; + irq = earliest_pt->irq; + is_lapic = (earliest_pt->source == PTSRC_lapic); + spin_unlock(&v->arch.hvm_vcpu.tm_lock); - if ( is_lvtt(v, irq) ) + if ( is_lapic ) { vlapic_set_irq(vcpu_vlapic(v), irq, 0); } - else if ( irq >= 0 ) + else { hvm_isa_irq_deassert(v->domain, irq); hvm_isa_irq_assert(v->domain, irq); } } -static struct periodic_time *is_pt_irq(struct vcpu *v, int vector, int type) +static struct periodic_time *is_pt_irq( + struct vcpu *v, int vector, enum hvm_intack src) { struct list_head *head = &v->arch.hvm_vcpu.tm_list; struct periodic_time *pt; - struct RTCState *rtc = &v->domain->arch.hvm_domain.pl_time.vrtc; - int vec; list_for_each_entry ( pt, head, list ) { - if ( !pt->pending_intr_nr ) - continue; - - if ( is_lvtt(v, pt->irq) ) - { - if ( pt->irq != vector ) - continue; + if ( pt->pending_intr_nr && pt->irq_issued && + (vector == pt_irq_vector(pt, src)) ) return pt; - } - - vec = get_isa_irq_vector(v, pt->irq, type); - - /* RTC irq need special care */ - if ( (vector != vec) || (pt->irq == 8 && !is_rtc_periodic_irq(rtc)) ) - continue; - - return pt; } return NULL; } -void pt_intr_post(struct vcpu *v, int vector, int type) +void 
pt_intr_post(struct vcpu *v, int vector, enum hvm_intack src) { struct periodic_time *pt; time_cb *cb; @@ -194,19 +264,38 @@ void pt_intr_post(struct vcpu *v, int ve spin_lock(&v->arch.hvm_vcpu.tm_lock); - pt = is_pt_irq(v, vector, type); + pt = is_pt_irq(v, vector, src); if ( pt == NULL ) { spin_unlock(&v->arch.hvm_vcpu.tm_lock); return; } - ASSERT(pt->vcpu == v); + pt->irq_issued = 0; - pt->pending_intr_nr--; - pt->last_plt_gtime += pt->period_cycles; + if ( pt->one_shot ) + { + if ( pt->on_list ) + list_del(&pt->list); + pt->on_list = 0; + } + else + { + if ( mode_is(v->domain, one_missed_tick_pending) || + mode_is(v->domain, no_missed_ticks_pending) ) + { + pt->last_plt_gtime = hvm_get_guest_time(v); + pt->pending_intr_nr = 0; /* 'collapse' all missed ticks */ + } + else + { + pt->last_plt_gtime += pt->period_cycles; + pt->pending_intr_nr--; + } + } - if ( hvm_get_guest_time(v) < pt->last_plt_gtime ) + if ( mode_is(v->domain, delay_for_missed_ticks) && + (hvm_get_guest_time(v) < pt->last_plt_gtime) ) hvm_set_guest_time(v, pt->last_plt_gtime); cb = pt->cb; @@ -253,30 +342,36 @@ void create_periodic_time( struct vcpu *v, struct periodic_time *pt, uint64_t period, uint8_t irq, char one_shot, time_cb *cb, void *data) { + ASSERT(pt->source != 0); + destroy_periodic_time(pt); spin_lock(&v->arch.hvm_vcpu.tm_lock); - pt->enabled = 1; pt->pending_intr_nr = 0; + pt->do_not_freeze = 0; + pt->irq_issued = 0; - if ( period < 900000 ) /* < 0.9 ms */ + /* Periodic timer must be at least 0.9ms. */ + if ( (period < 900000) && !one_shot ) { gdprintk(XENLOG_WARNING, "HVM_PlatformTime: program too small period %"PRIu64"\n", period); - period = 900000; /* force to 0.9ms */ + period = 900000; } + pt->period = period; pt->vcpu = v; pt->last_plt_gtime = hvm_get_guest_time(pt->vcpu); pt->irq = irq; - pt->period_cycles = (u64)period * cpu_khz / 1000000L; + pt->period_cycles = (u64)period; pt->one_shot = one_shot; pt->scheduled = NOW() + period; pt->cb = cb; pt->priv = data; + pt->on_list = 1; list_add(&pt->list, &v->arch.hvm_vcpu.tm_list); init_timer(&pt->timer, pt_timer_fn, pt, v->processor); @@ -287,12 +382,14 @@ void create_periodic_time( void destroy_periodic_time(struct periodic_time *pt) { - if ( !pt->enabled ) + /* Was this structure previously initialised by create_periodic_time()? 
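pt_intr_post() above branches on the domain's timer mode: the "one missed tick pending" and "no missed ticks pending" policies collapse any backlog and resynchronise last_plt_gtime to the current guest time, while the default policy accounts for every missed period, and delay_for_missed_ticks additionally holds the guest clock back until the tick is delivered. A compact sketch of that accounting, with hypothetical type and field names, is below.

enum tick_policy { DELAY_FOR_MISSED, ONE_PENDING, NO_PENDING };

struct vtimer {
    unsigned int  pending;       /* ticks not yet delivered     */
    unsigned long last_delivery; /* guest time of last delivery */
    unsigned long period;        /* tick period in guest time   */
};

/* Account for one delivered tick under the chosen policy. */
void tick_delivered(struct vtimer *t, enum tick_policy p,
                    unsigned long now /* current guest time */)
{
    if (p == ONE_PENDING || p == NO_PENDING) {
        t->last_delivery = now;   /* resync to real delivery time */
        t->pending = 0;           /* collapse the missed-tick backlog */
    } else {
        t->last_delivery += t->period;
        t->pending--;
    }
    /* DELAY_FOR_MISSED additionally warps the guest clock forward so it
     * never lags behind last_delivery; that step is not shown here. */
}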
*/ + if ( pt->vcpu == NULL ) return; pt_lock(pt); - pt->enabled = 0; - list_del(&pt->list); + if ( pt->on_list ) + list_del(&pt->list); + pt->on_list = 0; pt_unlock(pt); /* @@ -301,3 +398,53 @@ void destroy_periodic_time(struct period */ kill_timer(&pt->timer); } + +static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v) +{ + int on_list; + + ASSERT(pt->source == PTSRC_isa); + + if ( pt->vcpu == NULL ) + return; + + pt_lock(pt); + on_list = pt->on_list; + if ( pt->on_list ) + list_del(&pt->list); + pt->on_list = 0; + pt_unlock(pt); + + spin_lock(&v->arch.hvm_vcpu.tm_lock); + pt->vcpu = v; + if ( on_list ) + { + pt->on_list = 1; + list_add(&pt->list, &v->arch.hvm_vcpu.tm_list); + + migrate_timer(&pt->timer, v->processor); + } + spin_unlock(&v->arch.hvm_vcpu.tm_lock); +} + +void pt_adjust_global_vcpu_target(struct vcpu *v) +{ + struct pl_time *pl_time = &v->domain->arch.hvm_domain.pl_time; + + if ( v == NULL ) + return; + + ASSERT(v == v->domain->arch.hvm_domain.i8259_target); + + spin_lock(&pl_time->vpit.lock); + pt_adjust_vcpu(&pl_time->vpit.pt[0], v); + spin_unlock(&pl_time->vpit.lock); + + spin_lock(&pl_time->vrtc.lock); + rtc_migrate_timers(v); + spin_unlock(&pl_time->vrtc.lock); + + spin_lock(&pl_time->vhpet.lock); + hpet_migrate_timers(v); + spin_unlock(&pl_time->vhpet.lock); +} diff -Naurp xen/arch/x86/i8259.c xen-redhat/arch/x86/i8259.c --- xen/arch/x86/i8259.c +++ xen-redhat/arch/x86/i8259.c @@ -395,6 +395,8 @@ void __init init_IRQ(void) irq_desc[i].handler = &no_irq_type; irq_desc[i].action = NULL; irq_desc[i].depth = 1; + irq_desc[i].vector = i; + INIT_LIST_HEAD(&irq_desc[i].rl_link); spin_lock_init(&irq_desc[i].lock); set_intr_gate(i, interrupt[i]); } @@ -405,6 +407,10 @@ void __init init_IRQ(void) irq_desc[LEGACY_VECTOR(i)].handler = &i8259A_irq_type; } + /* Never allocate the hypercall vector or Linux/BSD fast-trap vector. */ + vector_irq[HYPERCALL_VECTOR] = NEVER_ASSIGN; + vector_irq[0x80] = NEVER_ASSIGN; + apic_intr_init(); /* Set the clock to HZ Hz */ diff -Naurp xen/arch/x86/io_apic.c xen-redhat/arch/x86/io_apic.c --- xen/arch/x86/io_apic.c +++ xen-redhat/arch/x86/io_apic.c @@ -27,16 +27,17 @@ #include <xen/delay.h> #include <xen/sched.h> #include <xen/acpi.h> +#include <xen/pci.h> +#include <xen/pci_regs.h> #include <xen/keyhandler.h> #include <asm/io.h> #include <asm/mc146818rtc.h> #include <asm/smp.h> #include <asm/desc.h> +#include <asm/msi.h> #include <mach_apic.h> #include <io_ports.h> - -#define set_irq_info(irq, mask) ((void)0) -#define set_native_irq_info(irq, mask) ((void)0) +#include <public/physdev.h> /* Different to Linux: our implementation can be simpler. */ #define make_8259A_irq(irq) (io_apic_irqs &= ~(1<<(irq))) @@ -83,10 +84,13 @@ int disable_timer_pin_1 __initdata; static struct irq_pin_list { int apic, pin, next; -} irq_2_pin[PIN_MAP_SIZE]; +} irq_2_pin[PIN_MAP_SIZE] = { + [0 ... PIN_MAP_SIZE-1].pin = -1 +}; static int irq_2_pin_free_entry = NR_IRQS; -int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; +int vector_irq[NR_VECTORS] __read_mostly = { + [0 ... NR_VECTORS - 1] = FREE_TO_ASSIGN}; /* * The common case is 1:1 IRQ<->pin mappings. 
Sometimes there are @@ -229,27 +233,32 @@ static void unmask_IO_APIC_irq (unsigned spin_unlock_irqrestore(&ioapic_lock, flags); } -static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) -{ +#define clear_IO_APIC_pin(a,p) __clear_IO_APIC_pin(a,p,0) +#define clear_IO_APIC_pin_raw(a,p) __clear_IO_APIC_pin(a,p,1) +static void __clear_IO_APIC_pin(unsigned int apic, unsigned int pin, int raw) +{ + unsigned int (*read)(unsigned int, unsigned int) + = raw ? __io_apic_read : io_apic_read; + void (*write)(unsigned int, unsigned int, unsigned int) + = raw ? __io_apic_write : io_apic_write; struct IO_APIC_route_entry entry; unsigned long flags; - + /* Check delivery_mode to be sure we're not clearing an SMI pin */ spin_lock_irqsave(&ioapic_lock, flags); - *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); + *(((int*)&entry) + 0) = (*read)(apic, 0x10 + 2 * pin); + *(((int*)&entry) + 1) = (*read)(apic, 0x11 + 2 * pin); spin_unlock_irqrestore(&ioapic_lock, flags); if (entry.delivery_mode == dest_SMI) return; - /* * Disable it in the IO-APIC irq-routing table: */ memset(&entry, 0, sizeof(entry)); entry.mask = 1; spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); + (*write)(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); + (*write)(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -257,9 +266,12 @@ static void clear_IO_APIC (void) { int apic, pin; - for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { clear_IO_APIC_pin(apic, pin); + clear_IO_APIC_pin_raw(apic, pin); + } + } } #ifdef CONFIG_SMP @@ -663,42 +675,53 @@ static inline int IO_APIC_irq_trigger(in } /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ -u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; +u8 irq_vector[NR_IRQS] __read_mostly; + +int free_irq_vector(int vector) +{ + int irq; + + BUG_ON((vector > LAST_DYNAMIC_VECTOR) || (vector < FIRST_DYNAMIC_VECTOR)); + + spin_lock(&vector_lock); + if ((irq = vector_irq[vector]) == AUTO_ASSIGN) + vector_irq[vector] = FREE_TO_ASSIGN; + spin_unlock(&vector_lock); + + return (irq == AUTO_ASSIGN) ? 0 : -EINVAL; +} int assign_irq_vector(int irq) { - static unsigned current_vector = FIRST_DYNAMIC_VECTOR, offset = 0; + static unsigned current_vector = FIRST_DYNAMIC_VECTOR; unsigned vector; - BUG_ON(irq >= NR_IRQ_VECTORS); + BUG_ON(irq >= NR_IRQS); + spin_lock(&vector_lock); - if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) { + if ((irq != AUTO_ASSIGN) && (irq_to_vector(irq) > 0)) { + spin_unlock(&vector_lock); + return irq_to_vector(irq); + } + if ((irq != AUTO_ASSIGN) && (IO_APIC_VECTOR(irq) > 0)) { spin_unlock(&vector_lock); return IO_APIC_VECTOR(irq); } -next: - current_vector += 8; + vector = current_vector; + while (vector_irq[vector] != FREE_TO_ASSIGN) { + vector += 8; + if (vector > LAST_DYNAMIC_VECTOR) + vector = FIRST_DYNAMIC_VECTOR + ((vector + 1) & 7); - /* Skip the hypercall vector. */ - if (current_vector == HYPERCALL_VECTOR) - goto next; - - /* Skip the Linux/BSD fast-trap vector. 
*/ - if (current_vector == 0x80) - goto next; - - if (current_vector > LAST_DYNAMIC_VECTOR) { - offset++; - if (!(offset%8)) { + if (vector == current_vector) { spin_unlock(&vector_lock); return -ENOSPC; } - current_vector = FIRST_DYNAMIC_VECTOR + offset; } - vector = current_vector; + current_vector = vector; vector_irq[vector] = irq; if (irq != AUTO_ASSIGN) IO_APIC_VECTOR(irq) = vector; @@ -708,8 +731,8 @@ next: return vector; } -static struct hw_interrupt_type ioapic_level_type; -static struct hw_interrupt_type ioapic_edge_type; +struct hw_interrupt_type ioapic_level_type; +struct hw_interrupt_type ioapic_edge_type; #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 @@ -1009,11 +1032,6 @@ static void __init enable_IO_APIC(void) int i, apic; unsigned long flags; - for (i = 0; i < PIN_MAP_SIZE; i++) { - irq_2_pin[i].pin = -1; - irq_2_pin[i].next = 0; - } - /* Initialise dynamic irq_2_pin free list. */ for (i = NR_IRQS; i < PIN_MAP_SIZE; i++) irq_2_pin[i].next = i + 1; @@ -1104,6 +1122,7 @@ void disable_IO_APIC(void) entry.delivery_mode = dest_ExtINT; /* ExtInt */ entry.vector = 0; entry.dest.physical.physical_dest = + // TODO: BP: should be get_apic_id GET_APIC_ID(apic_read(APIC_ID)); /* @@ -1510,7 +1529,7 @@ static void end_edge_ioapic_vector(unsig * edge-triggered handler, without risking IRQ storms and other ugly * races. */ -static struct hw_interrupt_type ioapic_edge_type = { +struct hw_interrupt_type ioapic_edge_type = { .typename = "IO-APIC-edge", .startup = startup_edge_ioapic_vector, .shutdown = disable_edge_ioapic_vector, @@ -1521,7 +1540,7 @@ static struct hw_interrupt_type ioapic_e .set_affinity = set_ioapic_affinity_vector, }; -static struct hw_interrupt_type ioapic_level_type = { +struct hw_interrupt_type ioapic_level_type = { .typename = "IO-APIC-level", .startup = startup_level_ioapic_vector, .shutdown = mask_IO_APIC_vector, @@ -1532,6 +1551,50 @@ static struct hw_interrupt_type ioapic_l .set_affinity = set_ioapic_affinity_vector, }; +static unsigned int startup_msi_vector(unsigned int vector) +{ + unmask_msi_vector(vector); + return 0; +} + +static void ack_msi_vector(unsigned int vector) +{ + if ( msi_maskable_irq(irq_desc[vector].msi_desc) ) + ack_APIC_irq(); /* ACKTYPE_NONE */ +} + +static void end_msi_vector(unsigned int vector) +{ + if ( !msi_maskable_irq(irq_desc[vector].msi_desc) ) + ack_APIC_irq(); /* ACKTYPE_EOI */ +} + +static void shutdown_msi_vector(unsigned int vector) +{ + mask_msi_vector(vector); +} + +static void set_msi_affinity_vector(unsigned int vector, cpumask_t cpu_mask) +{ + set_native_irq_info(vector, cpu_mask); + set_msi_affinity(vector, cpu_mask); +} + +/* + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, + * which implement the MSI or MSI-X Capability Structure. 
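The reworked assign_irq_vector() above walks the vector table in steps of eight (so consecutive allocations land in different interrupt-priority classes), wraps to a new offset when it passes the last dynamic vector, and gives up only when the scan returns to its starting point; vectors that must never be handed out (the hypercall vector and 0x80) are pre-marked at init time instead of being skipped inside the loop. A small sketch of that circular scan follows, with illustrative bounds rather than Xen's real constants.

#define FIRST_DYN 0x20            /* illustrative bounds, not Xen's values */
#define LAST_DYN  0xef
#define NVEC      256
#define FREE      (-1)
#define NEVER     (-2)

static int vec_owner[NVEC];       /* FREE, NEVER, or the owning irq */

void init_vectors(void)
{
    for (int i = 0; i < NVEC; i++)
        vec_owner[i] = (i >= FIRST_DYN && i <= LAST_DYN) ? FREE : NEVER;
    vec_owner[0x80] = NEVER;      /* e.g. keep the fast-trap vector out */
}

/* Allocate a vector for 'irq'; returns the vector or -1 if none left. */
int alloc_vector(int irq)
{
    static unsigned int cur = FIRST_DYN;
    unsigned int v = cur;

    while (vec_owner[v] != FREE) {
        v += 8;                              /* spread across priorities */
        if (v > LAST_DYN)
            v = FIRST_DYN + ((v + 1) & 7);   /* wrap to the next offset  */
        if (v == cur)
            return -1;                       /* table exhausted          */
    }
    cur = v;
    vec_owner[v] = irq;
    return v;
}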
+ */ +struct hw_interrupt_type pci_msi_type = { + .typename = "PCI-MSI", + .startup = startup_msi_vector, + .shutdown = shutdown_msi_vector, + .enable = unmask_msi_vector, + .disable = mask_msi_vector, + .ack = ack_msi_vector, + .end = end_msi_vector, + .set_affinity = set_msi_affinity_vector, +}; + static inline void init_IO_APIC_traps(void) { int irq; @@ -1649,6 +1712,9 @@ static inline void check_timer(void) { int apic1, pin1, apic2, pin2; int vector; + unsigned long flags; + + local_irq_save(flags); /* * get/set the timer IRQ vector: @@ -1690,6 +1756,7 @@ static inline void check_timer(void) */ unmask_IO_APIC_irq(0); if (timer_irq_works()) { + local_irq_restore(flags); if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(apic1, pin1); return; @@ -1707,6 +1774,7 @@ static inline void check_timer(void) */ setup_ExtINT_IRQ0_pin(apic2, pin2, vector); if (timer_irq_works()) { + local_irq_restore(flags); printk("works.\n"); if (pin1 != -1) replace_pin_at_irq(0, apic1, pin1, apic2, pin2); @@ -1734,6 +1802,7 @@ static inline void check_timer(void) enable_8259A_irq(0); if (timer_irq_works()) { + local_irq_restore(flags); printk(" works.\n"); return; } @@ -1749,6 +1818,8 @@ static inline void check_timer(void) unlock_ExtINT_logic(); + local_irq_restore(flags); + if (timer_irq_works()) { printk(" works.\n"); return; @@ -2128,7 +2199,7 @@ int ioapic_guest_write(unsigned long phy if ( new_rte.vector >= FIRST_DYNAMIC_VECTOR ) new_irq = vector_irq[new_rte.vector]; - if ( (old_irq != new_irq) && (old_irq != -1) && IO_APIC_IRQ(old_irq) ) + if ( (old_irq != new_irq) && (old_irq >= 0) && IO_APIC_IRQ(old_irq) ) { if ( irq_desc[IO_APIC_VECTOR(old_irq)].action ) { @@ -2140,7 +2211,7 @@ int ioapic_guest_write(unsigned long phy remove_pin_at_irq(old_irq, apic, pin); } - if ( (new_irq != -1) && IO_APIC_IRQ(new_irq) ) + if ( (new_irq >= 0) && IO_APIC_IRQ(new_irq) ) { if ( irq_desc[IO_APIC_VECTOR(new_irq)].action ) { diff -Naurp xen/arch/x86/ioport_emulate.c xen-redhat/arch/x86/ioport_emulate.c --- xen/arch/x86/ioport_emulate.c +++ xen-redhat/arch/x86/ioport_emulate.c @@ -0,0 +1,141 @@ +/****************************************************************************** + * ioport_emulate.c + * + * Handle I/O port access quirks of various platforms. + */ + +#include <xen/config.h> +#include <xen/init.h> +#include <xen/sched.h> +#include <xen/dmi.h> + +/* Function pointer used to handle platform specific I/O port emulation. */ +extern void (*ioemul_handle_quirk)( + u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs); + +static void ioemul_handle_proliant_quirk( + u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs) +{ + uint16_t port = regs->edx; + uint8_t value = regs->eax; + + if ( (opcode != 0xee) || (port != 0xcd4) || !(value & 0x80) ) + return; + + /* pushfw */ + io_emul_stub[ 0] = 0x66; + io_emul_stub[ 1] = 0x9c; + /* cli */ + io_emul_stub[ 2] = 0xfa; + /* out %al,%dx */ + io_emul_stub[ 3] = 0xee; + /* 1: in %dx,%al */ + io_emul_stub[ 4] = 0xec; + /* test $0x80,%al */ + io_emul_stub[ 5] = 0xa8; + io_emul_stub[ 6] = 0x80; + /* jnz 1b */ + io_emul_stub[ 7] = 0x75; + io_emul_stub[ 8] = 0xfb; + /* popfw */ + io_emul_stub[ 9] = 0x66; + io_emul_stub[10] = 0x9d; + /* ret */ + io_emul_stub[11] = 0xc3; +} + +int __init proliant_quirk(struct dmi_system_id *d) +{ + ioemul_handle_quirk = ioemul_handle_proliant_quirk; + return 0; +} + +/* This table is the set of system-specific I/O emulation hooks. 
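ioemul_handle_proliant_quirk() above rewrites the I/O emulation stub with the byte sequence pushfw; cli; out %al,%dx; 1: in %dx,%al; test $0x80,%al; jnz 1b; popfw; ret, so that a command with bit 7 set written to port 0xcd4 spins until the SMM firmware clears that bit. Roughly the same logic expressed in C is sketched below; the inb/outb helpers are illustrative, and the pushf/cli/popf bracketing is only noted in a comment.

#include <stdint.h>

/* Illustrative port-I/O helpers; real code would use the arch's inb/outb. */
static inline void outb(uint8_t v, uint16_t port)
{
    __asm__ __volatile__("outb %0, %1" : : "a"(v), "Nd"(port));
}
static inline uint8_t inb(uint16_t port)
{
    uint8_t v;
    __asm__ __volatile__("inb %1, %0" : "=a"(v) : "Nd"(port));
    return v;
}

/* The pushfw/cli ... popfw bracketing of the stub is elided here; the core
 * of the quirk is: issue the write, then poll the port until bit 7 clears
 * so the transaction is never observed half-finished. */
void proliant_cd4_write(uint8_t val)
{
    outb(val, 0xcd4);             /* out %al,%dx */
    while (inb(0xcd4) & 0x80)     /* 1: in %dx,%al ; test $0x80,%al ; jnz 1b */
        ;
}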
*/ +static struct dmi_system_id __initdata ioport_quirks_tbl[] = { + /* + * I/O emulation hook for certain HP ProLiant servers with + * 'special' SMM goodness. + */ + { + .callback = proliant_quirk, + .ident = "HP ProLiant DL3xx", + .matches = { + DMI_MATCH(DMI_BIOS_VENDOR, "HP"), + DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL3"), + }, + }, + { + .callback = proliant_quirk, + .ident = "HP ProLiant DL5xx", + .matches = { + DMI_MATCH(DMI_BIOS_VENDOR, "HP"), + DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL5"), + }, + }, + { + .callback = proliant_quirk, + .ident = "HP ProLiant DL7xx", + .matches = { + DMI_MATCH(DMI_BIOS_VENDOR, "HP"), + DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL7"), + }, + }, + { + .callback = proliant_quirk, + .ident = "HP ProLiant ML3xx", + .matches = { + DMI_MATCH(DMI_BIOS_VENDOR, "HP"), + DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant ML3"), + }, + }, + { + .callback = proliant_quirk, + .ident = "HP ProLiant ML5xx", + .matches = { + DMI_MATCH(DMI_BIOS_VENDOR, "HP"), + DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant ML5"), + }, + }, + { + .callback = proliant_quirk, + .ident = "HP ProLiant BL2xx", + .matches = { + DMI_MATCH(DMI_BIOS_VENDOR, "HP"), + DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL2"), + }, + }, + { + .callback = proliant_quirk, + .ident = "HP ProLiant BL4xx", + .matches = { + DMI_MATCH(DMI_BIOS_VENDOR, "HP"), + DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL4"), + }, + }, + { + .callback = proliant_quirk, + .ident = "HP ProLiant BL6xx", + .matches = { + DMI_MATCH(DMI_BIOS_VENDOR, "HP"), + DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL6"), + }, + }, + { } +}; + +int __init ioport_quirks_init(void) +{ + dmi_check_system(ioport_quirks_tbl); + return 0; +} +__initcall(ioport_quirks_init); + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -Naurp xen/arch/x86/irq.c xen-redhat/arch/x86/irq.c --- xen/arch/x86/irq.c +++ xen-redhat/arch/x86/irq.c @@ -14,8 +14,10 @@ #include <xen/sched.h> #include <xen/keyhandler.h> #include <xen/compat.h> +#include <xen/iocap.h> +#include <asm/msi.h> #include <asm/current.h> -#include <asm/smpboot.h> +#include <public/physdev.h> /* opt_noirqbalance: If true, software IRQ balancing/affinity is disabled. 
*/ int opt_noirqbalance = 0; @@ -23,6 +25,14 @@ boolean_param("noirqbalance", opt_noirqb irq_desc_t irq_desc[NR_IRQS]; +static LIST_HEAD(irq_ratelimit_list); +static DEFINE_SPINLOCK(irq_ratelimit_lock); +static struct timer irq_ratelimit_timer; + +/* irq_ratelimit: the max irq rate allowed in every 10ms, set 0 to disable */ +unsigned int __read_mostly irq_ratelimit_threshold = 10000; +integer_param("irq_ratelimit", irq_ratelimit_threshold); + static void __do_IRQ_guest(int vector); void no_action(int cpl, void *dev_id, struct cpu_user_regs *regs) { } @@ -98,6 +108,66 @@ asmlinkage void do_IRQ(struct cpu_user_r spin_unlock(&desc->lock); } +static void irq_ratelimit_timer_fn(void *data) +{ + irq_desc_t *desc, *tmp; + unsigned long flags; + + spin_lock_irqsave(&irq_ratelimit_lock, flags); + + list_for_each_entry_safe ( desc, tmp, &irq_ratelimit_list, rl_link ) + { + spin_lock(&desc->lock); + desc->handler->enable(desc->vector); + list_del(&desc->rl_link); + INIT_LIST_HEAD(&desc->rl_link); + spin_unlock(&desc->lock); + } + + spin_unlock_irqrestore(&irq_ratelimit_lock, flags); +} + +static int __init irq_ratelimit_init(void) +{ + if ( irq_ratelimit_threshold ) + init_timer(&irq_ratelimit_timer, irq_ratelimit_timer_fn, NULL, 0); + return 0; +} +__initcall(irq_ratelimit_init); + +int request_irq(unsigned int irq, + void (*handler)(int, void *, struct cpu_user_regs *), + unsigned long irqflags, const char * devname, void *dev_id) +{ + struct irqaction * action; + int retval; + + /* + * Sanity-check: shared interrupts must pass in a real dev-ID, + * otherwise we'll have trouble later trying to figure out + * which interrupt is which (messes up the interrupt freeing + * logic etc). + */ + if (irq >= NR_IRQS) + return -EINVAL; + if (!handler) + return -EINVAL; + + action = xmalloc(struct irqaction); + if (!action) + return -ENOMEM; + + action->handler = handler; + action->name = devname; + action->dev_id = dev_id; + + retval = setup_irq(irq, action); + if (retval) + xfree(action); + + return retval; +} + void free_irq(unsigned int irq) { unsigned int vector = irq_to_vector(irq); @@ -168,9 +238,10 @@ struct pending_eoi { static DEFINE_PER_CPU(struct pending_eoi, pending_eoi[NR_VECTORS]); #define pending_eoi_sp(p) ((p)[NR_VECTORS-1].vector) +extern struct hw_interrupt_type ioapic_level_type; + static void __do_IRQ_guest(int vector) { - unsigned int irq = vector_to_irq(vector); irq_desc_t *desc = &irq_desc[vector]; irq_guest_action_t *action = (irq_guest_action_t *)desc->action; struct domain *d; @@ -186,6 +257,37 @@ static void __do_IRQ_guest(int vector) return; } + if ( action->nr_guests == 1 && action->guest[0]->domain_id != 0 && + desc->handler != &ioapic_level_type ) + { + if ( irq_ratelimit_timer.function && /* irq rate limiting enabled? */ + unlikely(desc->rl_cnt++ >= irq_ratelimit_threshold) ) + { + s_time_t now = NOW(); + if ( now < (desc->rl_quantum_start + MILLISECS(10)) ) + { + desc->handler->disable(vector); + /* + * If handler->disable doesn't actually mask the interrupt, a + * disabled irq still can fire. This check also avoids possible + * deadlocks if ratelimit_timer_fn runs at the same time. 
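The rate-limit logic above counts interrupts per 10ms window, and only for IRQs bound to a single guest other than dom0 and not routed through the level-triggered IO-APIC type; once the count passes irq_ratelimit_threshold the line is masked and queued on a list that irq_ratelimit_timer_fn drains, re-enabling it for the next window. A stripped-down sketch of the per-IRQ bookkeeping follows, with hypothetical types and no locking shown.

#include <stdbool.h>
#include <stdint.h>

#define WINDOW_NS 10000000ULL     /* 10 ms window, as in the patch */

struct irq_state {
    unsigned int count;           /* interrupts seen in this window  */
    uint64_t     window_start;    /* timestamp of the window's start */
    bool         masked;          /* true once the line is throttled */
};

/* Returns true if the interrupt should be handled, false if throttled.
 * 'now' is a monotonic timestamp in ns; 'threshold' plays the role of
 * irq_ratelimit_threshold. */
bool irq_ratelimit_check(struct irq_state *s, uint64_t now,
                         unsigned int threshold)
{
    if (++s->count >= threshold) {
        if (now < s->window_start + WINDOW_NS) {
            s->masked = true;     /* real code masks the line and queues it
                                   * for the rate-limit timer to re-enable */
            return false;
        }
        /* Window expired: start a new one. */
        s->count = 0;
        s->window_start = now;
    }
    return true;
}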
+ */ + if ( likely(list_empty(&desc->rl_link)) ) + { + spin_lock(&irq_ratelimit_lock); + if ( list_empty(&irq_ratelimit_list) ) + set_timer(&irq_ratelimit_timer, now + MILLISECS(10)); + list_add(&desc->rl_link, &irq_ratelimit_list); + spin_unlock(&irq_ratelimit_lock); + } + desc->handler->end(vector); + return; + } + desc->rl_cnt = 0; + desc->rl_quantum_start = now; + } + } + if ( action->ack_type == ACKTYPE_EOI ) { sp = pending_eoi_sp(peoi); @@ -199,12 +301,44 @@ static void __do_IRQ_guest(int vector) for ( i = 0; i < action->nr_guests; i++ ) { + unsigned int irq; d = action->guest[i]; + irq = domain_vector_to_irq(d, vector); if ( (action->ack_type != ACKTYPE_NONE) && !test_and_set_bit(irq, d->pirq_mask) ) action->in_flight++; - send_guest_pirq(d, irq); + if ( !hvm_do_IRQ_dpci(d, irq) ) + send_guest_pirq(d, irq); + } +} + +/* + * Retrieve Xen irq-descriptor corresponding to a domain-specific irq. + * The descriptor is returned locked. This function is safe against changes + * to the per-domain irq-to-vector mapping. + */ +irq_desc_t *domain_spin_lock_irq_desc( + struct domain *d, int irq, unsigned long *pflags) +{ + unsigned int vector; + unsigned long flags; + irq_desc_t *desc; + + for ( ; ; ) + { + vector = domain_irq_to_vector(d, irq); + if ( vector <= 0 ) + return NULL; + desc = &irq_desc[vector]; + spin_lock_irqsave(&desc->lock, flags); + if ( vector == domain_irq_to_vector(d, irq) ) + break; + spin_unlock_irqrestore(&desc->lock, flags); } + + if ( pflags != NULL ) + *pflags = flags; + return desc; } /* Flush all ready EOIs from the top of this CPU's pending-EOI stack. */ @@ -270,11 +404,15 @@ static void __pirq_guest_eoi(struct doma irq_desc_t *desc; irq_guest_action_t *action; cpumask_t cpu_eoi_map; + int vector; - desc = &irq_desc[irq_to_vector(irq)]; - action = (irq_guest_action_t *)desc->action; + ASSERT(local_irq_is_enabled()); + desc = domain_spin_lock_irq_desc(d, irq, NULL); + if ( desc == NULL ) + return; - spin_lock_irq(&desc->lock); + action = (irq_guest_action_t *)desc->action; + vector = desc - irq_desc; ASSERT(!test_bit(irq, d->pirq_mask) || (action->ack_type != ACKTYPE_NONE)); @@ -289,7 +427,7 @@ static void __pirq_guest_eoi(struct doma if ( action->ack_type == ACKTYPE_UNMASK ) { ASSERT(cpus_empty(action->cpu_eoi_map)); - desc->handler->end(irq_to_vector(irq)); + desc->handler->end(vector); spin_unlock_irq(&desc->lock); return; } @@ -341,13 +479,13 @@ int pirq_guest_unmask(struct domain *d) } extern int ioapic_ack_new; -int pirq_acktype(int irq) +int pirq_acktype(struct domain *d, int irq) { irq_desc_t *desc; unsigned int vector; - vector = irq_to_vector(irq); - if ( vector == 0 ) + vector = domain_irq_to_vector(d, irq); + if ( vector <= 0 ) return ACKTYPE_NONE; desc = &irq_desc[vector]; @@ -364,6 +502,13 @@ int pirq_acktype(int irq) return ACKTYPE_NONE; /* + * MSIs are treated as edge-triggered interrupts, except + * when there is no proper way to mask them. + */ + if ( desc->handler == &pci_msi_type ) + return msi_maskable_irq(desc->msi_desc) ? ACKTYPE_NONE : ACKTYPE_EOI; + + /* * Level-triggered IO-APIC interrupts need to be acknowledged on the CPU * on which they were received. This is because we tickle the LAPIC to EOI. 
*/ @@ -387,23 +532,20 @@ int pirq_acktype(int irq) return 0; } -int pirq_shared(int irq) +int pirq_shared(struct domain *d, int irq) { - unsigned int vector; irq_desc_t *desc; irq_guest_action_t *action; unsigned long flags; int shared; - vector = irq_to_vector(irq); - if ( vector == 0 ) + desc = domain_spin_lock_irq_desc(d, irq, &flags); + if ( desc == NULL ) return 0; - desc = &irq_desc[vector]; - - spin_lock_irqsave(&desc->lock, flags); action = (irq_guest_action_t *)desc->action; shared = ((desc->status & IRQ_GUEST) && (action->nr_guests > 1)); + spin_unlock_irqrestore(&desc->lock, flags); return shared; @@ -413,21 +555,23 @@ int pirq_guest_bind(struct vcpu *v, int { unsigned int vector; irq_desc_t *desc; - irq_guest_action_t *action; - unsigned long flags; + irq_guest_action_t *action, *newaction = NULL; int rc = 0; cpumask_t cpumask = CPU_MASK_NONE; - retry: - vector = irq_to_vector(irq); - if ( vector == 0 ) - return -EINVAL; - - desc = &irq_desc[vector]; + WARN_ON(!spin_is_locked(&v->domain->event_lock)); + BUG_ON(!local_irq_is_enabled()); - spin_lock_irqsave(&desc->lock, flags); + retry: + desc = domain_spin_lock_irq_desc(v->domain, irq, NULL); + if ( desc == NULL ) + { + rc = -EINVAL; + goto out; + } action = (irq_guest_action_t *)desc->action; + vector = desc - irq_desc; if ( !(desc->status & IRQ_GUEST) ) { @@ -437,23 +581,29 @@ int pirq_guest_bind(struct vcpu *v, int "Cannot bind IRQ %d to guest. In use by '%s'.\n", irq, desc->action->name); rc = -EBUSY; - goto out; + goto unlock_out; } - action = xmalloc(irq_guest_action_t); - if ( (desc->action = (struct irqaction *)action) == NULL ) + if ( newaction == NULL ) { + spin_unlock_irq(&desc->lock); + if ( (newaction = xmalloc(irq_guest_action_t)) != NULL ) + goto retry; gdprintk(XENLOG_INFO, - "Cannot bind IRQ %d to guest. Out of memory.\n", - irq); + "Cannot bind IRQ %d to guest. Out of memory.\n", + irq); rc = -ENOMEM; goto out; } + action = newaction; + desc->action = (struct irqaction *)action; + newaction = NULL; + action->nr_guests = 0; action->in_flight = 0; action->shareable = will_share; - action->ack_type = pirq_acktype(irq); + action->ack_type = pirq_acktype(v->domain, irq); cpus_clear(action->cpu_eoi_map); desc->depth = 0; @@ -468,11 +618,13 @@ int pirq_guest_bind(struct vcpu *v, int } else if ( !will_share || !action->shareable ) { - gdprintk(XENLOG_INFO, "Cannot bind IRQ %d to guest. " - "Will not share with others.\n", - irq); + gdprintk(XENLOG_INFO, "Cannot bind IRQ %d to guest. %s.\n", + irq, + will_share ? + "Others do not share" : + "Will not share with others"); rc = -EBUSY; - goto out; + goto unlock_out; } else if ( action->nr_guests == 0 ) { @@ -482,7 +634,7 @@ int pirq_guest_bind(struct vcpu *v, int */ ASSERT(action->ack_type == ACKTYPE_EOI); ASSERT(desc->status & IRQ_DISABLED); - spin_unlock_irqrestore(&desc->lock, flags); + spin_unlock_irq(&desc->lock); cpu_relax(); goto retry; } @@ -492,35 +644,37 @@ int pirq_guest_bind(struct vcpu *v, int gdprintk(XENLOG_INFO, "Cannot bind IRQ %d to guest. 
" "Already at max share.\n", irq); rc = -EBUSY; - goto out; + goto unlock_out; } action->guest[action->nr_guests++] = v->domain; + unlock_out: + spin_unlock_irq(&desc->lock); out: - spin_unlock_irqrestore(&desc->lock, flags); + if ( newaction != NULL ) + xfree(newaction); return rc; } -int pirq_guest_unbind(struct domain *d, int irq) +static irq_guest_action_t *__pirq_guest_unbind( + struct domain *d, int irq, irq_desc_t *desc) { - unsigned int vector = irq_to_vector(irq); - irq_desc_t *desc = &irq_desc[vector]; + unsigned int vector; irq_guest_action_t *action; cpumask_t cpu_eoi_map; - unsigned long flags; int i; - BUG_ON(vector == 0); - - spin_lock_irqsave(&desc->lock, flags); + BUG_ON(!(desc->status & IRQ_GUEST)); action = (irq_guest_action_t *)desc->action; + vector = desc - irq_desc; - i = 0; - while ( action->guest[i] && (action->guest[i] != d) ) - i++; - memmove(&action->guest[i], &action->guest[i+1], IRQ_MAX_GUESTS-i-1); + for ( i = 0; (i < action->nr_guests) && (action->guest[i] != d); i++ ) + continue; + BUG_ON(i == action->nr_guests); + memmove(&action->guest[i], &action->guest[i+1], + (action->nr_guests-i-1) * sizeof(action->guest[0])); action->nr_guests--; switch ( action->ack_type ) @@ -537,9 +691,9 @@ int pirq_guest_unbind(struct domain *d, (action->nr_guests != 0) ) { cpu_eoi_map = action->cpu_eoi_map; - spin_unlock_irqrestore(&desc->lock, flags); + spin_unlock_irq(&desc->lock); on_selected_cpus(cpu_eoi_map, set_eoi_ready, desc, 1, 0); - spin_lock_irqsave(&desc->lock, flags); + spin_lock_irq(&desc->lock); } break; } @@ -551,7 +705,7 @@ int pirq_guest_unbind(struct domain *d, BUG_ON(test_bit(irq, d->pirq_mask)); if ( action->nr_guests != 0 ) - goto out; + return NULL; BUG_ON(action->in_flight != 0); @@ -571,21 +725,274 @@ int pirq_guest_unbind(struct domain *d, if ( !cpus_empty(cpu_eoi_map) ) { BUG_ON(action->ack_type != ACKTYPE_EOI); - spin_unlock_irqrestore(&desc->lock, flags); + spin_unlock_irq(&desc->lock); on_selected_cpus(cpu_eoi_map, set_eoi_ready, desc, 1, 1); - spin_lock_irqsave(&desc->lock, flags); + spin_lock_irq(&desc->lock); } BUG_ON(!cpus_empty(action->cpu_eoi_map)); desc->action = NULL; - xfree(action); desc->status &= ~IRQ_GUEST; + desc->status &= ~IRQ_INPROGRESS; desc->handler->shutdown(vector); + /* Caller frees the old guest descriptor block. 
*/ + return action; +} + +void pirq_guest_unbind(struct domain *d, int irq) +{ + irq_guest_action_t *oldaction = NULL; + irq_desc_t *desc; + int vector; + + WARN_ON(!spin_is_locked(&d->event_lock)); + + BUG_ON(!local_irq_is_enabled()); + desc = domain_spin_lock_irq_desc(d, irq, NULL); + + if ( desc == NULL ) + { + vector = -domain_irq_to_vector(d, irq); + BUG_ON(vector <= 0); + desc = &irq_desc[vector]; + spin_lock_irq(&desc->lock); + d->arch.pirq_vector[irq] = d->arch.vector_pirq[vector] = 0; + } + else + { + oldaction = __pirq_guest_unbind(d, irq, desc); + } + + spin_unlock_irq(&desc->lock); + + if ( oldaction != NULL ) + xfree(oldaction); +} + +int pirq_guest_force_unbind(struct domain *d, int irq) +{ + irq_desc_t *desc; + irq_guest_action_t *action, *oldaction = NULL; + int i, bound = 0; + + WARN_ON(!spin_is_locked(&d->event_lock)); + + BUG_ON(!local_irq_is_enabled()); + desc = domain_spin_lock_irq_desc(d, irq, NULL); + BUG_ON(desc == NULL); + + if ( !(desc->status & IRQ_GUEST) ) + goto out; + + action = (irq_guest_action_t *)desc->action; + for ( i = 0; (i < action->nr_guests) && (action->guest[i] != d); i++ ) + continue; + if ( i == action->nr_guests ) + goto out; + + bound = 1; + oldaction = __pirq_guest_unbind(d, irq, desc); + out: - spin_unlock_irqrestore(&desc->lock, flags); - return 0; + spin_unlock_irq(&desc->lock); + + if ( oldaction != NULL ) + xfree(oldaction); + + return bound; +} + +int get_free_pirq(struct domain *d, int type, int index) +{ + int i; + + ASSERT(spin_is_locked(&d->event_lock)); + + if ( type == MAP_PIRQ_TYPE_GSI ) + { + for ( i = 16; i < NR_IRQS; i++ ) + if ( !d->arch.pirq_vector[i] ) + break; + if ( i == NR_IRQS ) + return -ENOSPC; + } + else + { + for ( i = NR_IRQS - 1; i >= 16; i-- ) + if ( !d->arch.pirq_vector[i] ) + break; + if ( i == 16 ) + return -ENOSPC; + } + + return i; +} + +int map_domain_pirq( + struct domain *d, int pirq, int vector, int type, void *data) +{ + int ret = 0; + int old_vector, old_pirq; + irq_desc_t *desc; + unsigned long flags; + struct msi_desc *msi_desc; + struct pci_dev *pdev = NULL; + + ASSERT(spin_is_locked(&pcidevs_lock)); + ASSERT(spin_is_locked(&d->event_lock)); + + if ( !IS_PRIV(current->domain) ) + return -EPERM; + + if ( pirq < 0 || pirq >= NR_IRQS || vector < 0 || vector >= NR_VECTORS ) + { + dprintk(XENLOG_G_ERR, "dom%d: invalid pirq %d or vector %d\n", + d->domain_id, pirq, vector); + return -EINVAL; + } + + old_vector = domain_irq_to_vector(d, pirq); + old_pirq = domain_vector_to_irq(d, vector); + + if ( (old_vector && (old_vector != vector) ) || + (old_pirq && (old_pirq != pirq)) ) + { + dprintk(XENLOG_G_ERR, "dom%d: pirq %d or vector %d already mapped\n", + d->domain_id, pirq, vector); + return -EINVAL; + } + + ret = irq_permit_access(d, pirq); + if ( ret ) + { + dprintk(XENLOG_G_ERR, "dom%d: could not permit access to irq %d\n", + d->domain_id, pirq); + return ret; + } + + desc = &irq_desc[vector]; + + if ( type == MAP_PIRQ_TYPE_MSI ) + { + struct msi_info *msi = (struct msi_info *)data; + + ret = -ENODEV; + if ( !cpu_has_apic ) + goto done; + + pdev = pci_get_pdev(msi->bus, msi->devfn); + ret = pci_enable_msi(msi, &msi_desc); + if ( ret ) + goto done; + + spin_lock_irqsave(&desc->lock, flags); + + if ( desc->handler != &no_irq_type ) + dprintk(XENLOG_G_ERR, "dom%d: vector %d in use\n", + d->domain_id, vector); + desc->handler = &pci_msi_type; + d->arch.pirq_vector[pirq] = vector; + d->arch.vector_pirq[vector] = pirq; + setup_msi_irq(pdev, msi_desc); + spin_unlock_irqrestore(&desc->lock, flags); + } else + { + 
spin_lock_irqsave(&desc->lock, flags); + d->arch.pirq_vector[pirq] = vector; + d->arch.vector_pirq[vector] = pirq; + spin_unlock_irqrestore(&desc->lock, flags); + } + + done: + return ret; +} + +/* The pirq should have been unbound before this call. */ +int unmap_domain_pirq(struct domain *d, int pirq) +{ + unsigned long flags; + irq_desc_t *desc; + int vector, ret = 0; + bool_t forced_unbind; + struct msi_desc *msi_desc = NULL; + + if ( (pirq < 0) || (pirq >= NR_IRQS) ) + return -EINVAL; + + if ( !IS_PRIV(current->domain) ) + return -EINVAL; + + ASSERT(spin_is_locked(&pcidevs_lock)); + ASSERT(spin_is_locked(&d->event_lock)); + + vector = domain_irq_to_vector(d, pirq); + if ( vector <= 0 ) + { + dprintk(XENLOG_G_ERR, "dom%d: pirq %d not mapped\n", + d->domain_id, pirq); + ret = -EINVAL; + goto done; + } + + forced_unbind = pirq_guest_force_unbind(d, pirq); + if ( forced_unbind ) + dprintk(XENLOG_G_WARNING, "dom%d: forcing unbind of pirq %d\n", + d->domain_id, pirq); + + desc = &irq_desc[vector]; + + if ( (msi_desc = desc->msi_desc) != NULL ) + pci_disable_msi(msi_desc); + + spin_lock_irqsave(&desc->lock, flags); + + BUG_ON(vector != domain_irq_to_vector(d, pirq)); + + if ( msi_desc ) + teardown_msi_vector(vector); + + if ( desc->handler == &pci_msi_type ) + desc->handler = &no_irq_type; + + if ( !forced_unbind ) + { + d->arch.pirq_vector[pirq] = 0; + d->arch.vector_pirq[vector] = 0; + } + else + { + d->arch.pirq_vector[pirq] = -vector; + d->arch.vector_pirq[vector] = -pirq; + } + + spin_unlock_irqrestore(&desc->lock, flags); + if (msi_desc) + msi_free_vector(msi_desc); + + ret = irq_deny_access(d, pirq); + if ( ret ) + dprintk(XENLOG_G_ERR, "dom%d: could not deny access to irq %d\n", + d->domain_id, pirq); + + done: + return ret; +} + +void free_domain_pirqs(struct domain *d) +{ + int i; + + spin_lock(&pcidevs_lock); + spin_lock(&d->event_lock); + + for ( i = 0; i < NR_IRQS; i++ ) + if ( d->arch.pirq_vector[i] > 0 ) + unmap_domain_pirq(d, i); + + spin_unlock(&d->event_lock); + spin_unlock(&pcidevs_lock); } extern void dump_ioapic_irq_info(void); @@ -627,7 +1034,8 @@ static void dump_irqs(unsigned char key) (test_bit(d->pirq_to_evtchn[irq], shared_info_addr(d, evtchn_pending)) ? 'P' : '-'), - (test_bit(d->pirq_to_evtchn[irq]/BITS_PER_GUEST_LONG(d), + (test_bit(d->pirq_to_evtchn[irq] / + BITS_PER_EVTCHN_WORD(d), vcpu_info_addr(d->vcpu[0], evtchn_pending_sel)) ? 
'S' : '-'), (test_bit(d->pirq_to_evtchn[irq], diff -Naurp xen/arch/x86/machine_kexec.c xen-redhat/arch/x86/machine_kexec.c --- xen/arch/x86/machine_kexec.c +++ xen-redhat/arch/x86/machine_kexec.c @@ -140,6 +140,20 @@ void machine_kexec(xen_kexec_image_t *im } } +void arch_crash_save_vmcoreinfo(void) +{ + VMCOREINFO_SYMBOL(dom_xen); + VMCOREINFO_SYMBOL(dom_io); + +#ifdef CONFIG_X86_PAE + VMCOREINFO_SYMBOL_ALIAS(pgd_l3, idle_pg_table); +#endif +#ifdef CONFIG_X86_64 + VMCOREINFO_SYMBOL_ALIAS(pgd_l4, idle_pg_table); +#endif +} + + /* * Local variables: * mode: C diff -Naurp xen/arch/x86/Makefile xen-redhat/arch/x86/Makefile --- xen/arch/x86/Makefile +++ xen-redhat/arch/x86/Makefile @@ -12,6 +12,7 @@ obj-y += apic.o obj-y += bitops.o obj-y += clear_page.o obj-y += compat.o +obj-y += debug.o obj-y += delay.o obj-y += dmi_scan.o obj-y += domctl.o @@ -24,6 +25,7 @@ obj-y += platform_hypercall.o obj-y += i387.o obj-y += i8259.o obj-y += io_apic.o +obj-y += ioport_emulate.o obj-y += irq.o obj-y += microcode.o obj-y += mm.o @@ -45,6 +47,8 @@ obj-y += usercopy.o obj-y += x86_emulate.o obj-y += machine_kexec.o obj-y += crash.o +obj-y += pci.o +obj-y += msi.o obj-$(crash_debug) += gdbstub.o diff -Naurp xen/arch/x86/mm/hap/hap.c xen-redhat/arch/x86/mm/hap/hap.c --- xen/arch/x86/mm/hap/hap.c +++ xen-redhat/arch/x86/mm/hap/hap.c @@ -61,7 +61,7 @@ int hap_enable_log_dirty(struct domain * hap_unlock(d); /* set l1e entries of P2M table to NOT_WRITABLE. */ - p2m_set_flags_global(d, (_PAGE_PRESENT|_PAGE_USER)); + p2m_change_entry_type_global(d, (_PAGE_PRESENT|_PAGE_USER)); flush_tlb_mask(d->domain_dirty_cpumask); return 0; } @@ -73,14 +73,14 @@ int hap_disable_log_dirty(struct domain hap_unlock(d); /* set l1e entries of P2M table with normal mode */ - p2m_set_flags_global(d, __PAGE_HYPERVISOR|_PAGE_USER); + p2m_change_entry_type_global(d, (__PAGE_HYPERVISOR|_PAGE_USER)); return 0; } void hap_clean_dirty_bitmap(struct domain *d) { /* mark physical memory as NOT_WRITEABLE and flush the TLB */ - p2m_set_flags_global(d, (_PAGE_PRESENT|_PAGE_USER)); + p2m_change_entry_type_global(d, (_PAGE_PRESENT|_PAGE_USER)); flush_tlb_mask(d->domain_dirty_cpumask); } @@ -593,6 +593,7 @@ int hap_invlpg(struct vcpu *v, unsigned */ void hap_update_cr3(struct vcpu *v, int do_locking) { + hvm_update_guest_cr(v, 3); } void hap_update_paging_modes(struct vcpu *v) @@ -626,8 +627,11 @@ void hap_update_paging_modes(struct vcpu mfn_t mmfn = hap_make_monitor_table(v); v->arch.monitor_table = pagetable_from_mfn(mmfn); make_cr3(v, mfn_x(mmfn)); + hvm_update_host_cr3(v); } + hap_update_cr3(v, 1); + hap_unlock(d); } @@ -674,9 +678,16 @@ void hap_write_p2m_entry(struct vcpu *v, unsigned long gfn, l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level) { + uint32_t old_flags; + hap_lock(v->domain); + old_flags = l1e_get_flags(*p); safe_write_pte(p, new); + if ( (old_flags & _PAGE_PRESENT) + && (level == 1 || (level == 2 && (old_flags & _PAGE_PSE))) ) + flush_tlb_mask(v->domain->domain_dirty_cpumask); + #if CONFIG_PAGING_LEVELS == 3 /* install P2M in monitor table for PAE Xen */ if ( level == 3 ) diff -Naurp xen/arch/x86/mm/hap/support.c xen-redhat/arch/x86/mm/hap/support.c --- xen/arch/x86/mm/hap/support.c +++ xen-redhat/arch/x86/mm/hap/support.c @@ -65,7 +65,7 @@ unsigned long hap_gva_to_gfn_protected_m gpfn = (gcr3 >> PAGE_SHIFT); for ( lev = mode; lev >= 1; lev-- ) { - mfn = get_mfn_from_gpfn( gpfn ); + mfn = gmfn_to_mfn(v->domain, gpfn); if ( mfn == INVALID_MFN ) { HAP_PRINTK("bad pfn=0x%lx from gva=0x%lx at lev%d\n", gpfn, 
gva, lev); @@ -148,7 +148,7 @@ unsigned long hap_gva_to_gfn_pae_mode(st gpfn = (gcr3 >> PAGE_SHIFT); for ( lev = mode; lev >= 1; lev-- ) { - mfn = get_mfn_from_gpfn( gpfn ); + mfn = gmfn_to_mfn(v->domain, gpfn); if ( mfn == INVALID_MFN ) { HAP_PRINTK("bad pfn=0x%lx from gva=0x%lx at lev%d\n", gpfn, gva, lev); @@ -242,7 +242,7 @@ unsigned long hap_gva_to_gfn_long_mode(s gpfn = (gcr3 >> PAGE_SHIFT); for ( lev = mode; lev >= 1; lev-- ) { - mfn = get_mfn_from_gpfn( gpfn ); + mfn = gmfn_to_mfn(v->domain, gpfn); if ( mfn == INVALID_MFN ) { HAP_PRINTK("bad pfn=0x%lx from gva=0x%lx at lev%d\n", gpfn, gva, lev); diff -Naurp xen/arch/x86/mm/Makefile xen-redhat/arch/x86/mm/Makefile --- xen/arch/x86/mm/Makefile +++ xen-redhat/arch/x86/mm/Makefile @@ -3,3 +3,4 @@ subdir-y += hap obj-y += paging.o obj-y += p2m.o +obj-y += p2m-ept.o diff -Naurp xen/arch/x86/mm/p2m.c xen-redhat/arch/x86/mm/p2m.c --- xen/arch/x86/mm/p2m.c +++ xen-redhat/arch/x86/mm/p2m.c @@ -27,11 +27,16 @@ #include <asm/page.h> #include <asm/paging.h> #include <asm/p2m.h> +#include <asm/hvm/vmx/vmx.h> /* ept_p2m_init() */ +#include <xen/iommu.h> /* Debugging and auditing of the P2M code? */ #define P2M_AUDIT 0 #define P2M_DEBUGGING 1 +static int opt_hap_1gb = 0; +boolean_param("hap_1gb", opt_hap_1gb); + /* * The P2M lock. This protects all updates to the p2m table. * Updates are expected to be safe against concurrent reads, @@ -47,6 +52,9 @@ (_d)->arch.p2m.locker_function = "nobody"; \ } while (0) +#define p2m_locked_by_me(_d) \ + (current->processor == (_d)->arch.p2m.locker) + #define p2m_lock(_d) \ do { \ if ( unlikely((_d)->arch.p2m.locker == current->processor) )\ @@ -92,8 +100,6 @@ #undef page_to_mfn #define page_to_mfn(_pg) (_mfn((_pg) - frame_table)) - - // Find the next level's P2M entry, checking for out-of-range gfn's... // Returns NULL on error. 
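Illustrative aside, not part of the diff: the superpage support added to p2m.c below relies on the fact that, with 4K pages on x86, an L2 entry spans 2^9 frames (order 9, a 2MB mapping) and an L3 entry spans 2^18 frames (order 18, a 1GB mapping). A minimal sketch of the alignment check that the reworked set_p2m_entry() wrapper later in this file's diff uses to pick a mapping order; the helper name here is hypothetical:

    /* Sketch only: choose the largest mapping order for which the gfn, the
     * mfn and the number of frames still to map are all suitably aligned. */
    static unsigned int order_for_mapping(unsigned long gfn, unsigned long mfn,
                                          unsigned long todo)
    {
        if ( ((gfn | mfn | todo) & ((1UL << 18) - 1)) == 0 )
            return 18;   /* 1GB superpage (one L3 entry) */
        if ( ((gfn | mfn | todo) & ((1UL << 9) - 1)) == 0 )
            return 9;    /* 2MB superpage (one L2 entry) */
        return 0;        /* single 4KB page (one L1 entry) */
    }

The splitting code in the following p2m_next_level() hunk is the inverse operation: when a smaller-order entry is needed inside an existing superpage, the 1GB or 2MB entry is broken down into a full table of next-level entries that inherit its flags.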
// @@ -123,9 +129,11 @@ p2m_next_level(struct domain *d, mfn_t * unsigned long *gfn_remainder, unsigned long gfn, u32 shift, u32 max, unsigned long type) { + l1_pgentry_t *l1_entry; l1_pgentry_t *p2m_entry; l1_pgentry_t new_entry; void *next; + int i; ASSERT(d->arch.p2m.alloc_page); if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn, @@ -140,10 +148,8 @@ p2m_next_level(struct domain *d, mfn_t * list_add_tail(&pg->list, &d->arch.p2m.pages); pg->u.inuse.type_info = type | 1 | PGT_validated; pg->count_info = 1; - new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), __PAGE_HYPERVISOR|_PAGE_USER); - switch ( type ) { case PGT_l3_page_table: paging_write_p2m_entry(d, gfn, @@ -166,6 +172,70 @@ p2m_next_level(struct domain *d, mfn_t * break; } } + + /* split 1GB pages into 2MB pages */ + if ( type == PGT_l2_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) ) + { + unsigned long flags, pfn; + struct page_info *pg = d->arch.p2m.alloc_page(d); + if ( pg == NULL ) + return 0; + list_add_tail(&pg->list, &d->arch.p2m.pages); + pg->u.inuse.type_info = PGT_l1_page_table | 1| PGT_validated; + pg->count_info = 1; + + flags = l1e_get_flags(*p2m_entry); + pfn = l1e_get_pfn(*p2m_entry); + + l1_entry = map_domain_page(mfn_x(page_to_mfn(pg))); + for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) + { + new_entry = l1e_from_pfn(pfn + (i * L1_PAGETABLE_ENTRIES), flags); + paging_write_p2m_entry(d, gfn, l1_entry+i, *table_mfn, new_entry, + 2); + } + unmap_domain_page(l1_entry); + new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), + __PAGE_HYPERVISOR|_PAGE_USER); + paging_write_p2m_entry(d, gfn, + p2m_entry, *table_mfn, new_entry, 3); + } + + /* split single large page into 4KB page in P2M table */ + if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) ) + { + unsigned long flags, pfn; + struct page_info *pg = d->arch.p2m.alloc_page(d); + if ( pg == NULL ) + return 0; + list_add_tail(&pg->list, &d->arch.p2m.pages); + pg->u.inuse.type_info = PGT_l1_page_table | 1 | PGT_validated; + pg->count_info = 1; + + /* New splintered mappings inherit the flags of the old superpage, + * with a little reorganisation for the _PAGE_PSE_PAT bit. 
*/ + flags = l1e_get_flags(*p2m_entry); + pfn = l1e_get_pfn(*p2m_entry); + if ( pfn & 1 ) /* ==> _PAGE_PSE_PAT was set */ + pfn -= 1; /* Clear it; _PAGE_PSE becomes _PAGE_PAT */ + else + flags &= ~_PAGE_PSE; /* Clear _PAGE_PSE (== _PAGE_PAT) */ + + l1_entry = map_domain_page(mfn_x(page_to_mfn(pg))); + for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) + { + new_entry = l1e_from_pfn(pfn + i, flags); + paging_write_p2m_entry(d, gfn, + l1_entry+i, *table_mfn, new_entry, 1); + } + unmap_domain_page(l1_entry); + + new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), + __PAGE_HYPERVISOR|_PAGE_USER); + paging_write_p2m_entry(d, gfn, + p2m_entry, *table_mfn, new_entry, 2); + } + *table_mfn = _mfn(l1e_get_pfn(*p2m_entry)); next = map_domain_page(mfn_x(*table_mfn)); unmap_domain_page(*table); @@ -176,7 +246,8 @@ p2m_next_level(struct domain *d, mfn_t * // Returns 0 on error (out of memory) static int -set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, u32 l1e_flags) +p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, + int order, u32 l1e_flags) { // XXX -- this might be able to be faster iff current->domain == d mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table); @@ -184,7 +255,12 @@ set_p2m_entry(struct domain *d, unsigned unsigned long gfn_remainder = gfn; l1_pgentry_t *p2m_entry; l1_pgentry_t entry_content; + l2_pgentry_t l2e_content; + p2m_type_t p2mt = p2m_flags_to_type(l1e_flags); int rv=0; +#if CONFIG_PAGING_LEVELS >= 3 + l3_pgentry_t l3e_content; +#endif #if CONFIG_PAGING_LEVELS >= 4 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, @@ -193,40 +269,104 @@ set_p2m_entry(struct domain *d, unsigned goto out; #endif #if CONFIG_PAGING_LEVELS >= 3 - // When using PAE Xen, we only allow 33 bits of pseudo-physical - // address in translated guests (i.e. 8 GBytes). This restriction - // comes from wanting to map the P2M table into the 16MB RO_MPT hole - // in Xen's address space for translated PV guests. - // - if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, - L3_PAGETABLE_SHIFT - PAGE_SHIFT, - (CONFIG_PAGING_LEVELS == 3 - ? 8 - : L3_PAGETABLE_ENTRIES), - PGT_l2_page_table) ) + /* Try to allocate 1GB page table if this feature is supported. + * When using PAE Xen, we only allow 33 bits of pseudo-physical + * address in translated guests (i.e. 8 GBytes). This restriction + * comes from wanting to map the P2M table into the 16MB RO_MPT hole + * in Xen's address space for translated PV guests. + * When using AMD's NPT on PAE Xen, we are restricted to 4GB. + */ + if ( order == 18 ) /* 1GB page */ + { + p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn, + L3_PAGETABLE_SHIFT - PAGE_SHIFT, + L3_PAGETABLE_ENTRIES); + ASSERT(p2m_entry); + if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) && + !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) ) + { + P2M_ERROR("configure P2M table L3 entry with large page\n"); + domain_crash(d); + goto out; + } + + if ( mfn_valid(mfn) ) + l3e_content = l3e_from_pfn(mfn_x(mfn), + __PAGE_HYPERVISOR|_PAGE_USER|_PAGE_PSE); + else + l3e_content = l3e_empty(); + + entry_content.l1 = l3e_content.l3; + paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 3); + } + else if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, + L3_PAGETABLE_SHIFT - PAGE_SHIFT, + ((CONFIG_PAGING_LEVELS == 3) + ? (hvm_funcs.hap_supported ? 
4 : 8) + : L3_PAGETABLE_ENTRIES), + PGT_l2_page_table) ) goto out; #endif - if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, - L2_PAGETABLE_SHIFT - PAGE_SHIFT, - L2_PAGETABLE_ENTRIES, PGT_l1_page_table) ) - goto out; + + if ( order == 0 ) + { + if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, + L2_PAGETABLE_SHIFT - PAGE_SHIFT, + L2_PAGETABLE_ENTRIES, PGT_l1_page_table) ) + goto out; + + p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn, + 0, L1_PAGETABLE_ENTRIES); + ASSERT(p2m_entry); + + if ( mfn_valid(mfn) || p2mt == p2m_mmio_direct ) + entry_content = l1e_from_pfn(mfn_x(mfn), l1e_flags); + else + entry_content = l1e_empty(); + + /* level 1 entry */ + paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 1); + } + else if ( order == 9 ) + { + p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn, + L2_PAGETABLE_SHIFT - PAGE_SHIFT, + L2_PAGETABLE_ENTRIES); + ASSERT(p2m_entry); + + if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) && + !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) ) + { + P2M_ERROR("configure P2M table 4KB L2 entry with large page\n"); + domain_crash(d); + goto out; + } - p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn, - 0, L1_PAGETABLE_ENTRIES); - ASSERT(p2m_entry); + if ( mfn_valid(mfn) ) + l2e_content = l2e_from_pfn(mfn_x(mfn), + __PAGE_HYPERVISOR|_PAGE_USER|_PAGE_PSE); + else + l2e_content = l2e_empty(); + + entry_content.l1 = l2e_content.l2; + paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 2); + } /* Track the highest gfn for which we have ever had a valid mapping */ if ( mfn_valid(mfn) && (gfn > d->arch.p2m.max_mapped_pfn) ) - d->arch.p2m.max_mapped_pfn = gfn; - - if ( mfn_valid(mfn) ) - entry_content = l1e_from_pfn(mfn_x(mfn), l1e_flags); - else - entry_content = l1e_empty(); - - /* level 1 entry */ - paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 1); - + d->arch.p2m.max_mapped_pfn = gfn + (1UL << order) - 1; + + if ( iommu_enabled ) + { + int i; + if ( p2mt == p2m_ram_rw ) + for ( i = 0; i < (1UL << order); i++ ) + iommu_map_page(d, gfn+i, mfn_x(mfn)+i ); + else + for ( i = 0; i < (1UL << order); i++ ) + iommu_unmap_page(d, gfn+i); + } + /* Success */ rv = 1; @@ -235,15 +375,60 @@ set_p2m_entry(struct domain *d, unsigned return rv; } +static mfn_t +p2m_gfn_to_mfn(struct domain *d, unsigned long gfn); /* Init the datastructures for later use by the p2m code */ void p2m_init(struct domain *d) { p2m_lock_init(d); INIT_LIST_HEAD(&d->arch.p2m.pages); + + d->arch.p2m.set_entry = p2m_set_entry; + d->arch.p2m.get_entry = p2m_gfn_to_mfn; + d->arch.p2m.get_entry_fast = p2m_gfn_to_mfn_fast; + d->arch.p2m.change_entry_type_global = p2m_set_flags_global; + + if ( is_hvm_domain(d) && hap_enabled(d) && + (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) ) + ept_p2m_init(d); +} + +void p2m_change_entry_type_global(struct domain *d, u32 l1e_flags) +{ + p2m_lock(d); + d->arch.p2m.change_entry_type_global(d, l1e_flags); + p2m_unlock(d); } +static inline +int set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, + int page_order, u32 l1e_flags) +{ + unsigned long todo = 1ul << page_order; + unsigned int order; + int rc = 0; + + while ( todo ) + { + /* decide which page mode to use */ + if ( hap_enabled(d) ) + order = ( (((gfn | mfn_x(mfn) | todo) & ((1ul << 18) - 1)) == 0) && + hap_1gb_pgtb(d) && opt_hap_1gb ) ? 18 : + (((gfn | mfn_x(mfn) | todo) & ((1ul << 9) - 1)) == 0) ? 
9 : 0; + else + order = 0; + + rc = d->arch.p2m.set_entry(d, gfn, mfn, order, l1e_flags); + gfn += 1ul << order; + if ( mfn_x(mfn) != INVALID_MFN ) + mfn = _mfn(mfn_x(mfn) + (1ul << order)); + todo -= 1ul << order; + } + return rc; +} + // Allocate a new p2m table for a domain. // // The structure of the p2m table is that of a pagetable for xen (i.e. it is @@ -305,7 +490,7 @@ int p2m_alloc_table(struct domain *d, /* Initialise physmap tables for slot zero. Other code assumes this. */ gfn = 0; mfn = _mfn(INVALID_MFN); - if ( !set_p2m_entry(d, gfn, mfn, __PAGE_HYPERVISOR|_PAGE_USER) ) + if ( !set_p2m_entry(d, gfn, mfn, 0, __PAGE_HYPERVISOR|_PAGE_USER) ) goto error; for ( entry = d->page_list.next; @@ -323,7 +508,7 @@ int p2m_alloc_table(struct domain *d, (gfn != 0x55555555L) #endif && gfn != INVALID_M2P_ENTRY - && !set_p2m_entry(d, gfn, mfn, __PAGE_HYPERVISOR|_PAGE_USER) ) + && !set_p2m_entry(d, gfn, mfn, 0, __PAGE_HYPERVISOR|_PAGE_USER) ) goto error; } @@ -358,7 +543,7 @@ void p2m_teardown(struct domain *d) } mfn_t -gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn) +p2m_gfn_to_mfn(struct domain *d, unsigned long gpfn) /* Read another domain's p2m entries */ { mfn_t mfn; @@ -405,6 +590,14 @@ gfn_to_mfn_foreign(struct domain *d, uns unmap_domain_page(l3e); return _mfn(INVALID_MFN); } + else if ( (l3e_get_flags(*l3e) & _PAGE_PSE) ) + { + mfn = _mfn(l3e_get_pfn(*l3e) + + l2_table_offset(addr) * L2_PAGETABLE_ENTRIES + + l1_table_offset(addr)); + unmap_domain_page(l3e); + return mfn_valid(mfn) ? mfn : _mfn(INVALID_MFN); + } mfn = _mfn(l3e_get_pfn(*l3e)); unmap_domain_page(l3e); } @@ -417,6 +610,14 @@ gfn_to_mfn_foreign(struct domain *d, uns unmap_domain_page(l2e); return _mfn(INVALID_MFN); } + else if ( (l2e_get_flags(*l2e) & _PAGE_PSE) ) + { + mfn = _mfn(l2e_get_pfn(*l2e) + l1_table_offset(addr)); + unmap_domain_page(l2e); + + return mfn_valid(mfn) ? mfn : _mfn(INVALID_MFN); + } + mfn = _mfn(l2e_get_pfn(*l2e)); unmap_domain_page(l2e); @@ -504,7 +705,7 @@ static void audit_p2m(struct domain *d) /* This m2p entry is stale: the domain has another frame in * this physical slot. No great disaster, but for neatness, * blow away the m2p entry. */ - set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY, __PAGE_HYPERVISOR|_PAGE_USER); + set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY); } if ( test_linear && (gfn <= d->arch.p2m.max_mapped_pfn) ) @@ -562,6 +763,30 @@ static void audit_p2m(struct domain *d) gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT); continue; } + + /* check for super page */ + if ( l3e_get_flags(l3e[i3]) & _PAGE_PSE ) + { + mfn = l3e_get_pfn(l3e[i3]); + ASSERT(mfn_valid(_mfn(mfn))); + /* we have to cover 512x512 4K pages */ + for ( i2 = 0; + i2 < (L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES); + i2++) + { + m2pfn = get_gpfn_from_mfn(mfn+i2); + if ( m2pfn != (gfn + i2) ) + { + pmbad++; + P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx" + " -> gfn %#lx\n", gfn+i2, mfn+i2, + m2pfn); + BUG(); + } + } + gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT); + continue; + } l2e = map_domain_page(mfn_x(_mfn(l3e_get_pfn(l3e[i3])))); #endif /* all levels... 
*/ for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ ) @@ -571,6 +796,29 @@ static void audit_p2m(struct domain *d) gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT); continue; } + + + /* check for super page */ + if ( l2e_get_flags(l2e[i2]) & _PAGE_PSE ) + { + mfn = l2e_get_pfn(l2e[i2]); + ASSERT(mfn_valid(_mfn(mfn))); + for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++) + { + m2pfn = get_gpfn_from_mfn(mfn+i1); + if ( m2pfn != (gfn + i) ) + { + pmbad++; + P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx" + " -> gfn %#lx\n", gfn+i, mfn+i, + m2pfn); + BUG(); + } + } + gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT); + continue; + } + l1e = map_domain_page(mfn_x(_mfn(l2e_get_pfn(l2e[i2])))); for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ ) @@ -624,8 +872,10 @@ static void audit_p2m(struct domain *d) static void -p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn) +p2m_remove_page(struct domain *d, unsigned long gfn, + unsigned long mfn, int order) { + unsigned long i; if ( !paging_mode_translate(d) ) return; P2M_DEBUG("removing gfn=%#lx mfn=%#lx\n", gfn, mfn); @@ -633,30 +883,50 @@ p2m_remove_page(struct domain *d, unsign ASSERT(mfn_x(gfn_to_mfn(d, gfn)) == mfn); //ASSERT(mfn_to_gfn(d, mfn) == gfn); - set_p2m_entry(d, gfn, _mfn(INVALID_MFN), __PAGE_HYPERVISOR|_PAGE_USER); - set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY); + set_p2m_entry(d, gfn, _mfn(INVALID_MFN), order, + __PAGE_HYPERVISOR|_PAGE_USER); + for ( i = 0; i < (1UL << order); i++ ) + set_gpfn_from_mfn(mfn+i, INVALID_M2P_ENTRY); } void guest_physmap_remove_page(struct domain *d, unsigned long gfn, - unsigned long mfn) + unsigned long mfn, int order) { p2m_lock(d); audit_p2m(d); - p2m_remove_page(d, gfn, mfn); + p2m_remove_page(d, gfn, mfn, order); audit_p2m(d); p2m_unlock(d); } -void -guest_physmap_add_page(struct domain *d, unsigned long gfn, - unsigned long mfn) +int +guest_physmap_add_entry(struct domain *d, unsigned long gfn, + unsigned long mfn, int order, u32 l1e_flags) { - unsigned long ogfn; + unsigned long ogfn, i; mfn_t omfn; + int rc = 0; if ( !paging_mode_translate(d) ) - return; + return -EINVAL; + +#if CONFIG_PAGING_LEVELS == 3 + /* + * 32bit AMD nested paging does not support over 4GB guest due to + * hardware translation limit. This limitation is checked by comparing + * gfn with 0xfffffUL. 
+ */ + if ( paging_mode_hap(d) && (gfn > 0xfffffUL) && + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) ) + { + if ( !test_and_set_bool(d->arch.hvm_domain.amd_npt_4gb_warning) ) + dprintk(XENLOG_WARNING, "Dom%d failed to populate memory beyond" + " 4GB: remove 'hap' Xen boot parameter.\n", + d->domain_id); + return -EINVAL; + } +#endif p2m_lock(d); audit_p2m(d); @@ -666,8 +936,11 @@ guest_physmap_add_page(struct domain *d, omfn = gfn_to_mfn(d, gfn); if ( mfn_valid(omfn) ) { - set_p2m_entry(d, gfn, _mfn(INVALID_MFN), __PAGE_HYPERVISOR|_PAGE_USER); - set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY); + if ( !set_p2m_entry(d, gfn, _mfn(INVALID_MFN), order, l1e_flags) ) + rc = -EINVAL; + + for ( i = 0; i < (1UL << order); i++) + set_gpfn_from_mfn(mfn_x(omfn)+i, INVALID_M2P_ENTRY); } ogfn = mfn_to_gfn(d, _mfn(mfn)); @@ -688,15 +961,34 @@ guest_physmap_add_page(struct domain *d, P2M_DEBUG("old gfn=%#lx -> mfn %#lx\n", ogfn , mfn_x(omfn)); if ( mfn_x(omfn) == mfn ) - p2m_remove_page(d, ogfn, mfn); + p2m_remove_page(d, ogfn, mfn, order); } } - set_p2m_entry(d, gfn, _mfn(mfn), __PAGE_HYPERVISOR|_PAGE_USER); - set_gpfn_from_mfn(mfn, gfn); + if ( !set_p2m_entry(d, gfn, _mfn(mfn), order, l1e_flags) ) + rc = -EINVAL; + + for ( i = 0; i < (1UL << order); i++ ) + set_gpfn_from_mfn(mfn+i, gfn+i); audit_p2m(d); p2m_unlock(d); + + return rc; +} + +int +guest_physmap_add_page(struct domain *d, unsigned long gfn, + unsigned long mfn, int order) +{ + int ret = 0; + + + ret = guest_physmap_add_entry(d, gfn, mfn, order, + __PAGE_HYPERVISOR | _PAGE_USER); + + /* TODO: fix exit path when failure */ + return ret; } /* This function goes through P2M table and modify l1e flags of all pages. Note @@ -706,15 +998,16 @@ guest_physmap_add_page(struct domain *d, */ void p2m_set_flags_global(struct domain *d, u32 l1e_flags) { - unsigned long mfn, gfn; + unsigned long mfn, gfn, flags; l1_pgentry_t l1e_content; l1_pgentry_t *l1e; l2_pgentry_t *l2e; - mfn_t l1mfn; + mfn_t l1mfn, l2mfn; int i1, i2; #if CONFIG_PAGING_LEVELS >= 3 l3_pgentry_t *l3e; int i3; + mfn_t l3mfn; #if CONFIG_PAGING_LEVELS == 4 l4_pgentry_t *l4e; int i4; @@ -727,13 +1020,15 @@ void p2m_set_flags_global(struct domain if ( pagetable_get_pfn(d->arch.phys_table) == 0 ) return; - p2m_lock(d); - + ASSERT(p2m_locked_by_me(d)); + #if CONFIG_PAGING_LEVELS == 4 l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table))); #elif CONFIG_PAGING_LEVELS == 3 + l3mfn = _mfn(mfn_x(pagetable_get_mfn(d->arch.phys_table))); l3e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table))); #else /* CONFIG_PAGING_LEVELS == 2 */ + l2mfn = _mfn(mfn_x(pagetable_get_mfn(d->arch.phys_table))); l2e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table))); #endif @@ -745,6 +1040,7 @@ void p2m_set_flags_global(struct domain { continue; } + l3mfn = _mfn(l4e_get_pfn(l4e[i4])); l3e = map_domain_page(l4e_get_pfn(l4e[i4])); #endif /* now at levels 3 or 4... */ for ( i3 = 0; @@ -755,6 +1051,19 @@ void p2m_set_flags_global(struct domain { continue; } + if ( (l3e_get_flags(l3e[i3]) & _PAGE_PSE) ) + { + flags = l3e_get_flags(l3e[i3]); + mfn = l3e_get_pfn(l3e[i3]); + gfn = get_gpfn_from_mfn(mfn); + flags = l1e_flags; + l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE); + paging_write_p2m_entry(d, gfn, (l1_pgentry_t *)&l3e[i3], + l3mfn, l1e_content, 3); + continue; + } + + l2mfn = _mfn(l3e_get_pfn(l3e[i3])); l2e = map_domain_page(l3e_get_pfn(l3e[i3])); #endif /* all levels... 
*/ for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ ) @@ -764,6 +1073,18 @@ void p2m_set_flags_global(struct domain continue; } + if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE) ) + { + flags = l2e_get_flags(l2e[i2]); + mfn = l2e_get_pfn(l2e[i2]); + gfn = get_gpfn_from_mfn(mfn); + flags = l1e_flags; + l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE); + paging_write_p2m_entry(d, gfn, (l1_pgentry_t *)&l2e[i2], + l2mfn, l1e_content, 2); + continue; + } + l1mfn = _mfn(l2e_get_pfn(l2e[i2])); l1e = map_domain_page(mfn_x(l1mfn)); @@ -797,7 +1118,6 @@ void p2m_set_flags_global(struct domain unmap_domain_page(l2e); #endif - p2m_unlock(d); } /* This function traces through P2M table and modifies l1e flags of a specific @@ -813,13 +1133,56 @@ int p2m_set_flags(struct domain *d, padd gfn = gpa >> PAGE_SHIFT; mfn = gfn_to_mfn(d, gfn); if ( mfn_valid(mfn) ) - set_p2m_entry(d, gfn, mfn, l1e_flags); + set_p2m_entry(d, gfn, mfn, 0, l1e_flags); p2m_unlock(d); return 1; } +int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn) +{ + int rc = 0; + unsigned long omfn; + + if ( !paging_mode_translate(d) ) + return 0; + + omfn = gmfn_to_mfn(d, gfn); + if ( INVALID_MFN != omfn ) + { + ASSERT(mfn_valid(_mfn(omfn))); + set_gpfn_from_mfn(omfn, INVALID_M2P_ENTRY); + } + + rc = set_p2m_entry(d, gfn, mfn, 0, p2m_type_to_flags(p2m_mmio_direct)); + if ( 0 == rc ) + gdprintk(XENLOG_ERR, + "set_mmio_p2m_entry: set_p2m_entry failed! mfn=%08lx\n", + gmfn_to_mfn(d, gfn)); + return rc; +} + +int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn) +{ + int rc = 0; + unsigned long mfn; + + if ( !paging_mode_translate(d) ) + return 0; + + mfn = gmfn_to_mfn(d, gfn); + if ( INVALID_MFN == mfn ) + { + gdprintk(XENLOG_ERR, + "clear_mmio_p2m_entry: gfn_to_mfn failed! gfn=%08lx\n", gfn); + return 0; + } + rc = set_p2m_entry(d, gfn, _mfn(INVALID_MFN), 0, 0); + + return rc; +} + /* * Local variables: * mode: C diff -Naurp xen/arch/x86/mm/p2m-ept.c xen-redhat/arch/x86/mm/p2m-ept.c --- xen/arch/x86/mm/p2m-ept.c +++ xen-redhat/arch/x86/mm/p2m-ept.c @@ -0,0 +1,570 @@ +/* + * p2m-ept.c: use the EPT page table as p2m + * Copyright (c) 2007, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ */ + +#include <xen/config.h> +#include <xen/domain_page.h> +#include <xen/sched.h> +#include <asm/current.h> +#include <asm/types.h> +#include <asm/domain.h> +#include <asm/hvm/vmx/vmx.h> +#include <xen/iocap.h> +#include <asm/mtrr.h> + +#if 1 /* XEN_VERSION == 3 && XEN_SUBVERSION < 2 */ + +static int ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, + int order, p2m_type_t p2mt); +mfn_t ept_get_entry(struct domain *d, unsigned long gfn, p2m_type_t *t); +static mfn_t ept_get_entry_fast(unsigned long gfn, p2m_type_t *t); + +static p2m_type_t ept_flags_to_p2m_type(u32 l1e_flags) +{ + if ( l1e_flags & _PAGE_RW ) + return p2m_ram_rw; + else if ( paging_mode_log_dirty(current->domain) ) + return p2m_ram_logdirty; + return p2m_invalid; +} + +static inline int +compat_ept_set_entry(struct domain *d, unsigned long gfn, + mfn_t mfn, int order, u32 l1e_flags) +{ + p2m_type_t t = ept_flags_to_p2m_type(l1e_flags); + if ( t == p2m_ram_rw && mfn_x(mfn) != INVALID_MFN && + iomem_access_permitted(d, mfn_x(mfn), mfn_x(mfn)) ) + t = p2m_mmio_direct; + + return ept_set_entry(d, gfn, mfn, order, t); +} + +static mfn_t compat_ept_get_entry(struct domain *d, unsigned long gfn) +{ + p2m_type_t dummy; + return ept_get_entry(d, gfn, &dummy); +} + +static mfn_t compat_ept_get_entry_fast(unsigned long gfn) +{ + p2m_type_t dummy; + return ept_get_entry_fast(gfn, &dummy); +} +#else + +#define compat_ept_set_entry ept_set_entry +#define compat_ept_get_entry ept_get_entry +#define compat_ept_get_entry_fast + +#endif + +uint8_t epte_get_entry_emt( + struct domain *d, unsigned long gfn, + unsigned long mfn, uint8_t *igmt, int direct_mmio) +{ + struct vcpu *v = current; + + *igmt = 0; + + if ( (current->domain != d) && ((v = d->vcpu[0]) == NULL) ) + return MTRR_TYPE_WRBACK; + + if ( !mfn_valid(mfn) ) + return MTRR_TYPE_UNCACHABLE; + + if ( !iommu_enabled ) + { + *igmt = 1; + return MTRR_TYPE_WRBACK; + } + + if ( direct_mmio ) + return MTRR_TYPE_UNCACHABLE; + + if ( iommu_snoop ) + { + *igmt = 1; + return MTRR_TYPE_WRBACK; + } + + return MTRR_TYPE_WRBACK; +} + +static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type) +{ + switch(type) + { + case p2m_ram_rw: + entry->r = entry->w = entry->x = 1; + return; + case p2m_mmio_direct: + entry->r = entry->w = entry->x = 1; + return; + case p2m_ram_logdirty: + case p2m_ram_ro: + entry->r = entry->x = 1; + entry->w = 0; + return; + case p2m_invalid: + case p2m_mmio_dm: + default: + return; + } +} + +#define GUEST_TABLE_NORMAL_PAGE 1 +#define GUEST_TABLE_SUPER_PAGE 2 +#define GUEST_TABLE_SPLIT_PAGE 3 + +static struct page_info *ept_alloc_middle_page(struct domain *d) +{ + struct page_info *pg; + + pg = d->arch.p2m.alloc_page(d); + if ( pg == NULL ) + return NULL; + + pg->count_info = 1; + pg->u.inuse.type_info = 1 | PGT_validated; + list_add_tail(&pg->list, &d->arch.p2m.pages); + + return pg; +} + +static ept_entry_t ept_set_middle_entry(ept_entry_t *ept_entry, + struct page_info *pg) +{ + ept_entry_t e; + + e.epte = 0; + e.mfn = page_to_mfn(pg); + /* last step */ + e.r = e.w = e.x = 1; + *ept_entry = e; + return e; +} + +static int ept_next_level(struct domain *d, bool_t read_only, + ept_entry_t **table, unsigned long *gfn_remainder, + u32 shift, int order) +{ + ept_entry_t ept_entry, *next; + struct page_info *pg; + u32 index; + + index = *gfn_remainder >> shift; + + ept_entry = (*table)[index]; + + if ( !(ept_entry.epte & 0x7) ) + { + if ( read_only ) + return 0; + + pg = ept_alloc_middle_page(d); + if ( pg == NULL ) + return 0; + ept_entry = 
ept_set_middle_entry((*table) + index, pg); + } + + if ( !ept_entry.sp_avail ) + { + *gfn_remainder &= (1UL << shift) - 1; + next = map_domain_page(ept_entry.mfn); + unmap_domain_page(*table); + *table = next; + return GUEST_TABLE_NORMAL_PAGE; + } + else + { + if ( order == shift || read_only ) + return GUEST_TABLE_SUPER_PAGE; + else + return GUEST_TABLE_SPLIT_PAGE; + } +} + +static int +ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, + int order, p2m_type_t p2mt) +{ + ept_entry_t *table = NULL; + unsigned long gfn_remainder = gfn, offset=0; + ept_entry_t ept_entry; + u32 index; + int i, rv = 0, ret = 0; + int walk_level = order / EPT_TABLE_ORDER; + int direct_mmio = (p2mt == p2m_mmio_direct); + uint8_t igmt = 0; + int need_modify_vtd_table = 1; + + /* We only support 4k and 2m pages now */ + + BUG_ON(order && order != EPT_TABLE_ORDER); + + if ( order != 0 ) + if ( (gfn & ((1UL << order) - 1)) ) + return 1; + + table = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table))); + + ASSERT(table != NULL); + + for ( i = EPT_DEFAULT_GAW; i > walk_level; i-- ) + { + ret = ept_next_level(d, 0, &table, + &gfn_remainder, i * EPT_TABLE_ORDER, order); + if ( !ret ) + goto out; + else if ( ret != GUEST_TABLE_NORMAL_PAGE ) + break; + } + + index = gfn_remainder >> ( i ? (i * EPT_TABLE_ORDER): order); + walk_level = ( i ? ( i * EPT_TABLE_ORDER) : order) / EPT_TABLE_ORDER; + offset = (gfn_remainder & ( ((1 << (i*EPT_TABLE_ORDER)) - 1))); + + ept_entry = table[index]; + + if ( ret != GUEST_TABLE_SPLIT_PAGE ) + { + if ( mfn_valid(mfn_x(mfn)) || (p2mt == p2m_mmio_direct) ) + { + ept_entry.emt = epte_get_entry_emt(d, gfn, mfn_x(mfn), + &igmt, direct_mmio); + ept_entry.igmt = igmt; + ept_entry.sp_avail = walk_level ? 1 : 0; + + if ( ret == GUEST_TABLE_SUPER_PAGE ) + { + if ( ept_entry.mfn == (mfn_x(mfn) - offset) ) + need_modify_vtd_table = 0; + else + ept_entry.mfn = mfn_x(mfn) - offset; + + if ( ept_entry.avail1 == p2m_ram_logdirty && + p2mt == p2m_ram_rw ) + for ( i = 0; i < (1UL << order); i++ ) + paging_mark_dirty(d, mfn_x(mfn)-offset+i); + } + else + { + if ( ept_entry.mfn == mfn_x(mfn) ) + need_modify_vtd_table = 0; + else + ept_entry.mfn = mfn_x(mfn); + } + + + ept_entry.avail1 = p2mt; + ept_entry.rsvd = 0; + ept_entry.avail2 = 0; + /* last step */ + ept_entry.r = ept_entry.w = ept_entry.x = 1; + ept_p2m_type_to_flags(&ept_entry, p2mt); + } + else + ept_entry.epte = 0; + + table[index] = ept_entry; + } + else + { + /* It's super page before, now set one of the 4k pages, so + * we should split the 2m page to 4k pages now. 
+ */ + + ept_entry_t *split_table = NULL; + ept_entry_t split_ept_entry; + unsigned long split_mfn = ept_entry.mfn; + p2m_type_t split_p2mt = ept_entry.avail1; + struct page_info *pg; + + /* alloc new page for new ept middle level entry which is + * before a leaf super entry + */ + + pg = ept_alloc_middle_page(d); + if ( pg == NULL ) + goto out; + + /* split the super page before to 4k pages */ + + split_table = map_domain_page(page_to_mfn(pg)); + offset = gfn & ((1 << EPT_TABLE_ORDER) - 1); + + for ( i = 0; i < 512; i++ ) + { + split_ept_entry = split_table[i]; + split_ept_entry.emt = epte_get_entry_emt(d, + gfn-offset+i, split_mfn+i, + &igmt, direct_mmio); + split_ept_entry.igmt = igmt; + + split_ept_entry.sp_avail = 0; + + split_ept_entry.mfn = split_mfn+i; + + split_ept_entry.avail1 = split_p2mt; + split_ept_entry.rsvd = 0; + split_ept_entry.avail2 = 0; + /* last step */ + split_ept_entry.r = split_ept_entry.w = split_ept_entry.x = 1; + ept_p2m_type_to_flags(&split_ept_entry, split_p2mt); + split_table[i] = split_ept_entry; + } + + /* Set the destinated 4k page as normal */ + split_ept_entry = split_table[offset]; + split_ept_entry.emt = epte_get_entry_emt(d, gfn, mfn_x(mfn), + &igmt, direct_mmio); + split_ept_entry.igmt = igmt; + if ( split_ept_entry.mfn == mfn_x(mfn) ) + need_modify_vtd_table = 0; + else + split_ept_entry.mfn = mfn_x(mfn); + + split_ept_entry.avail1 = p2mt; + ept_p2m_type_to_flags(&split_ept_entry, p2mt); + split_table[offset] = split_ept_entry; + + unmap_domain_page(split_table); + + ept_set_middle_entry(table + index, pg); + } + + /* Track the highest gfn for which we have ever had a valid mapping */ + if ( mfn_valid(mfn_x(mfn)) + && (gfn + (1UL << order) - 1 > d->arch.p2m.max_mapped_pfn) ) + d->arch.p2m.max_mapped_pfn = gfn + (1UL << order) - 1; + + /* Success */ + rv = 1; + + out: + unmap_domain_page(table); + ept_sync_domain(d); + + /* support pci pass-through */ + if ( iommu_enabled && is_hvm_domain(d) + && need_modify_vtd_table) + { + if ( p2mt == p2m_ram_rw ) + { + if ( order == EPT_TABLE_ORDER ) + { + for ( i = 0; i < ( 1 << order ); i++ ) + iommu_map_page(d, gfn-offset+i, mfn_x(mfn)-offset+i); + } + else if ( !order ) + iommu_map_page(d, gfn, mfn_x(mfn)); + } + else + { + if ( order == EPT_TABLE_ORDER ) + { + for ( i = 0; i < ( 1 << order ); i++ ) + iommu_unmap_page(d, gfn-offset+i); + } + else if ( !order ) + iommu_unmap_page(d, gfn); + } + } + + return rv; +} + +/* Read ept p2m entries */ +mfn_t ept_get_entry(struct domain *d, unsigned long gfn, p2m_type_t *t) +{ + ept_entry_t *table = + map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table))); + unsigned long gfn_remainder = gfn; + ept_entry_t ept_entry; + u32 index; + int i, ret=0; + mfn_t mfn = _mfn(INVALID_MFN); + + *t = p2m_mmio_dm; + + /* This pfn is higher than the highest the p2m map currently holds */ + if ( gfn > d->arch.p2m.max_mapped_pfn ) + goto out; + + /* should check if gfn obeys GAW here */ + + for ( i = EPT_DEFAULT_GAW; i > 0; i-- ) + { + ret = ept_next_level(d, 1, &table, &gfn_remainder, + i * EPT_TABLE_ORDER, 0); + if ( !ret ) + goto out; + else if ( ret == GUEST_TABLE_SUPER_PAGE ) + break; + } + + index = gfn_remainder >> ( i * EPT_TABLE_ORDER); + ept_entry = table[index]; + + if ( ept_entry.avail1 != p2m_invalid ) + { + *t = ept_entry.avail1; + mfn = _mfn(ept_entry.mfn); + if ( i ) + { + /* we may meet super pages, and to split into 4k pages + * to emulate p2m table + */ + unsigned long split_mfn = + mfn_x(mfn) + (gfn_remainder & ( ((1 << (i*EPT_TABLE_ORDER)) - 1 ))); + mfn = 
_mfn(split_mfn); + } + } + + out: + unmap_domain_page(table); + return mfn; +} + +static mfn_t ept_get_entry_fast(unsigned long gfn, p2m_type_t *t) +{ + return ept_get_entry(current->domain, gfn, t); +} + +/* Walk the whole p2m table, changing any entries of the old type + * to the new type. This is used in hardware-assisted paging to + * quickly enable or diable log-dirty tracking */ + +static void ept_change_entry_type_global(struct domain *d, + p2m_type_t ot, p2m_type_t nt) +{ + ept_entry_t *l4e, *l3e, *l2e, *l1e; + int i4, i3, i2, i1; + + if ( pagetable_get_pfn(d->arch.phys_table) == 0 ) + return; + + BUG_ON(EPT_DEFAULT_GAW != 3); + + l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table))); + for (i4 = 0; i4 < EPT_PAGETABLE_ENTRIES; i4++ ) + { + if ( !l4e[i4].epte ) + continue; + if ( !l4e[i4].sp_avail ) + { + l3e = map_domain_page(l4e[i4].mfn); + for ( i3 = 0; i3 < EPT_PAGETABLE_ENTRIES; i3++ ) + { + if ( !l3e[i3].epte ) + continue; + if ( !l3e[i3].sp_avail ) + { + l2e = map_domain_page(l3e[i3].mfn); + for ( i2 = 0; i2 < EPT_PAGETABLE_ENTRIES; i2++ ) + { + if ( !l2e[i2].epte ) + continue; + if ( !l2e[i2].sp_avail ) + { + l1e = map_domain_page(l2e[i2].mfn); + for ( i1 = 0; i1 < EPT_PAGETABLE_ENTRIES; i1++ ) + { + if ( !l1e[i1].epte ) + continue; + if ( l1e[i1].avail1 != ot ) + continue; + l1e[i1].avail1 = nt; + ept_p2m_type_to_flags(l1e+i1, nt); + } + unmap_domain_page(l1e); + } + else + { + if ( l2e[i2].avail1 != ot ) + continue; + l2e[i2].avail1 = nt; + ept_p2m_type_to_flags(l2e+i2, nt); + } + } + unmap_domain_page(l2e); + } + else + { + if ( l3e[i3].avail1 != ot ) + continue; + l3e[i3].avail1 = nt; + ept_p2m_type_to_flags(l3e+i3, nt); + } + } + unmap_domain_page(l3e); + } + else + { + if ( l4e[i4].avail1 != ot ) + continue; + l4e[i4].avail1 = nt; + ept_p2m_type_to_flags(l4e+i4, nt); + } + } + unmap_domain_page(l4e); + + ept_sync_domain(d); +} + +static void __ept_change_entry_type_global(struct domain *d, + u32 l1e_flags) +{ + p2m_type_t nt,ot; + + if ( l1e_flags == (__PAGE_HYPERVISOR|_PAGE_USER) ) + { + nt = p2m_ram_rw; + ot = p2m_ram_logdirty; + } + else if ( l1e_flags == (_PAGE_PRESENT|_PAGE_USER) ) + { + nt = p2m_ram_logdirty; + ot = p2m_ram_rw; + } + else + { + nt = ot = p2m_ram_rw; + BUG(); + } + + ept_change_entry_type_global(d, ot, nt); +} + +void ept_p2m_init(struct domain *d) +{ + d->arch.p2m.set_entry = compat_ept_set_entry; + d->arch.p2m.get_entry = compat_ept_get_entry; + d->arch.p2m.get_entry_fast = compat_ept_get_entry_fast; + d->arch.p2m.change_entry_type_global = __ept_change_entry_type_global; +} + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -Naurp xen/arch/x86/mm/paging.c xen-redhat/arch/x86/mm/paging.c --- xen/arch/x86/mm/paging.c +++ xen-redhat/arch/x86/mm/paging.c @@ -27,10 +27,6 @@ #include <asm/hap.h> #include <asm/guest_access.h> -/* Xen command-line option to enable hardware-assisted paging */ -int opt_hap_enabled; -boolean_param("hap", opt_hap_enabled); - /* Printouts */ #define PAGING_PRINTK(_f, _a...) \ debugtrace_printk("pg: %s(): " _f, __func__, ##_a) @@ -362,14 +358,14 @@ void paging_domain_init(struct domain *d shadow_domain_init(d); /* ... but we will use hardware assistance if it's available. 
*/ - if ( opt_hap_enabled && is_hvm_domain(d) ) + if ( hap_enabled(d) ) hap_domain_init(d); } /* vcpu paging struct initialization goes here */ void paging_vcpu_init(struct vcpu *v) { - if ( opt_hap_enabled && is_hvm_vcpu(v) ) + if ( hap_enabled(v->domain) ) hap_vcpu_init(v); else shadow_vcpu_init(v); @@ -429,7 +425,7 @@ int paging_domctl(struct domain *d, xen_ } /* Here, dispatch domctl to the appropriate paging code */ - if ( opt_hap_enabled && is_hvm_domain(d) ) + if ( hap_enabled(d) ) return hap_domctl(d, sc, u_domctl); else return shadow_domctl(d, sc, u_domctl); @@ -438,7 +434,7 @@ int paging_domctl(struct domain *d, xen_ /* Call when destroying a domain */ void paging_teardown(struct domain *d) { - if ( opt_hap_enabled && is_hvm_domain(d) ) + if ( hap_enabled(d) ) hap_teardown(d); else shadow_teardown(d); @@ -450,7 +446,7 @@ void paging_teardown(struct domain *d) /* Call once all of the references to the domain have gone away */ void paging_final_teardown(struct domain *d) { - if ( opt_hap_enabled && is_hvm_domain(d) ) + if ( hap_enabled(d) ) hap_final_teardown(d); else shadow_final_teardown(d); @@ -460,7 +456,7 @@ void paging_final_teardown(struct domain * creation. */ int paging_enable(struct domain *d, u32 mode) { - if ( opt_hap_enabled && is_hvm_domain(d) ) + if ( hap_enabled(d) ) return hap_enable(d, mode | PG_HAP_enable); else return shadow_enable(d, mode | PG_SH_enable); diff -Naurp xen/arch/x86/mm/shadow/common.c xen-redhat/arch/x86/mm/shadow/common.c --- xen/arch/x86/mm/shadow/common.c +++ xen-redhat/arch/x86/mm/shadow/common.c @@ -101,7 +101,7 @@ int _shadow_mode_refcounts(struct domain /* x86 emulator support for the shadow code */ -struct segment_register *hvm_get_seg_reg( +static struct segment_register *hvm_get_seg_reg( enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt) { struct segment_register *seg_reg = &sh_ctxt->seg_reg[seg]; @@ -110,10 +110,6 @@ struct segment_register *hvm_get_seg_reg return seg_reg; } -enum hvm_access_type { - hvm_access_insn_fetch, hvm_access_read, hvm_access_write -}; - static int hvm_translate_linear_addr( enum x86_segment seg, unsigned long offset, @@ -123,76 +119,18 @@ static int hvm_translate_linear_addr( unsigned long *paddr) { struct segment_register *reg = hvm_get_seg_reg(seg, sh_ctxt); - unsigned long limit, addr = offset; - uint32_t last_byte; + int okay; - if ( sh_ctxt->ctxt.addr_size != 64 ) - { - /* - * COMPATIBILITY MODE: Apply segment checks and add base. - */ + okay = hvm_virtual_to_linear_addr( + seg, reg, offset, bytes, access_type, sh_ctxt->ctxt.addr_size, paddr); - switch ( access_type ) - { - case hvm_access_read: - if ( (reg->attr.fields.type & 0xa) == 0x8 ) - goto gpf; /* execute-only code segment */ - break; - case hvm_access_write: - if ( (reg->attr.fields.type & 0xa) != 0x2 ) - goto gpf; /* not a writable data segment */ - break; - default: - break; - } - - /* Calculate the segment limit, including granularity flag. */ - limit = reg->limit; - if ( reg->attr.fields.g ) - limit = (limit << 12) | 0xfff; - - last_byte = offset + bytes - 1; - - /* Is this a grows-down data segment? Special limit check if so. */ - if ( (reg->attr.fields.type & 0xc) == 0x4 ) - { - /* Is upper limit 0xFFFF or 0xFFFFFFFF? */ - if ( !reg->attr.fields.db ) - last_byte = (uint16_t)last_byte; - - /* Check first byte and last byte against respective bounds. 
*/ - if ( (offset <= limit) || (last_byte < offset) ) - goto gpf; - } - else if ( (last_byte > limit) || (last_byte < offset) ) - goto gpf; /* last byte is beyond limit or wraps 0xFFFFFFFF */ - - /* - * Hardware truncates to 32 bits in compatibility mode. - * It does not truncate to 16 bits in 16-bit address-size mode. - */ - addr = (uint32_t)(addr + reg->base); - } - else + if ( !okay ) { - /* - * LONG MODE: FS and GS add segment base. Addresses must be canonical. - */ - - if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) ) - addr += reg->base; - - if ( !is_canonical_address(addr) ) - goto gpf; + hvm_inject_exception(TRAP_gp_fault, 0, 0); + return X86EMUL_EXCEPTION; } - *paddr = addr; - return 0; - - gpf: - /* Inject #GP(0). */ - hvm_inject_exception(TRAP_gp_fault, 0, 0); - return X86EMUL_EXCEPTION; + return 0; } static int @@ -771,14 +709,29 @@ shadow_order(unsigned int shadow_type) } -/* Do we have a free chunk of at least this order? */ -static inline int chunk_is_available(struct domain *d, int order) +static inline unsigned int +shadow_max_order(struct domain *d) { - int i; - - for ( i = order; i <= SHADOW_MAX_ORDER; i++ ) - if ( !list_empty(&d->arch.paging.shadow.freelists[i]) ) - return 1; + return is_hvm_domain(d) ? SHADOW_MAX_ORDER : 0; +} + +/* Do we have at total of count pages of the requested order free? */ +static inline int space_is_available( + struct domain *d, + unsigned int order, + unsigned int count) +{ + for ( ; order <= shadow_max_order(d); ++order ) + { + unsigned int n = count; + const struct list_head *p; + + list_for_each ( p, &d->arch.paging.shadow.freelists[order] ) + if ( --n == 0 ) + return 1; + count = (count + 1) >> 1; + } + return 0; } @@ -814,12 +767,12 @@ static void shadow_unhook_mappings(struc } -/* Make sure there is at least one chunk of the required order available - * in the shadow page pool. This must be called before any calls to - * shadow_alloc(). Since this will free existing shadows to make room, - * it must be called early enough to avoid freeing shadows that the - * caller is currently working on. */ -void shadow_prealloc(struct domain *d, unsigned int order) +/* Make sure there are at least count order-sized pages + * available in the shadow page pool. 
*/ +static void _shadow_prealloc( + struct domain *d, + unsigned int order, + unsigned int count) { /* Need a vpcu for calling unpins; for now, since we don't have * per-vcpu shadows, any will do */ @@ -830,7 +783,8 @@ void shadow_prealloc(struct domain *d, u mfn_t smfn; int i; - if ( chunk_is_available(d, order) ) return; + ASSERT(order <= shadow_max_order(d)); + if ( space_is_available(d, order, count) ) return; v = current; if ( v->domain != d ) @@ -847,8 +801,8 @@ void shadow_prealloc(struct domain *d, u /* Unpin this top-level shadow */ sh_unpin(v, smfn); - /* See if that freed up a chunk of appropriate size */ - if ( chunk_is_available(d, order) ) return; + /* See if that freed up enough space */ + if ( space_is_available(d, order, count) ) return; } /* Stage two: all shadow pages are in use in hierarchies that are @@ -865,8 +819,8 @@ void shadow_prealloc(struct domain *d, u pagetable_get_mfn(v2->arch.shadow_table[i])); cpus_or(flushmask, v2->vcpu_dirty_cpumask, flushmask); - /* See if that freed up a chunk of appropriate size */ - if ( chunk_is_available(d, order) ) + /* See if that freed up enough space */ + if ( space_is_available(d, order, count) ) { flush_tlb_mask(flushmask); return; @@ -876,15 +830,26 @@ void shadow_prealloc(struct domain *d, u /* Nothing more we can do: all remaining shadows are of pages that * hold Xen mappings for some vcpu. This can never happen. */ - SHADOW_ERROR("Can't pre-allocate %i shadow pages!\n" + SHADOW_ERROR("Can't pre-allocate %u order-%u shadow pages!\n" " shadow pages total = %u, free = %u, p2m=%u\n", - 1 << order, + count, order, d->arch.paging.shadow.total_pages, d->arch.paging.shadow.free_pages, d->arch.paging.shadow.p2m_pages); BUG(); } +/* Make sure there are at least count pages of the order according to + * type available in the shadow page pool. + * This must be called before any calls to shadow_alloc(). Since this + * will free existing shadows to make room, it must be called early enough + * to avoid freeing shadows that the caller is currently working on. */ +void shadow_prealloc(struct domain *d, u32 type, unsigned int count) +{ + ASSERT(type != SH_type_p2m_table); + return _shadow_prealloc(d, shadow_order(type), count); +} + /* Deliberately free all the memory we can: this will tear down all of * this domain's shadows */ static void shadow_blow_tables(struct domain *d) @@ -961,7 +926,9 @@ mfn_t shadow_alloc(struct domain *d, int i; ASSERT(shadow_locked_by_me(d)); - ASSERT(order <= SHADOW_MAX_ORDER); + if (shadow_type == SH_type_p2m_table && order > shadow_max_order(d)) + order = shadow_max_order(d); + ASSERT(order <= shadow_max_order(d)); ASSERT(shadow_type != SH_type_none); perfc_incr(shadow_alloc); @@ -1062,7 +1029,7 @@ void shadow_free(struct domain *d, mfn_t } /* Merge chunks as far as possible. 
*/ - while ( order < SHADOW_MAX_ORDER ) + for ( ; order < shadow_max_order(d); ++order ) { mask = 1 << order; if ( (mfn_x(shadow_page_to_mfn(sp)) & mask) ) { @@ -1077,7 +1044,6 @@ void shadow_free(struct domain *d, mfn_t break; list_del(&(sp+mask)->list); } - order++; } sp->order = order; @@ -1099,16 +1065,18 @@ sh_alloc_p2m_pages(struct domain *d) { struct page_info *pg; u32 i; + unsigned int order = shadow_max_order(d); + ASSERT(shadow_locked_by_me(d)); if ( d->arch.paging.shadow.total_pages - < (shadow_min_acceptable_pages(d) + (1<<SHADOW_MAX_ORDER)) ) + < (shadow_min_acceptable_pages(d) + (1 << order)) ) return 0; /* Not enough shadow memory: need to increase it first */ pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0)); - d->arch.paging.shadow.p2m_pages += (1<<SHADOW_MAX_ORDER); - d->arch.paging.shadow.total_pages -= (1<<SHADOW_MAX_ORDER); - for (i = 0; i < (1<<SHADOW_MAX_ORDER); i++) + d->arch.paging.shadow.p2m_pages += (1 << order); + d->arch.paging.shadow.total_pages -= (1 << order); + for (i = 0; i < (1U << order); i++) { /* Unlike shadow pages, mark p2m pages as owned by the domain. * Marking the domain as the owner would normally allow the guest to @@ -1228,7 +1196,7 @@ static unsigned int sh_set_allocation(st { struct shadow_page_info *sp; unsigned int lower_bound; - int j; + unsigned int j, order = shadow_max_order(d); ASSERT(shadow_locked_by_me(d)); @@ -1249,15 +1217,15 @@ static unsigned int sh_set_allocation(st { /* Need to allocate more memory from domheap */ sp = (struct shadow_page_info *) - alloc_domheap_pages(NULL, SHADOW_MAX_ORDER, 0); + alloc_domheap_pages(NULL, order, 0); if ( sp == NULL ) { SHADOW_PRINTK("failed to allocate shadow pages.\n"); return -ENOMEM; } - d->arch.paging.shadow.free_pages += 1<<SHADOW_MAX_ORDER; - d->arch.paging.shadow.total_pages += 1<<SHADOW_MAX_ORDER; - for ( j = 0; j < 1<<SHADOW_MAX_ORDER; j++ ) + d->arch.paging.shadow.free_pages += 1 << order; + d->arch.paging.shadow.total_pages += 1 << order; + for ( j = 0; j < 1U << order; j++ ) { sp[j].type = 0; sp[j].pinned = 0; @@ -1265,21 +1233,20 @@ static unsigned int sh_set_allocation(st sp[j].mbz = 0; sp[j].tlbflush_timestamp = 0; /* Not in any TLB */ } - sp->order = SHADOW_MAX_ORDER; - list_add_tail(&sp->list, - &d->arch.paging.shadow.freelists[SHADOW_MAX_ORDER]); + sp->order = order; + list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[order]); } else if ( d->arch.paging.shadow.total_pages > pages ) { /* Need to return memory to domheap */ - shadow_prealloc(d, SHADOW_MAX_ORDER); - ASSERT(!list_empty(&d->arch.paging.shadow.freelists[SHADOW_MAX_ORDER])); - sp = list_entry(d->arch.paging.shadow.freelists[SHADOW_MAX_ORDER].next, + _shadow_prealloc(d, order, 1); + ASSERT(!list_empty(&d->arch.paging.shadow.freelists[order])); + sp = list_entry(d->arch.paging.shadow.freelists[order].next, struct shadow_page_info, list); list_del(&sp->list); - d->arch.paging.shadow.free_pages -= 1<<SHADOW_MAX_ORDER; - d->arch.paging.shadow.total_pages -= 1<<SHADOW_MAX_ORDER; - free_domheap_pages((struct page_info *)sp, SHADOW_MAX_ORDER); + d->arch.paging.shadow.free_pages -= 1 << order; + d->arch.paging.shadow.total_pages -= 1 << order; + free_domheap_pages((struct page_info *)sp, order); } /* Check to see if we need to yield and try again */ diff -Naurp xen/arch/x86/mm/shadow/multi.c xen-redhat/arch/x86/mm/shadow/multi.c --- xen/arch/x86/mm/shadow/multi.c +++ xen-redhat/arch/x86/mm/shadow/multi.c @@ -35,6 +35,7 @@ #include <asm/hvm/hvm.h> #include "private.h" #include "types.h" +#include <xen/iocap.h> 
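Illustrative aside, not part of the diff: the space_is_available() helper introduced in the shadow/common.c hunks above walks up the free lists and halves its target count at each order. The reason is buddy-style splitting, as done by shadow_alloc() and undone by the merge loop in shadow_free(): one free chunk of order n+1 can be cut into two chunks of order n, so a demand for count order-n pages is covered by ceil(count/2) chunks of order n+1. A one-line sketch of that step; the function name is hypothetical:

    /* Sketch only: how many order-(n+1) chunks cover 'count' order-n pages. */
    static unsigned int chunks_needed_at_next_order(unsigned int count)
    {
        return (count + 1) >> 1;   /* ceil(count / 2), as in space_is_available() */
    }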
/* THINGS TO DO LATER: * @@ -654,7 +655,8 @@ _sh_propagate(struct vcpu *v, goto done; } - if ( level == 1 && mmio ) + if ( level == 1 && mmio && + !iomem_access_permitted(d, mfn_x(target_mfn), mfn_x(target_mfn)) ) { /* Guest l1e maps MMIO space */ *sp = sh_l1e_mmio(guest_l1e_get_gfn(*gp), gflags); @@ -667,7 +669,8 @@ _sh_propagate(struct vcpu *v, // case of a prefetch, an invalid mfn means that we can not usefully // shadow anything, and so we return early. // - if ( !mfn_valid(target_mfn) ) + if ( !mfn_valid(target_mfn) && + !iomem_access_permitted(d, mfn_x(target_mfn), mfn_x(target_mfn)) ) { ASSERT((ft == ft_prefetch)); *sp = shadow_l1e_empty(); @@ -750,6 +753,10 @@ _sh_propagate(struct vcpu *v, sflags |= _PAGE_USER; } + /* MMIO addresses should never be cached */ + if ( iomem_access_permitted(d, mfn_x(target_mfn), mfn_x(target_mfn)) ) + sflags |= _PAGE_PCD; + *sp = shadow_l1e_from_mfn(target_mfn, sflags); done: SHADOW_DEBUG(PROPAGATE, @@ -1661,7 +1668,7 @@ sh_make_monitor_table(struct vcpu *v) ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0); /* Guarantee we can get the memory we need */ - shadow_prealloc(d, SHADOW_MAX_ORDER); + shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS - 1); #if CONFIG_PAGING_LEVELS == 4 { @@ -2815,10 +2822,13 @@ static int sh_page_fault(struct vcpu *v, } /* Make sure there is enough free shadow memory to build a chain of - * shadow tables: one SHADOW_MAX_ORDER chunk will always be enough - * to allocate all we need. (We never allocate a top-level shadow - * on this path, only a 32b l1, pae l2+1 or 64b l3+2+1) */ - shadow_prealloc(d, SHADOW_MAX_ORDER); + * shadow tables. (We never allocate a top-level shadow on this path, + * only a 32b l1, pae l1, or 64b l3+2+1. Note that while + * SH_type_l1_shadow isn't correct in the latter case, all page + * tables are the same size there.) */ + shadow_prealloc(d, + SH_type_l1_shadow, + GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1); /* Acquire the shadow. This must happen before we figure out the rights * for the shadow entry, since we might promote a page here. */ @@ -2905,7 +2915,7 @@ static int sh_page_fault(struct vcpu *v, * stack is currently considered to be a page table, so we should * unshadow the faulting page before exiting. */ - if ( unlikely(hvm_event_injection_faulted(v)) ) + if ( unlikely(hvm_event_pending(v)) ) { gdprintk(XENLOG_DEBUG, "write to pagetable during event " "injection: cr2=%#lx, mfn=%#lx\n", @@ -3439,7 +3449,7 @@ sh_set_toplevel_shadow(struct vcpu *v, if ( !mfn_valid(smfn) ) { /* Make sure there's enough free shadow memory. */ - shadow_prealloc(d, SHADOW_MAX_ORDER); + shadow_prealloc(d, root_type, 1); /* Shadow the page. 
*/ smfn = sh_make_shadow(v, gmfn, root_type); } @@ -4012,7 +4022,8 @@ static inline void * emulate_map_dest(st if ( !(flags & _PAGE_RW) ) goto page_fault; - if ( mfn_valid(mfn) ) + if ( mfn_valid(mfn) && + (mfn_x(mfn) != v->domain->arch.hvm_domain.vmx_apic_access_mfn) ) { *mfnp = mfn; v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn); diff -Naurp xen/arch/x86/mm/shadow/private.h xen-redhat/arch/x86/mm/shadow/private.h --- xen/arch/x86/mm/shadow/private.h +++ xen-redhat/arch/x86/mm/shadow/private.h @@ -243,17 +243,22 @@ struct shadow_page_info /* For non-pinnable shadows, a higher entry that points at us */ paddr_t up; }; +#if NR_CPUS > 64 + /* Need to add some padding to match struct page_info size, + * if cpumask_t is larger than a long + */ + u8 padding[sizeof(cpumask_t)-sizeof(long)]; +#endif }; -/* The structure above *must* be the same size as a struct page_info +/* The structure above *must* be no larger than a struct page_info * from mm.h, since we'll be using the same space in the frametable. * Also, the mbz field must line up with the owner field of normal * pages, so they look properly like anonymous/xen pages. */ static inline void shadow_check_page_struct_offsets(void) { - BUILD_BUG_ON(sizeof (struct shadow_page_info) - != sizeof (struct page_info)); - BUILD_BUG_ON(offsetof(struct shadow_page_info, mbz) - != offsetof(struct page_info, u.inuse._domain)); + BUILD_BUG_ON(sizeof (struct shadow_page_info) > sizeof (struct page_info)); + BUILD_BUG_ON(offsetof(struct shadow_page_info, mbz) != + offsetof(struct page_info, u.inuse._domain)); }; /* Shadow type codes */ @@ -354,7 +359,7 @@ void shadow_promote(struct vcpu *v, mfn_ void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type); /* Shadow page allocation functions */ -void shadow_prealloc(struct domain *d, unsigned int order); +void shadow_prealloc(struct domain *d, u32 shadow_type, unsigned int count); mfn_t shadow_alloc(struct domain *d, u32 shadow_type, unsigned long backpointer); diff -Naurp xen/arch/x86/mm.c xen-redhat/arch/x86/mm.c --- xen/arch/x86/mm.c +++ xen-redhat/arch/x86/mm.c @@ -140,7 +140,7 @@ static DEFINE_PER_CPU(struct percpu_mm_i #define FOREIGNDOM (this_cpu(percpu_mm_info).foreign ?: current->domain) /* Private domain structs for DOMID_XEN and DOMID_IO. */ -static struct domain *dom_xen, *dom_io; +struct domain *dom_xen, *dom_io; /* Frame table and its size in pages. */ struct page_info *frame_table; @@ -465,11 +465,11 @@ static int alloc_segdesc_page(struct pag goto fail; unmap_domain_page(descs); - return 1; + return 0; fail: unmap_domain_page(descs); - return 0; + return -EINVAL; } @@ -523,20 +523,25 @@ static int get_page_from_pagenr(unsigned static int get_page_and_type_from_pagenr(unsigned long page_nr, unsigned long type, - struct domain *d) + struct domain *d, + int partial, + int preemptible) { struct page_info *page = mfn_to_page(page_nr); + int rc; - if ( unlikely(!get_page_from_pagenr(page_nr, d)) ) - return 0; + if ( likely(partial >= 0) && + unlikely(!get_page_from_pagenr(page_nr, d)) ) + return -EINVAL; - if ( unlikely(!get_page_type(page, type)) ) - { + rc = (preemptible ? + get_page_type_preemptible(page, type) : + (get_page_type(page, type) ? 
0 : -EINVAL)); + + if ( unlikely(rc) && partial >= 0 ) put_page(page); - return 0; - } - return 1; + return rc; } /* @@ -667,12 +672,13 @@ get_page_from_l2e( if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) ) { MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK); - return 0; + return -EINVAL; } - rc = get_page_and_type_from_pagenr(l2e_get_pfn(l2e), PGT_l1_page_table, d); - if ( unlikely(!rc) ) - rc = get_l2_linear_pagetable(l2e, pfn, d); + rc = get_page_and_type_from_pagenr( + l2e_get_pfn(l2e), PGT_l1_page_table, d, 0, 0); + if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) ) + rc = 0; return rc; } @@ -682,7 +688,7 @@ get_page_from_l2e( define_get_linear_pagetable(l3); static int get_page_from_l3e( - l3_pgentry_t l3e, unsigned long pfn, struct domain *d) + l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial, int preemptible) { int rc; @@ -692,12 +698,13 @@ get_page_from_l3e( if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) ) { MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d)); - return 0; + return -EINVAL; } - rc = get_page_and_type_from_pagenr(l3e_get_pfn(l3e), PGT_l2_page_table, d); - if ( unlikely(!rc) ) - rc = get_l3_linear_pagetable(l3e, pfn, d); + rc = get_page_and_type_from_pagenr( + l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, preemptible); + if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) ) + rc = 0; return rc; } @@ -707,7 +714,7 @@ get_page_from_l3e( define_get_linear_pagetable(l4); static int get_page_from_l4e( - l4_pgentry_t l4e, unsigned long pfn, struct domain *d) + l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial, int preemptible) { int rc; @@ -717,12 +724,13 @@ get_page_from_l4e( if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) ) { MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK); - return 0; + return -EINVAL; } - rc = get_page_and_type_from_pagenr(l4e_get_pfn(l4e), PGT_l3_page_table, d); - if ( unlikely(!rc) ) - rc = get_l4_linear_pagetable(l4e, pfn, d); + rc = get_page_and_type_from_pagenr( + l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, preemptible); + if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) ) + rc = 0; return rc; } @@ -857,29 +865,47 @@ void put_page_from_l1e(l1_pgentry_t l1e, * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. * Note also that this automatically deals correctly with linear p.t.'s. 
*/ -static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn) +static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn) { - if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) && - (l2e_get_pfn(l2e) != pfn) ) + if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) && + (l2e_get_pfn(l2e) != pfn) ) + { put_page_and_type(l2e_get_page(l2e)); + return 0; + } + return 1; } #if CONFIG_PAGING_LEVELS >= 3 -static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn) +static int __put_page_type(struct page_info *, int preemptible); + +static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, + int partial, int preemptible) { - if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) && - (l3e_get_pfn(l3e) != pfn) ) - put_page_and_type(l3e_get_page(l3e)); + if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) && + (l3e_get_pfn(l3e) != pfn) ) + { + if ( unlikely(partial > 0) ) + return __put_page_type(l3e_get_page(l3e), preemptible); + return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible); + } + return 1; } #endif #if CONFIG_PAGING_LEVELS >= 4 -static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn) +static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, + int partial, int preemptible) { if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) && (l4e_get_pfn(l4e) != pfn) ) - put_page_and_type(l4e_get_page(l4e)); + { + if ( unlikely(partial > 0) ) + return __put_page_type(l4e_get_page(l4e), preemptible); + return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible); + } + return 1; } #endif @@ -888,7 +914,7 @@ static int alloc_l1_table(struct page_in struct domain *d = page_get_owner(page); unsigned long pfn = page_to_mfn(page); l1_pgentry_t *pl1e; - int i; + unsigned int i; pl1e = map_domain_page(pfn); @@ -902,7 +928,7 @@ static int alloc_l1_table(struct page_in } unmap_domain_page(pl1e); - return 1; + return 0; fail: MEM_LOG("Failure in alloc_l1_table: entry %d", i); @@ -911,7 +937,7 @@ static int alloc_l1_table(struct page_in put_page_from_l1e(pl1e[i], d); unmap_domain_page(pl1e); - return 0; + return -EINVAL; } #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT) @@ -1043,61 +1069,57 @@ static void pae_flush_pgd( # define pae_flush_pgd(mfn, idx, nl3e) ((void)0) #endif -static int alloc_l2_table(struct page_info *page, unsigned long type) +static int alloc_l2_table(struct page_info *page, unsigned long type, + int preemptible) { struct domain *d = page_get_owner(page); unsigned long pfn = page_to_mfn(page); l2_pgentry_t *pl2e; - int i; + unsigned int i; + int rc = 0; pl2e = map_domain_page(pfn); - for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) + for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ ) { - if ( is_guest_l2_slot(d, type, i) && - unlikely(!get_page_from_l2e(pl2e[i], pfn, d)) ) - goto fail; - - adjust_guest_l2e(pl2e[i], d); - } + if ( preemptible && i && hypercall_preempt_check() ) + { + page->nr_validated_ptes = i; + rc = -EAGAIN; + break; + } -#if CONFIG_PAGING_LEVELS == 2 - /* Xen private mappings. 
*/ - memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT], - &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT], - L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t)); - pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = - l2e_from_pfn(pfn, __PAGE_HYPERVISOR); - for ( i = 0; i < PDPT_L2_ENTRIES; i++ ) - pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] = - l2e_from_page( - virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i, - __PAGE_HYPERVISOR); -#endif + if ( !is_guest_l2_slot(d, type, i) || + (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 ) + continue; - unmap_domain_page(pl2e); - return 1; + if ( rc < 0 ) + { + MEM_LOG("Failure in alloc_l2_table: entry %d", i); + while ( i-- > 0 ) + if ( is_guest_l2_slot(d, type, i) ) + put_page_from_l2e(pl2e[i], pfn); + break; + } - fail: - MEM_LOG("Failure in alloc_l2_table: entry %d", i); - while ( i-- > 0 ) - if ( is_guest_l2_slot(d, type, i) ) - put_page_from_l2e(pl2e[i], pfn); + adjust_guest_l2e(pl2e[i], d); + } unmap_domain_page(pl2e); - return 0; + return rc > 0 ? 0 : rc; } #if CONFIG_PAGING_LEVELS >= 3 -static int alloc_l3_table(struct page_info *page) +static int alloc_l3_table(struct page_info *page, int preemptible) { struct domain *d = page_get_owner(page); unsigned long pfn = page_to_mfn(page); l3_pgentry_t *pl3e; - int i; + unsigned int i; + int rc = 0, partial = page->partial_pte; -#ifdef CONFIG_X86_PAE +#if CONFIG_PAGING_LEVELS == 3 /* * PAE pgdirs above 4GB are unacceptable if the guest does not understand * the weird 'extended cr3' format for dealing with high-order address @@ -1108,7 +1130,7 @@ static int alloc_l3_table(struct page_in d->vcpu[0] && d->vcpu[0]->is_initialised ) { MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn); - return 0; + return -EINVAL; } #endif @@ -1124,60 +1146,103 @@ static int alloc_l3_table(struct page_in if ( is_pv_32on64_domain(d) ) memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e)); - for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ ) + for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; + i++, partial = 0 ) { -#if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT) if ( is_pv_32bit_domain(d) && (i == 3) ) { if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) || - (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) || - !get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]), - PGT_l2_page_table | - PGT_pae_xen_l2, - d) ) - goto fail; + (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) ) + rc = -EINVAL; + else + rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]), + PGT_l2_page_table | + PGT_pae_xen_l2, + d, partial, preemptible); + } + else if ( !is_guest_l3_slot(i) || + (rc = get_page_from_l3e(pl3e[i], pfn, d, + partial, preemptible)) > 0 ) + continue; + + if ( rc == -EAGAIN ) + { + page->nr_validated_ptes = i; + page->partial_pte = partial ?: 1; } - else -#endif - if ( is_guest_l3_slot(i) && - unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) ) - goto fail; - + else if ( rc == -EINTR && i ) + { + page->nr_validated_ptes = i; + page->partial_pte = 0; + rc = -EAGAIN; + } + if ( rc < 0 ) + break; + adjust_guest_l3e(pl3e[i], d); } - if ( !create_pae_xen_mappings(d, pl3e) ) - goto fail; - - unmap_domain_page(pl3e); - return 1; - - fail: - MEM_LOG("Failure in alloc_l3_table: entry %d", i); - while ( i-- > 0 ) - if ( is_guest_l3_slot(i) ) - put_page_from_l3e(pl3e[i], pfn); + if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) ) + rc = -EINVAL; + if ( rc < 0 && rc != -EAGAIN && rc != -EINTR ) + { + MEM_LOG("Failure in alloc_l3_table: entry %d", i); + while ( i-- > 0 ) + { + if ( !is_guest_l3_slot(i) ) + continue; + 
unadjust_guest_l3e(pl3e[i], d); + put_page_from_l3e(pl3e[i], pfn, 0, 0); + } + } unmap_domain_page(pl3e); - return 0; + return rc > 0 ? 0 : rc; } #else -#define alloc_l3_table(page) (0) +#define alloc_l3_table(page, preemptible) (-EINVAL) #endif #if CONFIG_PAGING_LEVELS >= 4 -static int alloc_l4_table(struct page_info *page) +static int alloc_l4_table(struct page_info *page, int preemptible) { struct domain *d = page_get_owner(page); unsigned long pfn = page_to_mfn(page); l4_pgentry_t *pl4e = page_to_virt(page); - int i; + unsigned int i; + int rc = 0, partial = page->partial_pte; - for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ ) + for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; + i++, partial = 0 ) { - if ( is_guest_l4_slot(d, i) && - unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) ) - goto fail; + if ( !is_guest_l4_slot(d, i) || + (rc = get_page_from_l4e(pl4e[i], pfn, d, + partial, preemptible)) > 0 ) + continue; + + if ( rc == -EAGAIN ) + { + page->nr_validated_ptes = i; + page->partial_pte = partial ?: 1; + } + else if ( rc == -EINTR ) + { + if ( i ) + { + page->nr_validated_ptes = i; + page->partial_pte = 0; + rc = -EAGAIN; + } + } + else if ( rc < 0 ) + { + MEM_LOG("Failure in alloc_l4_table: entry %d", i); + while ( i-- > 0 ) + if ( is_guest_l4_slot(d, i) ) + put_page_from_l4e(pl4e[i], pfn, 0, 0); + } + if ( rc < 0 ) + return rc; adjust_guest_l4e(pl4e[i], d); } @@ -1191,23 +1256,11 @@ static int alloc_l4_table(struct page_in pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] = l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR); - if ( is_pv_32on64_domain(d) ) - pl4e[l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] = - l4e_from_page(virt_to_page(d->arch.mm_arg_xlat_l3), - __PAGE_HYPERVISOR); - - return 1; - - fail: - MEM_LOG("Failure in alloc_l4_table: entry %d", i); - while ( i-- > 0 ) - if ( is_guest_l4_slot(d, i) ) - put_page_from_l4e(pl4e[i], pfn); - return 0; + return rc > 0 ? 
0 : rc; } #else -#define alloc_l4_table(page) (0) +#define alloc_l4_table(page, preemptible) (-EINVAL) #endif @@ -1216,7 +1269,7 @@ static void free_l1_table(struct page_in struct domain *d = page_get_owner(page); unsigned long pfn = page_to_mfn(page); l1_pgentry_t *pl1e; - int i; + unsigned int i; pl1e = map_domain_page(pfn); @@ -1228,64 +1281,113 @@ static void free_l1_table(struct page_in } -static void free_l2_table(struct page_info *page) +static int free_l2_table(struct page_info *page, int preemptible) { #ifdef CONFIG_COMPAT struct domain *d = page_get_owner(page); #endif unsigned long pfn = page_to_mfn(page); l2_pgentry_t *pl2e; - int i; + unsigned int i = page->nr_validated_ptes - 1; + int err = 0; pl2e = map_domain_page(pfn); - for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) - if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) ) - put_page_from_l2e(pl2e[i], pfn); + ASSERT(page->nr_validated_ptes); + do { + if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) && + put_page_from_l2e(pl2e[i], pfn) == 0 && + preemptible && i && hypercall_preempt_check() ) + { + page->nr_validated_ptes = i; + err = -EAGAIN; + } + } while ( !err && i-- ); unmap_domain_page(pl2e); - page->u.inuse.type_info &= ~PGT_pae_xen_l2; + if ( !err ) + page->u.inuse.type_info &= ~PGT_pae_xen_l2; + + return err; } #if CONFIG_PAGING_LEVELS >= 3 - -static void free_l3_table(struct page_info *page) +static int free_l3_table(struct page_info *page, int preemptible) { struct domain *d = page_get_owner(page); unsigned long pfn = page_to_mfn(page); l3_pgentry_t *pl3e; - int i; + int rc = 0, partial = page->partial_pte; + unsigned int i = page->nr_validated_ptes - !partial; pl3e = map_domain_page(pfn); - for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ ) + do { if ( is_guest_l3_slot(i) ) { - put_page_from_l3e(pl3e[i], pfn); + rc = put_page_from_l3e(pl3e[i], pfn, partial, preemptible); + if ( rc < 0 ) + break; + partial = 0; + if ( rc > 0 ) + continue; unadjust_guest_l3e(pl3e[i], d); } + } while ( i-- ); unmap_domain_page(pl3e); -} + if ( rc == -EAGAIN ) + { + page->nr_validated_ptes = i; + page->partial_pte = partial ?: -1; + } + else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 ) + { + page->nr_validated_ptes = i + 1; + page->partial_pte = 0; + rc = -EAGAIN; + } + return rc > 0 ? 0 : rc; +} +#else +#define free_l3_table(page, preemptible) (-EINVAL) #endif #if CONFIG_PAGING_LEVELS >= 4 - -static void free_l4_table(struct page_info *page) +static int free_l4_table(struct page_info *page, int preemptible) { struct domain *d = page_get_owner(page); unsigned long pfn = page_to_mfn(page); l4_pgentry_t *pl4e = page_to_virt(page); - int i; + int rc = 0, partial = page->partial_pte; + unsigned int i = page->nr_validated_ptes - !partial; - for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ ) + do { if ( is_guest_l4_slot(d, i) ) - put_page_from_l4e(pl4e[i], pfn); -} + rc = put_page_from_l4e(pl4e[i], pfn, partial, preemptible); + if ( rc < 0 ) + break; + partial = 0; + } while ( i-- ); + if ( rc == -EAGAIN ) + { + page->nr_validated_ptes = i; + page->partial_pte = partial ?: -1; + } + else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 ) + { + page->nr_validated_ptes = i + 1; + page->partial_pte = 0; + rc = -EAGAIN; + } + return rc > 0 ? 
0 : rc; +} +#else +#define free_l4_table(page, preemptible) (-EINVAL) #endif @@ -1295,16 +1397,24 @@ static inline int update_intpte(intpte_t intpte_t old, intpte_t new, unsigned long mfn, - struct vcpu *v) + struct vcpu *v, + int preserve_ad) { int rv = 1; #ifndef PTE_UPDATE_WITH_CMPXCHG - rv = paging_write_guest_entry(v, p, new, _mfn(mfn)); -#else + if ( !preserve_ad ) + { + rv = paging_write_guest_entry(v, p, new, _mfn(mfn)); + } + else +#endif { intpte_t t = old; for ( ; ; ) { + if ( preserve_ad ) + new |= old & (_PAGE_ACCESSED | _PAGE_DIRTY); + rv = paging_cmpxchg_guest_entry(v, p, &t, new, _mfn(mfn)); if ( unlikely(rv == 0) ) { @@ -1322,20 +1432,19 @@ static inline int update_intpte(intpte_t old = t; } } -#endif return rv; } /* Macro that wraps the appropriate type-changes around update_intpte(). * Arguments are: type, ptr, old, new, mfn, vcpu */ -#define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v) \ +#define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v,_ad) \ update_intpte(&_t ## e_get_intpte(*(_p)), \ _t ## e_get_intpte(_o), _t ## e_get_intpte(_n), \ - (_m), (_v)) + (_m), (_v), (_ad)) /* Update the L1 entry at pl1e to new value nl1e. */ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e, - unsigned long gl1mfn) + unsigned long gl1mfn, int preserve_ad) { l1_pgentry_t ol1e; struct domain *d = current->domain; @@ -1345,7 +1454,7 @@ static int mod_l1_entry(l1_pgentry_t *pl return 0; if ( unlikely(paging_mode_refcounts(d)) ) - return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current); + return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current, preserve_ad); if ( l1e_get_flags(nl1e) & _PAGE_PRESENT ) { @@ -1363,16 +1472,20 @@ static int mod_l1_entry(l1_pgentry_t *pl return 0; } - adjust_guest_l1e(nl1e, d); - /* Fast path for identical mapping, r/w and presence. */ if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) ) - return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current); + { + adjust_guest_l1e(nl1e, d); + return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current, + preserve_ad); + } if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) ) return 0; - - if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current)) ) + + adjust_guest_l1e(nl1e, d); + if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current, + preserve_ad)) ) { put_page_from_l1e(nl1e, d); return 0; @@ -1380,7 +1493,8 @@ static int mod_l1_entry(l1_pgentry_t *pl } else { - if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current)) ) + if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current, + preserve_ad)) ) return 0; } @@ -1393,7 +1507,8 @@ static int mod_l1_entry(l1_pgentry_t *pl static int mod_l2_entry(l2_pgentry_t *pl2e, l2_pgentry_t nl2e, unsigned long pfn, - unsigned long type) + unsigned long type, + int preserve_ad) { l2_pgentry_t ol2e; struct domain *d = current->domain; @@ -1416,22 +1531,27 @@ static int mod_l2_entry(l2_pgentry_t *pl return 0; } - adjust_guest_l2e(nl2e, d); - /* Fast path for identical mapping and presence. 
*/ if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT)) - return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current); + { + adjust_guest_l2e(nl2e, d); + return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current, preserve_ad); + } - if ( unlikely(!get_page_from_l2e(nl2e, pfn, d)) ) + if ( unlikely(get_page_from_l2e(nl2e, pfn, d) < 0) ) return 0; - if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current)) ) + adjust_guest_l2e(nl2e, d); + + if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current, + preserve_ad)) ) { put_page_from_l2e(nl2e, pfn); return 0; } } - else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current)) ) + else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current, + preserve_ad)) ) { return 0; } @@ -1445,16 +1565,18 @@ static int mod_l2_entry(l2_pgentry_t *pl /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */ static int mod_l3_entry(l3_pgentry_t *pl3e, l3_pgentry_t nl3e, - unsigned long pfn) + unsigned long pfn, + int preserve_ad, + int preemptible) { l3_pgentry_t ol3e; struct domain *d = current->domain; - int okay; + int rc = 0; if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) ) { MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e); - return 0; + return -EINVAL; } #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT) @@ -1463,11 +1585,11 @@ static int mod_l3_entry(l3_pgentry_t *pl * would be a pain to ensure they remain continuously valid throughout. */ if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) ) - return 0; + return -EINVAL; #endif if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) ) - return 0; + return -EFAULT; if ( l3e_get_flags(nl3e) & _PAGE_PRESENT ) { @@ -1475,36 +1597,46 @@ static int mod_l3_entry(l3_pgentry_t *pl { MEM_LOG("Bad L3 flags %x", l3e_get_flags(nl3e) & l3_disallow_mask(d)); - return 0; + return -EINVAL; } - adjust_guest_l3e(nl3e, d); - /* Fast path for identical mapping and presence. */ if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT)) - return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current); + { + adjust_guest_l3e(nl3e, d); + rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current, preserve_ad); + return rc ? 
0 : -EFAULT; + } - if ( unlikely(!get_page_from_l3e(nl3e, pfn, d)) ) - return 0; + rc = get_page_from_l3e(nl3e, pfn, d, 0, preemptible); + if ( unlikely(rc < 0) ) + return rc; + rc = 0; - if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current)) ) + adjust_guest_l3e(nl3e, d); + if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current, + preserve_ad)) ) { - put_page_from_l3e(nl3e, pfn); - return 0; + ol3e = nl3e; + rc = -EFAULT; } } - else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current)) ) + else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current, + preserve_ad)) ) { - return 0; + return -EFAULT; } - okay = create_pae_xen_mappings(d, pl3e); - BUG_ON(!okay); + if ( likely(rc == 0) ) + { + if ( !create_pae_xen_mappings(d, pl3e) ) + BUG(); - pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e); + pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e); + } - put_page_from_l3e(ol3e, pfn); - return 1; + put_page_from_l3e(ol3e, pfn, 0, 0); + return rc; } #endif @@ -1515,18 +1647,21 @@ static int mod_l3_entry(l3_pgentry_t *pl static int mod_l4_entry(struct domain *d, l4_pgentry_t *pl4e, l4_pgentry_t nl4e, - unsigned long pfn) + unsigned long pfn, + int preserve_ad, + int preemptible) { l4_pgentry_t ol4e; + int rc = 0; if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) ) { MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e); - return 0; + return -EINVAL; } if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) ) - return 0; + return -EFAULT; if ( l4e_get_flags(nl4e) & _PAGE_PRESENT ) { @@ -1534,38 +1669,69 @@ static int mod_l4_entry(struct domain *d { MEM_LOG("Bad L4 flags %x", l4e_get_flags(nl4e) & L4_DISALLOW_MASK); - return 0; + return -EINVAL; } - adjust_guest_l4e(nl4e, current->domain); - /* Fast path for identical mapping and presence. */ if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT)) - return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current); + { + adjust_guest_l4e(nl4e, current->domain); + rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current, preserve_ad); + return rc ? 0 : -EFAULT; + } - if ( unlikely(!get_page_from_l4e(nl4e, pfn, current->domain)) ) - return 0; + rc = get_page_from_l4e(nl4e, pfn, current->domain, 0, preemptible); + if ( unlikely(rc < 0) ) + return rc; + rc = 0; - if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current)) ) + adjust_guest_l4e(nl4e, current->domain); + if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current, + preserve_ad)) ) { - put_page_from_l4e(nl4e, pfn); - return 0; + ol4e = nl4e; + rc = -EFAULT; } } - else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current)) ) + else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current, + preserve_ad)) ) { - return 0; + return -EFAULT; } - put_page_from_l4e(ol4e, pfn); - return 1; + put_page_from_l4e(ol4e, pfn, 0, 0); + return rc; } #endif -int alloc_page_type(struct page_info *page, unsigned long type) +/* + * Special version of get_page() to be used exclusively when + * - a page is known to already have a non-zero reference count + * - the page does not need its owner to be checked + * - it will not be called more than once without dropping the thus + * acquired reference again. + * Due to get_page() reserving one reference, this call cannot fail. + */ +static void get_page_light(struct page_info *page) +{ + u32 x, nx, y = page->count_info; + + do { + x = y; + nx = x + 1; + BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */ + BUG_ON(!(nx & PGC_count_mask)); /* Overflow? 
*/ + y = cmpxchg(&page->count_info, x, nx); + } + while ( unlikely(y != x) ); +} + +int alloc_page_type(struct page_info *page, unsigned long type, + int preemptible) { struct domain *owner = page_get_owner(page); + int rc; /* A page table is dirtied when its type count becomes non-zero. */ if ( likely(owner != NULL) ) @@ -1574,31 +1740,66 @@ int alloc_page_type(struct page_info *pa switch ( type & PGT_type_mask ) { case PGT_l1_page_table: - return alloc_l1_table(page); + rc = alloc_l1_table(page); + break; case PGT_l2_page_table: - return alloc_l2_table(page, type); + rc = alloc_l2_table(page, type, preemptible); + break; case PGT_l3_page_table: - return alloc_l3_table(page); + rc = alloc_l3_table(page, preemptible); + break; case PGT_l4_page_table: - return alloc_l4_table(page); + rc = alloc_l4_table(page, preemptible); + break; case PGT_gdt_page: case PGT_ldt_page: - return alloc_segdesc_page(page); + rc = alloc_segdesc_page(page); + break; default: printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n", type, page->u.inuse.type_info, page->count_info); + rc = -EINVAL; BUG(); } - return 0; + /* No need for atomic update of type_info here: noone else updates it. */ + wmb(); + if ( rc == -EAGAIN ) + { + get_page_light(page); + page->u.inuse.type_info |= PGT_partial; + } + else if ( rc == -EINTR ) + { + ASSERT((page->u.inuse.type_info & + (PGT_count_mask|PGT_validated|PGT_partial)) == 1); + page->u.inuse.type_info &= ~PGT_count_mask; + } + else if ( rc ) + { + ASSERT(rc < 0); + MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %" + PRtype_info ": caf=%08x taf=%" PRtype_info, + page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)), + type, page->count_info, page->u.inuse.type_info); + page->u.inuse.type_info = 0; + } + else + { + page->u.inuse.type_info |= PGT_validated; + } + + return rc; } -void free_page_type(struct page_info *page, unsigned long type) +int free_page_type(struct page_info *page, unsigned long type, + int preemptible) { struct domain *owner = page_get_owner(page); unsigned long gmfn; + int rc; if ( likely(owner != NULL) ) { @@ -1618,7 +1819,7 @@ void free_page_type(struct page_info *pa paging_mark_dirty(owner, page_to_mfn(page)); if ( shadow_mode_refcounts(owner) ) - return; + return 0; gmfn = mfn_to_gmfn(owner, page_to_mfn(page)); ASSERT(VALID_M2P(gmfn)); @@ -1626,42 +1827,97 @@ void free_page_type(struct page_info *pa } } + if ( !(type & PGT_partial) ) + { + page->nr_validated_ptes = 1U << PAGETABLE_ORDER; + page->partial_pte = 0; + } + switch ( type & PGT_type_mask ) { case PGT_l1_page_table: free_l1_table(page); + rc = 0; break; - case PGT_l2_page_table: - free_l2_table(page); + rc = free_l2_table(page, preemptible); break; - #if CONFIG_PAGING_LEVELS >= 3 case PGT_l3_page_table: - free_l3_table(page); +#if CONFIG_PAGING_LEVELS == 3 + if ( !(type & PGT_partial) ) + page->nr_validated_ptes = L3_PAGETABLE_ENTRIES; +#endif + rc = free_l3_table(page, preemptible); break; #endif - #if CONFIG_PAGING_LEVELS >= 4 case PGT_l4_page_table: - free_l4_table(page); + rc = free_l4_table(page, preemptible); break; #endif default: - printk("%s: type %lx pfn %lx\n",__FUNCTION__, - type, page_to_mfn(page)); + MEM_LOG("type %lx pfn %lx\n", type, page_to_mfn(page)); + rc = -EINVAL; BUG(); } + + return rc; } -void put_page_type(struct page_info *page) +static int __put_final_page_type( + struct page_info *page, unsigned long type, int preemptible) +{ + int rc = free_page_type(page, type, preemptible); + + /* No need for atomic update of type_info here: noone else 
updates it. */ + if ( rc == 0 ) + { + /* + * Record TLB information for flush later. We do not stamp page tables + * when running in shadow mode: + * 1. Pointless, since it's the shadow pt's which must be tracked. + * 2. Shadow mode reuses this field for shadowed page tables to + * store flags info -- we don't want to conflict with that. + */ + if ( !(shadow_mode_enabled(page_get_owner(page)) && + (page->count_info & PGC_page_table)) ) + page->tlbflush_timestamp = tlbflush_current_time(); + wmb(); + page->u.inuse.type_info--; + } + else if ( rc == -EINTR ) + { + ASSERT((page->u.inuse.type_info & + (PGT_count_mask|PGT_validated|PGT_partial)) == 1); + if ( !(shadow_mode_enabled(page_get_owner(page)) && + (page->count_info & PGC_page_table)) ) + page->tlbflush_timestamp = tlbflush_current_time(); + wmb(); + page->u.inuse.type_info |= PGT_validated; + } + else + { + BUG_ON(rc != -EAGAIN); + wmb(); + get_page_light(page); + page->u.inuse.type_info |= PGT_partial; + } + + return rc; +} + + +static int __put_page_type(struct page_info *page, + int preemptible) { unsigned long nx, x, y = page->u.inuse.type_info; + int rc = 0; - again: - do { + for ( ; ; ) + { x = y; nx = x - 1; @@ -1670,21 +1926,22 @@ void put_page_type(struct page_info *pag if ( unlikely((nx & PGT_count_mask) == 0) ) { if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) && - likely(nx & PGT_validated) ) + likely(nx & (PGT_validated|PGT_partial)) ) { /* * Page-table pages must be unvalidated when count is zero. The * 'free' is safe because the refcnt is non-zero and validated * bit is clear => other ops will spin or fail. */ - if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, - x & ~PGT_validated)) != x) ) - goto again; + nx = x & ~(PGT_validated|PGT_partial); + if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, + x, nx)) != x) ) + continue; /* We cleared the 'valid bit' so we do the clean up. */ - free_page_type(page, x); - /* Carry on, but with the 'valid bit' now clear. */ - x &= ~PGT_validated; - nx &= ~PGT_validated; + rc = __put_final_page_type(page, x, preemptible); + if ( x & PGT_partial ) + put_page(page); + break; } /* @@ -1698,25 +1955,34 @@ void put_page_type(struct page_info *pag (page->count_info & PGC_page_table)) ) page->tlbflush_timestamp = tlbflush_current_time(); } + + if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) ) + break; + + if ( preemptible && hypercall_preempt_check() ) + return -EINTR; } - while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); + + return rc; } -int get_page_type(struct page_info *page, unsigned long type) +static int __get_page_type(struct page_info *page, unsigned long type, + int preemptible) { unsigned long nx, x, y = page->u.inuse.type_info; + int rc = 0; ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2))); - again: - do { + for ( ; ; ) + { x = y; nx = x + 1; if ( unlikely((nx & PGT_count_mask) == 0) ) { MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page)); - return 0; + return -EINVAL; } else if ( unlikely((x & PGT_count_mask) == 0) ) { @@ -1763,50 +2029,85 @@ int get_page_type(struct page_info *page /* Don't log failure if it could be a recursive-mapping attempt. 
*/ if ( ((x & PGT_type_mask) == PGT_l2_page_table) && (type == PGT_l1_page_table) ) - return 0; + return -EINVAL; if ( ((x & PGT_type_mask) == PGT_l3_page_table) && (type == PGT_l2_page_table) ) - return 0; + return -EINVAL; if ( ((x & PGT_type_mask) == PGT_l4_page_table) && (type == PGT_l3_page_table) ) - return 0; + return -EINVAL; MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") " "for mfn %lx (pfn %lx)", x, type, page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page))); - return 0; + return -EINVAL; } else if ( unlikely(!(x & PGT_validated)) ) { - /* Someone else is updating validation of this page. Wait... */ - while ( (y = page->u.inuse.type_info) == x ) - cpu_relax(); - goto again; + if ( !(x & PGT_partial) ) + { + /* Someone else is updating validation of this page. Wait... */ + while ( (y = page->u.inuse.type_info) == x ) + { + if ( preemptible && hypercall_preempt_check() ) + return -EINTR; + cpu_relax(); + } + continue; + } + /* Type ref count was left at 1 when PGT_partial got set. */ + ASSERT((x & PGT_count_mask) == 1); + nx = x & ~PGT_partial; } + + if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) ) + break; + + if ( preemptible && hypercall_preempt_check() ) + return -EINTR; } - while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); if ( unlikely(!(nx & PGT_validated)) ) { - /* Try to validate page type; drop the new reference on failure. */ - if ( unlikely(!alloc_page_type(page, type)) ) + if ( !(x & PGT_partial) ) { - MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %" - PRtype_info ": caf=%08x taf=%" PRtype_info, - page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)), - type, page->count_info, page->u.inuse.type_info); - /* Noone else can get a reference. We hold the only ref. */ - page->u.inuse.type_info = 0; - return 0; + page->nr_validated_ptes = 0; + page->partial_pte = 0; } - - /* Noone else is updating simultaneously. */ - __set_bit(_PGT_validated, &page->u.inuse.type_info); + rc = alloc_page_type(page, type, preemptible); } - return 1; + if ( (x & PGT_partial) && !(nx & PGT_partial) ) + put_page(page); + + return rc; +} + +void put_page_type(struct page_info *page) +{ + int rc = __put_page_type(page, 0); + ASSERT(rc == 0); + (void)rc; } +int get_page_type(struct page_info *page, unsigned long type) +{ + int rc = __get_page_type(page, type, 0); + if ( likely(rc == 0) ) + return 1; + ASSERT(rc == -EINVAL); + return 0; +} + +int put_page_type_preemptible(struct page_info *page) +{ + return __put_page_type(page, 1); +} + +int get_page_type_preemptible(struct page_info *page, unsigned long type) +{ + return __get_page_type(page, type, 1); +} int new_guest_cr3(unsigned long mfn) { @@ -1826,7 +2127,7 @@ int new_guest_cr3(unsigned long mfn) l4e_from_pfn( mfn, (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)), - pagetable_get_pfn(v->arch.guest_table)); + pagetable_get_pfn(v->arch.guest_table), 0, 0) == 0; if ( unlikely(!okay) ) { MEM_LOG("Error while installing new compat baseptr %lx", mfn); @@ -1841,7 +2142,7 @@ int new_guest_cr3(unsigned long mfn) #endif okay = paging_mode_refcounts(d) ? 
get_page_from_pagenr(mfn, d) - : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d); + : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 0); if ( unlikely(!okay) ) { MEM_LOG("Error while installing new baseptr %lx", mfn); @@ -1962,6 +2263,12 @@ static inline cpumask_t vcpumask_to_pcpu cpumask_t pmask = CPU_MASK_NONE; struct vcpu *v; + /* + * Callers copy only a single guest-sized longword from the guest. + * This must be wide enough to reference all VCPUs. Worst case is 32 bits. + */ + BUILD_BUG_ON(MAX_VIRT_CPUS > 32); + while ( vmask != 0 ) { vcpu_id = find_first_set_bit(vmask); @@ -2015,9 +2322,7 @@ int do_mmuext_op( { if ( hypercall_preempt_check() ) { - rc = hypercall_create_continuation( - __HYPERVISOR_mmuext_op, "hihi", - uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); + rc = -EAGAIN; break; } @@ -2060,10 +2365,14 @@ int do_mmuext_op( if ( paging_mode_refcounts(FOREIGNDOM) ) break; - okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM); + rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 0, 1); + okay = !rc; if ( unlikely(!okay) ) { - MEM_LOG("Error while pinning mfn %lx", mfn); + if ( rc == -EINTR ) + rc = -EAGAIN; + else if ( rc != -EAGAIN ) + MEM_LOG("Error while pinning mfn %lx", mfn); break; } @@ -2108,8 +2417,11 @@ int do_mmuext_op( { put_page_and_type(page); put_page(page); - /* A page is dirtied when its pin status is cleared. */ - paging_mark_dirty(d, mfn); + if ( !rc ) + { + /* A page is dirtied when its pin status is cleared. */ + paging_mark_dirty(d, mfn); + } } else { @@ -2133,8 +2445,8 @@ int do_mmuext_op( if ( paging_mode_refcounts(d) ) okay = get_page_from_pagenr(mfn, d); else - okay = get_page_and_type_from_pagenr( - mfn, PGT_root_page_table, d); + okay = !get_page_and_type_from_pagenr( + mfn, PGT_root_page_table, d, 0, 0); if ( unlikely(!okay) ) { MEM_LOG("Error while installing new mfn %lx", mfn); @@ -2253,6 +2565,11 @@ int do_mmuext_op( guest_handle_add_offset(uops, 1); } + if ( rc == -EAGAIN ) + rc = hypercall_create_continuation( + __HYPERVISOR_mmuext_op, "hihi", + uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); + process_deferred_ops(); UNLOCK_BIGLOCK(d); @@ -2316,9 +2633,7 @@ int do_mmu_update( { if ( hypercall_preempt_check() ) { - rc = hypercall_create_continuation( - __HYPERVISOR_mmu_update, "hihi", - ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); + rc = -EAGAIN; break; } @@ -2336,9 +2651,12 @@ int do_mmu_update( { /* * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table. + * MMU_UPDATE_PT_PRESERVE_AD: As above but also preserve (OR) + * current A/D bits. 
*/ case MMU_NORMAL_PT_UPDATE: - + case MMU_PT_UPDATE_PRESERVE_AD: + req.ptr -= cmd; gmfn = req.ptr >> PAGE_SHIFT; mfn = gmfn_to_mfn(d, gmfn); @@ -2375,20 +2693,24 @@ int do_mmu_update( case PGT_l1_page_table: { l1_pgentry_t l1e = l1e_from_intpte(req.val); - okay = mod_l1_entry(va, l1e, mfn); + okay = mod_l1_entry(va, l1e, mfn, + cmd == MMU_PT_UPDATE_PRESERVE_AD); } break; case PGT_l2_page_table: { l2_pgentry_t l2e = l2e_from_intpte(req.val); - okay = mod_l2_entry(va, l2e, mfn, type_info); + okay = mod_l2_entry(va, l2e, mfn, type_info, + cmd == MMU_PT_UPDATE_PRESERVE_AD); } break; #if CONFIG_PAGING_LEVELS >= 3 case PGT_l3_page_table: { l3_pgentry_t l3e = l3e_from_intpte(req.val); - okay = mod_l3_entry(va, l3e, mfn); + rc = mod_l3_entry(va, l3e, mfn, + cmd == MMU_PT_UPDATE_PRESERVE_AD, 1); + okay = !rc; } break; #endif @@ -2396,13 +2718,17 @@ int do_mmu_update( case PGT_l4_page_table: { l4_pgentry_t l4e = l4e_from_intpte(req.val); - okay = mod_l4_entry(d, va, l4e, mfn); + rc = mod_l4_entry(d, va, l4e, mfn, + cmd == MMU_PT_UPDATE_PRESERVE_AD, 1); + okay = !rc; } break; #endif } put_page_type(page); + if ( rc == -EINTR) + rc = -EAGAIN; } break; @@ -2465,6 +2791,11 @@ int do_mmu_update( guest_handle_add_offset(ureqs, 1); } + if ( rc == -EAGAIN ) + rc = hypercall_create_continuation( + __HYPERVISOR_mmu_update, "hihi", + ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); + process_deferred_ops(); UNLOCK_BIGLOCK(d); @@ -2522,7 +2853,7 @@ static int create_grant_pte_mapping( } ol1e = *(l1_pgentry_t *)va; - if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v) ) + if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v, 0) ) { put_page_type(page); rc = GNTST_general_error; @@ -2590,9 +2921,11 @@ static int destroy_grant_pte_mapping( } /* Delete pagetable entry. */ - if ( unlikely(!UPDATE_ENTRY(l1, - (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn, - d->vcpu[0] /* Change if we go to per-vcpu shadows. */)) ) + if ( unlikely(!UPDATE_ENTRY + (l1, + (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn, + d->vcpu[0] /* Change if we go to per-vcpu shadows. */, + 0)) ) { MEM_LOG("Cannot delete PTE entry at %p", va); put_page_type(page); @@ -2628,7 +2961,7 @@ static int create_grant_va_mapping( return GNTST_general_error; } ol1e = *pl1e; - okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v); + okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0); guest_unmap_l1e(v, pl1e); pl1e = NULL; @@ -2666,7 +2999,7 @@ static int destroy_grant_va_mapping( } /* Delete pagetable entry. */ - if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(), gl1mfn, v)) ) + if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(), gl1mfn, v, 0)) ) { MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e); rc = GNTST_general_error; @@ -2768,7 +3101,7 @@ int do_update_va_mapping(unsigned long v pl1e = guest_map_l1e(v, va, &gl1mfn); - if ( unlikely(!pl1e || !mod_l1_entry(pl1e, val, gl1mfn)) ) + if ( unlikely(!pl1e || !mod_l1_entry(pl1e, val, gl1mfn, 0)) ) rc = -EINVAL; if ( pl1e ) @@ -3061,7 +3394,7 @@ long arch_memory_op(int op, XEN_GUEST_HA { if ( is_xen_heap_frame(mfn_to_page(prev_mfn)) ) /* Xen heap frames are simply unhooked from this phys slot. */ - guest_physmap_remove_page(d, xatp.gpfn, prev_mfn); + guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0); else /* Normal domain memory is freed, to avoid leaking memory. */ guest_remove_page(d, xatp.gpfn); @@ -3070,10 +3403,10 @@ long arch_memory_op(int op, XEN_GUEST_HA /* Unmap from old location, if any. 
*/ gpfn = get_gpfn_from_mfn(mfn); if ( gpfn != INVALID_M2P_ENTRY ) - guest_physmap_remove_page(d, gpfn, mfn); + guest_physmap_remove_page(d, gpfn, mfn, 0); /* Map at new location. */ - guest_physmap_add_page(d, xatp.gpfn, mfn); + guest_physmap_add_page(d, xatp.gpfn, mfn, 0); UNLOCK_BIGLOCK(d); @@ -3318,7 +3651,7 @@ static int ptwr_emulated_update( else { ol1e = *pl1e; - if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v) ) + if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v, 0) ) BUG(); } diff -Naurp xen/arch/x86/msi.c xen-redhat/arch/x86/msi.c --- xen/arch/x86/msi.c +++ xen-redhat/arch/x86/msi.c @@ -0,0 +1,858 @@ +/* + * File: msi.c + * Purpose: PCI Message Signaled Interrupt (MSI) + * + * Copyright (C) 2003-2004 Intel + * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com) + */ + +#include <xen/config.h> +#include <xen/lib.h> +#include <xen/init.h> +#include <xen/irq.h> +#include <xen/delay.h> +#include <xen/sched.h> +#include <xen/acpi.h> +#include <xen/errno.h> +#include <xen/pci.h> +#include <xen/pci_regs.h> +#include <xen/keyhandler.h> +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/desc.h> +#include <asm/msi.h> +#include <asm/fixmap.h> +#include <mach_apic.h> +#include <io_ports.h> +#include <public/physdev.h> +#include <xen/iommu.h> + +/* bitmap indicate which fixed map is free */ +DEFINE_SPINLOCK(msix_fixmap_lock); +DECLARE_BITMAP(msix_fixmap_pages, FIX_MSIX_MAX_PAGES); + +static int msix_fixmap_alloc(void) +{ + int i, rc = -ENOMEM; + + spin_lock(&msix_fixmap_lock); + for ( i = 0; i < FIX_MSIX_MAX_PAGES; i++ ) + if ( !test_bit(i, &msix_fixmap_pages) ) + break; + if ( i == FIX_MSIX_MAX_PAGES ) + goto out; + rc = FIX_MSIX_IO_RESERV_BASE + i; + set_bit(i, &msix_fixmap_pages); + + out: + spin_unlock(&msix_fixmap_lock); + return rc; +} + +static void msix_fixmap_free(int idx) +{ + spin_lock(&msix_fixmap_lock); + if ( idx >= FIX_MSIX_IO_RESERV_BASE ) + clear_bit(idx - FIX_MSIX_IO_RESERV_BASE, &msix_fixmap_pages); + spin_unlock(&msix_fixmap_lock); +} + +static int msix_get_fixmap(struct pci_dev *dev, unsigned long table_paddr, + unsigned long entry_paddr) +{ + int nr_page, idx; + + nr_page = (entry_paddr >> PAGE_SHIFT) - (table_paddr >> PAGE_SHIFT); + + if ( nr_page < 0 || nr_page >= MAX_MSIX_TABLE_PAGES ) + return -EINVAL; + + spin_lock(&dev->msix_table_lock); + if ( dev->msix_table_refcnt[nr_page]++ == 0 ) + { + idx = msix_fixmap_alloc(); + if ( idx < 0 ) + { + dev->msix_table_refcnt[nr_page]--; + goto out; + } + set_fixmap_nocache(idx, entry_paddr); + dev->msix_table_idx[nr_page] = idx; + } + else + idx = dev->msix_table_idx[nr_page]; + + out: + spin_unlock(&dev->msix_table_lock); + return idx; +} + +static void msix_put_fixmap(struct pci_dev *dev, int idx) +{ + int i; + unsigned long start; + + spin_lock(&dev->msix_table_lock); + for ( i = 0; i < MAX_MSIX_TABLE_PAGES; i++ ) + { + if ( dev->msix_table_idx[i] == idx ) + break; + } + if ( i == MAX_MSIX_TABLE_PAGES ) + goto out; + + if ( --dev->msix_table_refcnt[i] == 0 ) + { + start = fix_to_virt(idx); + destroy_xen_mappings(start, start + PAGE_SIZE); + msix_fixmap_free(idx); + dev->msix_table_idx[i] = 0; + } + + out: + spin_unlock(&dev->msix_table_lock); +} + +/* + * MSI message composition + */ +static void msi_compose_msg(struct pci_dev *pdev, int vector, + struct msi_msg *msg) +{ + unsigned dest; + cpumask_t tmp; + + tmp = TARGET_CPUS; + if ( vector ) + { + dest = cpu_mask_to_apicid(tmp); + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = + MSI_ADDR_BASE_LO | + ((INT_DEST_MODE == 0) ? 
+ MSI_ADDR_DESTMODE_PHYS: + MSI_ADDR_DESTMODE_LOGIC) | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(dest); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(vector); + } +} + +static void read_msi_msg(struct msi_desc *entry, struct msi_msg *msg) +{ + switch ( entry->msi_attrib.type ) + { + case PCI_CAP_ID_MSI: + { + struct pci_dev *dev = entry->dev; + int pos = entry->msi_attrib.pos; + u16 data; + u8 bus = dev->bus; + u8 slot = PCI_SLOT(dev->devfn); + u8 func = PCI_FUNC(dev->devfn); + + msg->address_lo = pci_conf_read32(bus, slot, func, + msi_lower_address_reg(pos)); + if ( entry->msi_attrib.is_64 ) + { + msg->address_hi = pci_conf_read32(bus, slot, func, + msi_upper_address_reg(pos)); + data = pci_conf_read16(bus, slot, func, msi_data_reg(pos, 1)); + } + else + { + msg->address_hi = 0; + data = pci_conf_read16(bus, slot, func, msi_data_reg(pos, 0)); + } + msg->data = data; + break; + } + case PCI_CAP_ID_MSIX: + { + void __iomem *base; + base = entry->mask_base; + + msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); + msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); + msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET); + break; + } + default: + BUG(); + } + + if ( vtd_enabled ) + msi_msg_read_remap_rte(entry, msg); +} + +static int set_vector_msi(struct msi_desc *entry) +{ + if ( entry->vector >= NR_VECTORS ) + { + dprintk(XENLOG_ERR, "Trying to install msi data for Vector %d\n", + entry->vector); + return -EINVAL; + } + + irq_desc[entry->vector].msi_desc = entry; + return 0; +} + +static int unset_vector_msi(int vector) +{ + ASSERT(spin_is_locked(&irq_desc[vector].lock)); + + if ( vector >= NR_VECTORS ) + { + dprintk(XENLOG_ERR, "Trying to uninstall msi data for Vector %d\n", + vector); + return -EINVAL; + } + + irq_desc[vector].msi_desc = NULL; + + return 0; +} + +static void write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) +{ + if ( iommu_enabled ) + iommu_update_ire_from_msi(entry, msg); + + switch ( entry->msi_attrib.type ) + { + case PCI_CAP_ID_MSI: + { + struct pci_dev *dev = entry->dev; + int pos = entry->msi_attrib.pos; + u8 bus = dev->bus; + u8 slot = PCI_SLOT(dev->devfn); + u8 func = PCI_FUNC(dev->devfn); + + pci_conf_write32(bus, slot, func, msi_lower_address_reg(pos), + msg->address_lo); + if ( entry->msi_attrib.is_64 ) + { + pci_conf_write32(bus, slot, func, msi_upper_address_reg(pos), + msg->address_hi); + pci_conf_write16(bus, slot, func, msi_data_reg(pos, 1), + msg->data); + } + else + pci_conf_write16(bus, slot, func, msi_data_reg(pos, 0), + msg->data); + break; + } + case PCI_CAP_ID_MSIX: + { + void __iomem *base; + base = entry->mask_base; + + writel(msg->address_lo, + base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); + writel(msg->address_hi, + base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); + writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET); + break; + } + default: + BUG(); + } + entry->msg = *msg; +} + +void set_msi_affinity(unsigned int vector, cpumask_t mask) +{ + struct msi_desc *desc = irq_desc[vector].msi_desc; + struct msi_msg msg; + unsigned int dest; + + memset(&msg, 0, sizeof(msg)); + cpus_and(mask, mask, cpu_online_map); + if ( cpus_empty(mask) ) + mask = TARGET_CPUS; + dest = cpu_mask_to_apicid(mask); + + if ( !desc ) + return; + + ASSERT(spin_is_locked(&irq_desc[vector].lock)); + read_msi_msg(desc, &msg); + + 
msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + write_msi_msg(desc, &msg); +} + +static void msi_set_enable(struct pci_dev *dev, int enable) +{ + int pos; + u16 control; + u8 bus = dev->bus; + u8 slot = PCI_SLOT(dev->devfn); + u8 func = PCI_FUNC(dev->devfn); + + pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSI); + if ( pos ) + { + control = pci_conf_read16(bus, slot, func, pos + PCI_MSI_FLAGS); + control &= ~PCI_MSI_FLAGS_ENABLE; + if ( enable ) + control |= PCI_MSI_FLAGS_ENABLE; + pci_conf_write16(bus, slot, func, pos + PCI_MSI_FLAGS, control); + } +} + +static void msix_set_enable(struct pci_dev *dev, int enable) +{ + int pos; + u16 control; + u8 bus = dev->bus; + u8 slot = PCI_SLOT(dev->devfn); + u8 func = PCI_FUNC(dev->devfn); + + pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSIX); + if ( pos ) + { + control = pci_conf_read16(bus, slot, func, pos + PCI_MSIX_FLAGS); + control &= ~PCI_MSIX_FLAGS_ENABLE; + if ( enable ) + control |= PCI_MSIX_FLAGS_ENABLE; + pci_conf_write16(bus, slot, func, pos + PCI_MSIX_FLAGS, control); + } +} + +static void msix_flush_writes(unsigned int vector) +{ + struct msi_desc *entry = irq_desc[vector].msi_desc; + + BUG_ON(!entry || !entry->dev); + switch (entry->msi_attrib.type) { + case PCI_CAP_ID_MSI: + /* nothing to do */ + break; + case PCI_CAP_ID_MSIX: + { + int offset = PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET; + readl(entry->mask_base + offset); + break; + } + default: + BUG(); + break; + } +} + +int msi_maskable_irq(const struct msi_desc *entry) +{ + BUG_ON(!entry); + return entry->msi_attrib.type != PCI_CAP_ID_MSI + || entry->msi_attrib.maskbit; +} + +static void msi_set_mask_bit(unsigned int vector, int flag) +{ + struct msi_desc *entry = irq_desc[vector].msi_desc; + + ASSERT(spin_is_locked(&irq_desc[vector].lock)); + BUG_ON(!entry || !entry->dev); + switch (entry->msi_attrib.type) { + case PCI_CAP_ID_MSI: + if (entry->msi_attrib.maskbit) { + int pos; + u32 mask_bits; + u8 bus = entry->dev->bus; + u8 slot = PCI_SLOT(entry->dev->devfn); + u8 func = PCI_FUNC(entry->dev->devfn); + + pos = (long)entry->mask_base; + mask_bits = pci_conf_read32(bus, slot, func, pos); + mask_bits &= ~(1); + mask_bits |= flag; + pci_conf_write32(bus, slot, func, pos, mask_bits); + } + break; + case PCI_CAP_ID_MSIX: + { + int offset = PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET; + writel(flag, entry->mask_base + offset); + readl(entry->mask_base + offset); + break; + } + default: + BUG(); + break; + } + entry->msi_attrib.masked = !!flag; +} + +void mask_msi_vector(unsigned int vector) +{ + msi_set_mask_bit(vector, 1); + msix_flush_writes(vector); +} + +void unmask_msi_vector(unsigned int vector) +{ + msi_set_mask_bit(vector, 0); + msix_flush_writes(vector); +} + +static struct msi_desc* alloc_msi_entry(void) +{ + struct msi_desc *entry; + + entry = xmalloc(struct msi_desc); + if ( !entry ) + return NULL; + + INIT_LIST_HEAD(&entry->list); + entry->dev = NULL; + entry->remap_index = -1; + + return entry; +} + +int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) +{ + struct msi_msg msg; + + msi_compose_msg(dev, desc->vector, &msg); + set_vector_msi(desc); + write_msi_msg(irq_desc[desc->vector].msi_desc, &msg); + + return 0; +} + +void teardown_msi_vector(int vector) +{ + unset_vector_msi(vector); +} + +int msi_free_vector(struct msi_desc *entry) +{ + if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX ) + { + unsigned long start; + + writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET); + + start = (unsigned 
long)entry->mask_base & ~(PAGE_SIZE - 1); + msix_put_fixmap(entry->dev, virt_to_fix(start)); + } + + /* Free the unused IRTE if intr remap enabled */ + if ( iommu_enabled ) + iommu_update_ire_from_msi(entry, NULL); + + list_del(&entry->list); + free_irq_vector(entry->vector); + xfree(entry); + return 0; +} + +static struct msi_desc *find_msi_entry(struct pci_dev *dev, + int vector, int cap_id) +{ + struct msi_desc *entry; + + list_for_each_entry( entry, &dev->msi_list, list ) + { + if ( entry->msi_attrib.type == cap_id && + (vector == -1 || entry->vector == vector) ) + return entry; + } + + return NULL; +} + +/** + * msi_capability_init - configure device's MSI capability structure + * @dev: pointer to the pci_dev data structure of MSI device function + * + * Setup the MSI capability structure of device function with a single + * MSI irq, regardless of device function is capable of handling + * multiple messages. A return of zero indicates the successful setup + * of an entry zero with the new MSI irq or non-zero for otherwise. + **/ +static int msi_capability_init(struct pci_dev *dev, + int vector, + struct msi_desc **desc) +{ + struct msi_desc *entry; + int pos; + u16 control; + u8 bus = dev->bus; + u8 slot = PCI_SLOT(dev->devfn); + u8 func = PCI_FUNC(dev->devfn); + + ASSERT(spin_is_locked(&pcidevs_lock)); + pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSI); + control = pci_conf_read16(bus, slot, func, msi_control_reg(pos)); + /* MSI Entry Initialization */ + msi_set_enable(dev, 0); /* Ensure msi is disabled as I set it up */ + + entry = alloc_msi_entry(); + if ( !entry ) + return -ENOMEM; + + entry->msi_attrib.type = PCI_CAP_ID_MSI; + entry->msi_attrib.is_64 = is_64bit_address(control); + entry->msi_attrib.entry_nr = 0; + entry->msi_attrib.maskbit = is_mask_bit_support(control); + entry->msi_attrib.masked = 1; + entry->msi_attrib.pos = pos; + entry->vector = vector; + if ( is_mask_bit_support(control) ) + entry->mask_base = (void __iomem *)(long)msi_mask_bits_reg(pos, + is_64bit_address(control)); + entry->dev = dev; + if ( entry->msi_attrib.maskbit ) + { + unsigned int maskbits, temp; + /* All MSIs are unmasked by default, Mask them all */ + maskbits = pci_conf_read32(bus, slot, func, + msi_mask_bits_reg(pos, is_64bit_address(control))); + temp = (1 << multi_msi_capable(control)); + temp = ((temp - 1) & ~temp); + maskbits |= temp; + pci_conf_write32(bus, slot, func, + msi_mask_bits_reg(pos, is_64bit_address(control)), + maskbits); + } + list_add_tail(&entry->list, &dev->msi_list); + + *desc = entry; + /* Restore the original MSI enabled bits */ + pci_conf_write16(bus, slot, func, msi_control_reg(pos), control); + + return 0; +} + +/** + * msix_capability_init - configure device's MSI-X capability + * @dev: pointer to the pci_dev data structure of MSI-X device function + * @entries: pointer to an array of struct msix_entry entries + * @nvec: number of @entries + * + * Setup the MSI-X capability structure of device function with a + * single MSI-X irq. A return of zero indicates the successful setup of + * requested MSI-X entries with allocated irqs or non-zero for otherwise. 
+ **/ +static int msix_capability_init(struct pci_dev *dev, + struct msi_info *msi, + struct msi_desc **desc) +{ + struct msi_desc *entry; + int pos; + u16 control; + unsigned long table_paddr, entry_paddr; + u32 table_offset, entry_offset; + u8 bir; + void __iomem *base; + int idx; + u8 bus = dev->bus; + u8 slot = PCI_SLOT(dev->devfn); + u8 func = PCI_FUNC(dev->devfn); + + ASSERT(spin_is_locked(&pcidevs_lock)); + ASSERT(desc); + + pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSIX); + control = pci_conf_read16(bus, slot, func, msix_control_reg(pos)); + msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */ + + /* MSI-X Table Initialization */ + entry = alloc_msi_entry(); + if ( !entry ) + return -ENOMEM; + + /* Request & Map MSI-X table region */ + table_offset = pci_conf_read32(bus, slot, func, msix_table_offset_reg(pos)); + bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK); + table_offset &= ~PCI_MSIX_FLAGS_BIRMASK; + entry_offset = msi->entry_nr * PCI_MSIX_ENTRY_SIZE; + + table_paddr = msi->table_base + table_offset; + entry_paddr = table_paddr + entry_offset; + idx = msix_get_fixmap(dev, table_paddr, entry_paddr); + if ( idx < 0 ) + { + xfree(entry); + return idx; + } + base = (void *)(fix_to_virt(idx) + (entry_paddr & ((1UL << PAGE_SHIFT) - 1))); + + entry->msi_attrib.type = PCI_CAP_ID_MSIX; + entry->msi_attrib.is_64 = 1; + entry->msi_attrib.entry_nr = msi->entry_nr; + entry->msi_attrib.maskbit = 1; + entry->msi_attrib.masked = 1; + entry->msi_attrib.pos = pos; + entry->vector = msi->vector; + entry->dev = dev; + entry->mask_base = base; + + list_add_tail(&entry->list, &dev->msi_list); + + /* Mask interrupt here */ + writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET); + + *desc = entry; + /* Restore MSI-X enabled bits */ + pci_conf_write16(bus, slot, func, msix_control_reg(pos), control); + + return 0; +} + +/** + * pci_enable_msi - configure device's MSI capability structure + * @dev: pointer to the pci_dev data structure of MSI device function + * + * Setup the MSI capability structure of device function with + * a single MSI irq upon its software driver call to request for + * MSI mode enabled on its hardware device function. A return of zero + * indicates the successful setup of an entry zero with the new MSI + * irq or non-zero for otherwise. 
+ **/ +static int __pci_enable_msi(struct msi_info *msi, struct msi_desc **desc) +{ + int status; + struct pci_dev *pdev; + + ASSERT(spin_is_locked(&pcidevs_lock)); + pdev = pci_get_pdev(msi->bus, msi->devfn); + if ( !pdev ) + return -ENODEV; + + if ( find_msi_entry(pdev, msi->vector, PCI_CAP_ID_MSI) ) + { + dprintk(XENLOG_WARNING, "vector %d has already mapped to MSI on " + "device %02x:%02x.%01x.\n", msi->vector, msi->bus, + PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn)); + return 0; + } + + status = msi_capability_init(pdev, msi->vector, desc); + return status; +} + +static void __pci_disable_msi(struct msi_desc *entry) +{ + struct pci_dev *dev; + int pos; + u16 control; + u8 bus, slot, func; + + dev = entry->dev; + bus = dev->bus; + slot = PCI_SLOT(dev->devfn); + func = PCI_FUNC(dev->devfn); + + pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSI); + control = pci_conf_read16(bus, slot, func, msi_control_reg(pos)); + msi_set_enable(dev, 0); + + BUG_ON(list_empty(&dev->msi_list)); + +} + +/** + * pci_enable_msix - configure device's MSI-X capability structure + * @dev: pointer to the pci_dev data structure of MSI-X device function + * @entries: pointer to an array of MSI-X entries + * @nvec: number of MSI-X irqs requested for allocation by device driver + * + * Setup the MSI-X capability structure of device function with the number + * of requested irqs upon its software driver call to request for + * MSI-X mode enabled on its hardware device function. A return of zero + * indicates the successful configuration of MSI-X capability structure + * with new allocated MSI-X irqs. A return of < 0 indicates a failure. + * Or a return of > 0 indicates that driver request is exceeding the number + * of irqs available. Driver should use the returned value to re-send + * its request. + **/ +static int __pci_enable_msix(struct msi_info *msi, struct msi_desc **desc) +{ + int status, pos, nr_entries; + struct pci_dev *pdev; + u16 control; + u8 slot = PCI_SLOT(msi->devfn); + u8 func = PCI_FUNC(msi->devfn); + + ASSERT(spin_is_locked(&pcidevs_lock)); + pdev = pci_get_pdev(msi->bus, msi->devfn); + if ( !pdev ) + return -ENODEV; + + pos = pci_find_cap_offset(msi->bus, slot, func, PCI_CAP_ID_MSIX); + control = pci_conf_read16(msi->bus, slot, func, msi_control_reg(pos)); + nr_entries = multi_msix_capable(control); + if (msi->entry_nr >= nr_entries) + return -EINVAL; + + if ( find_msi_entry(pdev, msi->vector, PCI_CAP_ID_MSIX) ) + { + dprintk(XENLOG_WARNING, "vector %d has already mapped to MSIX on " + "device %02x:%02x.%01x.\n", msi->vector, msi->bus, + PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn)); + return 0; + } + + status = msix_capability_init(pdev, msi, desc); + return status; +} + +static void __pci_disable_msix(struct msi_desc *entry) +{ + struct pci_dev *dev; + int pos; + u16 control; + u8 bus, slot, func; + + dev = entry->dev; + bus = dev->bus; + slot = PCI_SLOT(dev->devfn); + func = PCI_FUNC(dev->devfn); + + pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSIX); + control = pci_conf_read16(bus, slot, func, msix_control_reg(pos)); + msi_set_enable(dev, 0); + + BUG_ON(list_empty(&dev->msi_list)); + + writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET); + + pci_conf_write16(bus, slot, func, msix_control_reg(pos), control); +} + +/* + * Notice: only construct the msi_desc + * no change to irq_desc here, and the interrupt is masked + */ +int pci_enable_msi(struct msi_info *msi, struct msi_desc **desc) +{ + ASSERT(spin_is_locked(&pcidevs_lock)); + + return msi->table_base ? 
__pci_enable_msix(msi, desc) : + __pci_enable_msi(msi, desc); +} + +/* + * Device only, no irq_desc + */ +void pci_disable_msi(struct msi_desc *msi_desc) +{ + if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSI ) + __pci_disable_msi(msi_desc); + else if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSIX ) + __pci_disable_msix(msi_desc); +} + +static void msi_free_vectors(struct pci_dev* dev) +{ + struct msi_desc *entry, *tmp; + irq_desc_t *desc; + unsigned long flags, vector; + + list_for_each_entry_safe( entry, tmp, &dev->msi_list, list ) + { + vector = entry->vector; + desc = &irq_desc[vector]; + pci_disable_msi(entry); + + spin_lock_irqsave(&desc->lock, flags); + + teardown_msi_vector(vector); + + if ( desc->handler == &pci_msi_type ) + { + /* MSI is not shared, so should be released already */ + BUG_ON(desc->status & IRQ_GUEST); + desc->handler = &no_irq_type; + } + + spin_unlock_irqrestore(&desc->lock, flags); + msi_free_vector(entry); + } +} + +void pci_cleanup_msi(struct pci_dev *pdev) +{ + /* Disable MSI and/or MSI-X */ + msi_set_enable(pdev, 0); + msix_set_enable(pdev, 0); + msi_free_vectors(pdev); +} + +int pci_restore_msi_state(struct pci_dev *pdev) +{ + unsigned long flags; + int vector; + struct msi_desc *entry, *tmp; + irq_desc_t *desc; + + ASSERT(spin_is_locked(&pcidevs_lock)); + + if (!pdev) + return -EINVAL; + + list_for_each_entry_safe( entry, tmp, &pdev->msi_list, list ) + { + vector = entry->vector; + desc = &irq_desc[vector]; + + spin_lock_irqsave(&desc->lock, flags); + + ASSERT(desc->msi_desc == entry); + + if (desc->msi_desc != entry) + { + dprintk(XENLOG_ERR, "Restore MSI for dev %x:%x not set before?\n", + pdev->bus, pdev->devfn); + spin_unlock_irqrestore(&desc->lock, flags); + return -EINVAL; + } + + msi_set_enable(pdev, 0); + write_msi_msg(entry, &entry->msg); + + msi_set_enable(pdev, 1); + msi_set_mask_bit(vector, entry->msi_attrib.masked); + spin_unlock_irqrestore(&desc->lock, flags); + } + + return 0; +} + +unsigned int pci_msix_get_table_len(struct pci_dev *pdev) +{ + int pos; + u16 control; + u8 bus, slot, func; + unsigned int len; + + bus = pdev->bus; + slot = PCI_SLOT(pdev->devfn); + func = PCI_FUNC(pdev->devfn); + + pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSIX); + if ( !pos ) + return 0; + + control = pci_conf_read16(bus, slot, func, msix_control_reg(pos)); + len = msix_table_size(control) * PCI_MSIX_ENTRY_SIZE; + + return len; +} diff -Naurp xen/arch/x86/nmi.c xen-redhat/arch/x86/nmi.c --- xen/arch/x86/nmi.c +++ xen-redhat/arch/x86/nmi.c @@ -73,7 +73,7 @@ int nmi_active; #define P6_EVNTSEL_OS (1 << 17) #define P6_EVNTSEL_USR (1 << 16) #define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 -#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED +#define CORE_EVENT_CPU_CLOCKS_NOT_HALTED 0x3c #define P4_ESCR_EVENT_SELECT(N) ((N)<<25) #define P4_CCCR_OVF_PMI0 (1<<26) @@ -248,7 +248,7 @@ static void __pminit setup_k7_watchdog(v wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); } -static void __pminit setup_p6_watchdog(void) +static void __pminit setup_p6_watchdog(unsigned counter) { unsigned int evntsel; @@ -260,7 +260,7 @@ static void __pminit setup_p6_watchdog(v evntsel = P6_EVNTSEL_INT | P6_EVNTSEL_OS | P6_EVNTSEL_USR - | P6_NMI_EVENT; + | counter; wrmsr(MSR_P6_EVNTSEL0, evntsel, 0); write_watchdog_counter("P6_PERFCTR0"); @@ -314,14 +314,21 @@ void __pminit setup_apic_nmi_watchdog(vo switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15) + switch (boot_cpu_data.x86) { + case 6: + case 0xf ... 
0x17: + setup_k7_watchdog(); + break; + default: return; - setup_k7_watchdog(); + } break; case X86_VENDOR_INTEL: switch (boot_cpu_data.x86) { case 6: - setup_p6_watchdog(); + setup_p6_watchdog((boot_cpu_data.x86_model < 14) + ? P6_EVENT_CPU_CLOCKS_NOT_HALTED + : CORE_EVENT_CPU_CLOCKS_NOT_HALTED); break; case 15: if (!setup_p4_watchdog()) diff -Naurp xen/arch/x86/numa.c xen-redhat/arch/x86/numa.c --- xen/arch/x86/numa.c +++ xen-redhat/arch/x86/numa.c @@ -57,7 +57,7 @@ populate_memnodemap(const struct node *n { int i; int res = -1; - unsigned long addr, end; + paddr_t addr, end; if (shift >= 64) return -1; @@ -286,13 +286,13 @@ static void dump_numa(unsigned char key) (u32)(now>>32), (u32)now); for_each_online_node(i) { - unsigned long pa = (NODE_DATA(i)->node_start_pfn + 1)<< PAGE_SHIFT; + paddr_t pa = (NODE_DATA(i)->node_start_pfn + 1)<< PAGE_SHIFT; printk("idx%d -> NODE%d start->%lu size->%lu\n", i, NODE_DATA(i)->node_id, NODE_DATA(i)->node_start_pfn, NODE_DATA(i)->node_spanned_pages); /* sanity check phys_to_nid() */ - printk("phys_to_nid(%lx) -> %d should be %d\n", pa, phys_to_nid(pa), + printk("phys_to_nid(%"PRIpaddr") -> %d should be %d\n", pa, phys_to_nid(pa), NODE_DATA(i)->node_id); } for_each_online_cpu(i) diff -Naurp xen/arch/x86/oprofile/nmi_int.c xen-redhat/arch/x86/oprofile/nmi_int.c --- xen/arch/x86/oprofile/nmi_int.c +++ xen-redhat/arch/x86/oprofile/nmi_int.c @@ -291,37 +291,77 @@ static int __init p4_init(char ** cpu_ty } +static int force_arch_perfmon; +static int force_cpu_type(const char *str) +{ + if (!strcmp(str, "arch_perfmon")) { + force_arch_perfmon = 1; + printk(KERN_INFO "oprofile: forcing architectural perfmon\n"); + } + + return 0; +} +custom_param("cpu_type", force_cpu_type); + static int __init ppro_init(char ** cpu_type) { __u8 cpu_model = current_cpu_data.x86_model; - if (cpu_model > 15) { - printk("xenoprof: Initialization failed. " - "Intel processor model %d for P6 class family is not " - "supported\n", cpu_model); + if (force_arch_perfmon && cpu_has_arch_perfmon) return 0; - } - else if (cpu_model == 15) - *cpu_type = "i386/core_2"; - else if (cpu_model == 14) - *cpu_type = "i386/core"; - else if (cpu_model == 9) - *cpu_type = "i386/p6_mobile"; - else if (cpu_model > 5) - *cpu_type = "i386/piii"; - else if (cpu_model > 2) - *cpu_type = "i386/pii"; - else + + switch (cpu_model) { + case 0 ... 2: *cpu_type = "i386/ppro"; + break; + case 3 ... 5: + *cpu_type = "i386/pii"; + break; + case 6 ... 8: + case 10 ... 11: + *cpu_type = "i386/piii"; + break; + case 9: + case 13: + *cpu_type = "i386/p6_mobile"; + break; + case 14: + *cpu_type = "i386/core"; + break; + case 15: case 23: case 29: + *cpu_type = "i386/core_2"; + break; + case 26: + arch_perfmon_setup_counters(); + *cpu_type = "i386/core_i7"; + break; + case 28: + *cpu_type = "i386/atom"; + break; + default: + /* Unknown */ + return 0; + } model = &op_ppro_spec; return 1; } +static int __init arch_perfmon_init(char **cpu_type) +{ + if (!cpu_has_arch_perfmon) + return 0; + *cpu_type = "i386/arch_perfmon"; + model = &op_arch_perfmon_spec; + arch_perfmon_setup_counters(); + return 1; +} + static int __init nmi_init(void) { __u8 vendor = current_cpu_data.x86_vendor; __u8 family = current_cpu_data.x86; + __u8 _model = current_cpu_data.x86_model; if (!cpu_has_apic) { printk("xenoprof: Initialization failed. No APIC\n"); @@ -348,6 +388,26 @@ static int __init nmi_init(void) give user space an consistent name. 
*/ cpu_type = "x86-64/hammer"; break; + case 0x10: + model = &op_athlon_spec; + cpu_type = "x86-64/family10"; + break; + case 0x11: + model = &op_athlon_spec; + cpu_type = "x86-64/family11"; + break; + case 0x12: + model = &op_athlon_spec; + cpu_type = "x86-64/family12"; + break; + case 0x14: + model = &op_athlon_spec; + cpu_type = "x86-64/family14"; + break; + case 0x15: + model = &op_athlon_spec; + cpu_type = "x86-64/family15"; + break; } break; @@ -355,21 +415,22 @@ static int __init nmi_init(void) switch (family) { /* Pentium IV */ case 0xf: - if (!p4_init(&cpu_type)) - return -ENODEV; + p4_init(&cpu_type); break; /* A P6-class processor */ case 6: - if (!ppro_init(&cpu_type)) - return -ENODEV; + ppro_init(&cpu_type); break; default: + break; + } + if (!cpu_type && !arch_perfmon_init(&cpu_type)) { printk("xenoprof: Initialization failed. " - "Intel processor family %d is not " - "supported\n", family); - return -ENODEV; + "Intel processor family %d model %d" + "is not supported\n", family, _model); + return -ENODEV; } break; diff -Naurp xen/arch/x86/oprofile/op_model_athlon.c xen-redhat/arch/x86/oprofile/op_model_athlon.c --- xen/arch/x86/oprofile/op_model_athlon.c +++ xen-redhat/arch/x86/oprofile/op_model_athlon.c @@ -34,12 +34,15 @@ #define CTRL_WRITE(l,h,msrs,c) do {wrmsr(msrs->controls[(c)].addr, (l), (h));} while (0) #define CTRL_SET_ACTIVE(n) (n |= (1<<22)) #define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) -#define CTRL_CLEAR(x) (x &= (1<<21)) +#define CTRL_CLEAR(lo, hi) (lo &= (1<<21), hi = 0) #define CTRL_SET_ENABLE(val) (val |= 1<<20) #define CTRL_SET_USR(val,u) (val |= ((u & 1) << 16)) #define CTRL_SET_KERN(val,k) (val |= ((k & 1) << 17)) -#define CTRL_SET_UM(val, m) (val |= (m << 8)) -#define CTRL_SET_EVENT(val, e) (val |= e) +#define CTRL_SET_UM(val, m) (val |= ((m & 0xff) << 8)) +#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff)) +#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf)) +#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) +#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) static unsigned long reset_value[NUM_COUNTERS]; @@ -72,7 +75,7 @@ static void athlon_setup_ctrs(struct op_ /* clear all counters */ for (i = 0 ; i < NUM_CONTROLS; ++i) { CTRL_READ(low, high, msrs, i); - CTRL_CLEAR(low); + CTRL_CLEAR(low, high); CTRL_WRITE(low, high, msrs, i); } @@ -89,12 +92,15 @@ static void athlon_setup_ctrs(struct op_ CTR_WRITE(counter_config[i].count, msrs, i); CTRL_READ(low, high, msrs, i); - CTRL_CLEAR(low); + CTRL_CLEAR(low, high); CTRL_SET_ENABLE(low); CTRL_SET_USR(low, counter_config[i].user); CTRL_SET_KERN(low, counter_config[i].kernel); CTRL_SET_UM(low, counter_config[i].unit_mask); - CTRL_SET_EVENT(low, counter_config[i].event); + CTRL_SET_EVENT_LOW(low, counter_config[i].event); + CTRL_SET_EVENT_HIGH(high, counter_config[i].event); + CTRL_SET_HOST_ONLY(high, 0); + CTRL_SET_GUEST_ONLY(high, 0); CTRL_WRITE(low, high, msrs, i); } else { reset_value[i] = 0; diff -Naurp xen/arch/x86/oprofile/op_model_ppro.c xen-redhat/arch/x86/oprofile/op_model_ppro.c --- xen/arch/x86/oprofile/op_model_ppro.c +++ xen-redhat/arch/x86/oprofile/op_model_ppro.c @@ -22,12 +22,24 @@ #include "op_x86_model.h" #include "op_counter.h" -#define NUM_COUNTERS 2 -#define NUM_CONTROLS 2 +/* + * Intel "Architectural Performance Monitoring" CPUID + * detection/enumeration details: + */ +union cpuid10_eax { + struct { + unsigned int version_id:8; + unsigned int num_counters:8; + unsigned int bit_width:8; + unsigned int mask_length:8; + } split; + unsigned int full; +}; + 
+static int num_counters = 2; +static int counter_width = 32; -#define CTR_READ(l,h,msrs,c) do {rdmsr(msrs->counters[(c)].addr, (l), (h));} while (0) -#define CTR_WRITE(l,msrs,c) do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), -1);} while (0) -#define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) +#define CTR_OVERFLOWED(n) (!((n) & (1ULL<<(counter_width-1)))) #define CTRL_READ(l,h,msrs,c) do {rdmsr((msrs->controls[(c)].addr), (l), (h));} while (0) #define CTRL_WRITE(l,h,msrs,c) do {wrmsr((msrs->controls[(c)].addr), (l), (h));} while (0) @@ -40,15 +52,16 @@ #define CTRL_SET_UM(val, m) (val |= (m << 8)) #define CTRL_SET_EVENT(val, e) (val |= e) -static unsigned long reset_value[NUM_COUNTERS]; +static unsigned long reset_value[OP_MAX_COUNTER]; static void ppro_fill_in_addresses(struct op_msrs * const msrs) { - msrs->counters[0].addr = MSR_P6_PERFCTR0; - msrs->counters[1].addr = MSR_P6_PERFCTR1; - - msrs->controls[0].addr = MSR_P6_EVNTSEL0; - msrs->controls[1].addr = MSR_P6_EVNTSEL1; + int i; + + for (i = 0; i < num_counters; i++) + msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; + for (i = 0; i < num_counters; i++) + msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; } @@ -56,25 +69,41 @@ static void ppro_setup_ctrs(struct op_ms { unsigned int low, high; int i; + + if (cpu_has_arch_perfmon) { + union cpuid10_eax eax; + eax.full = cpuid_eax(0xa); + + /* + * For Core2 (family 6, model 15), don't reset the + * counter width: + */ + if (!(eax.split.version_id == 0 && + current_cpu_data.x86 == 6 && + current_cpu_data.x86_model == 15)) { + + if (counter_width < eax.split.bit_width) + counter_width = eax.split.bit_width; + } + } /* clear all counters */ - for (i = 0 ; i < NUM_CONTROLS; ++i) { + for (i = 0 ; i < num_counters; ++i) { CTRL_READ(low, high, msrs, i); CTRL_CLEAR(low); CTRL_WRITE(low, high, msrs, i); } /* avoid a false detection of ctr overflows in NMI handler */ - for (i = 0; i < NUM_COUNTERS; ++i) { - CTR_WRITE(1, msrs, i); - } + for (i = 0; i < num_counters; ++i) + wrmsrl(msrs->counters[i].addr, -1LL); /* enable active counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if (counter_config[i].enabled) { reset_value[i] = counter_config[i].count; - CTR_WRITE(counter_config[i].count, msrs, i); + wrmsrl(msrs->counters[i].addr, -reset_value[i]); CTRL_READ(low, high, msrs, i); CTRL_CLEAR(low); @@ -84,6 +113,8 @@ static void ppro_setup_ctrs(struct op_ms CTRL_SET_UM(low, counter_config[i].unit_mask); CTRL_SET_EVENT(low, counter_config[i].event); CTRL_WRITE(low, high, msrs, i); + } else { + reset_value[i] = 0; } } } @@ -97,17 +128,19 @@ static int ppro_check_ctrs(unsigned int struct op_msrs const * const msrs, struct cpu_user_regs * const regs) { - unsigned int low, high; + u64 val; int i; int ovf = 0; unsigned long eip = regs->eip; int mode = xenoprofile_get_mode(current, regs); - for (i = 0 ; i < NUM_COUNTERS; ++i) { - CTR_READ(low, high, msrs, i); - if (CTR_OVERFLOWED(low)) { + for (i = 0 ; i < num_counters; ++i) { + if (!reset_value[i]) + continue; + rdmsrl(msrs->counters[i].addr, val); + if (CTR_OVERFLOWED(val)) { xenoprof_log_event(current, regs, eip, mode, i); - CTR_WRITE(reset_value[i], msrs, i); + wrmsrl(msrs->counters[i].addr, -reset_value[i]); ovf = 1; } } @@ -123,27 +156,78 @@ static int ppro_check_ctrs(unsigned int static void ppro_start(struct op_msrs const * const msrs) { unsigned int low,high; - CTRL_READ(low, high, msrs, 0); - CTRL_SET_ACTIVE(low); - CTRL_WRITE(low, high, msrs, 0); + int i; + + for (i = 0; i < num_counters; ++i) { + if (reset_value[i]) { + 
CTRL_READ(low, high, msrs, i); + CTRL_SET_ACTIVE(low); + CTRL_WRITE(low, high, msrs, i); + } + } } static void ppro_stop(struct op_msrs const * const msrs) { unsigned int low,high; - CTRL_READ(low, high, msrs, 0); - CTRL_SET_INACTIVE(low); - CTRL_WRITE(low, high, msrs, 0); + int i; + + for (i = 0; i < num_counters; ++i) { + if (!reset_value[i]) + continue; + CTRL_READ(low, high, msrs, i); + CTRL_SET_INACTIVE(low); + CTRL_WRITE(low, high, msrs, i); + } } -struct op_x86_model_spec const op_ppro_spec = { - .num_counters = NUM_COUNTERS, - .num_controls = NUM_CONTROLS, +/* + * Architectural performance monitoring. + * + * Newer Intel CPUs (Core1+) have support for architectural + * events described in CPUID 0xA. See the IA32 SDM Vol3b.18 for details. + * The advantage of this is that it can be done without knowing about + * the specific CPU. + */ +void arch_perfmon_setup_counters(void) +{ + union cpuid10_eax eax; + + eax.full = cpuid_eax(0xa); + + /* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */ + if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 && + current_cpu_data.x86_model == 15) { + eax.split.version_id = 2; + eax.split.num_counters = 2; + eax.split.bit_width = 40; + } + + num_counters = min_t(u8, eax.split.num_counters, OP_MAX_COUNTER); + + op_arch_perfmon_spec.num_counters = num_counters; + op_arch_perfmon_spec.num_controls = num_counters; + op_ppro_spec.num_counters = num_counters; + op_ppro_spec.num_controls = num_counters; +} + +struct op_x86_model_spec op_ppro_spec = { + .num_counters = 2, + .num_controls = 2, .fill_in_addresses = &ppro_fill_in_addresses, .setup_ctrs = &ppro_setup_ctrs, .check_ctrs = &ppro_check_ctrs, .start = &ppro_start, .stop = &ppro_stop }; + +struct op_x86_model_spec op_arch_perfmon_spec = { + /* num_counters/num_controls filled in at runtime */ + .fill_in_addresses = &ppro_fill_in_addresses, + .setup_ctrs = &ppro_setup_ctrs, + .check_ctrs = &ppro_check_ctrs, + .start = &ppro_start, + .stop = &ppro_stop, +}; diff -Naurp xen/arch/x86/oprofile/op_x86_model.h xen-redhat/arch/x86/oprofile/op_x86_model.h --- xen/arch/x86/oprofile/op_x86_model.h +++ xen-redhat/arch/x86/oprofile/op_x86_model.h @@ -32,8 +32,8 @@ struct pt_regs; * various x86 CPU model's perfctr support. */ struct op_x86_model_spec { - unsigned int const num_counters; - unsigned int const num_controls; + unsigned int num_counters; + unsigned int num_controls; void (*fill_in_addresses)(struct op_msrs * const msrs); void (*setup_ctrs)(struct op_msrs const * const msrs); int (*check_ctrs)(unsigned int const cpu, @@ -43,9 +43,11 @@ struct op_x86_model_spec { void (*stop)(struct op_msrs const * const msrs); }; -extern struct op_x86_model_spec const op_ppro_spec; +extern struct op_x86_model_spec op_ppro_spec; +extern struct op_x86_model_spec op_arch_perfmon_spec; extern struct op_x86_model_spec const op_p4_spec; extern struct op_x86_model_spec const op_p4_ht2_spec; extern struct op_x86_model_spec const op_athlon_spec; +void arch_perfmon_setup_counters(void); #endif /* OP_X86_MODEL_H */ diff -Naurp xen/arch/x86/pci.c xen-redhat/arch/x86/pci.c --- xen/arch/x86/pci.c +++ xen-redhat/arch/x86/pci.c @@ -0,0 +1,117 @@ +/****************************************************************************** + * pci.c + * + * Architecture-dependent PCI access functions. 
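The new xen/arch/x86/pci.c that follows talks to PCI configuration mechanism #1 through ports 0xcf8/0xcfc. For reference, a standalone sketch of the CONFIG_ADDRESS encoding its PCI_CONF_ADDRESS macro uses (hypothetical helper name, no actual port I/O performed):

/* Sketch only: build the mechanism #1 CONFIG_ADDRESS dword.  bus is 8 bits,
 * device 5 bits, function 3 bits; the register offset is dword-aligned and
 * bit 31 enables the access. */
#include <stdio.h>
#include <inttypes.h>

static uint32_t pci_cf8(unsigned int bus, unsigned int dev,
                        unsigned int func, unsigned int reg)
{
    return 0x80000000u | (bus << 16) | (dev << 11) | (func << 8) | (reg & ~3u);
}

int main(void)
{
    /* Example: bus 0, device 0x1f, function 3, config register 0x40. */
    printf("CONFIG_ADDRESS = 0x%08" PRIx32 "\n", pci_cf8(0, 0x1f, 3, 0x40));
    return 0;
}

A real access then writes that value to 0xcf8 and reads or writes the data window at 0xcfc plus the low two bits of the offset, which is the sequence pci_conf_read()/pci_conf_write() below serialise under pci_config_lock.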
+ */ + +#include <xen/spinlock.h> +#include <asm/io.h> + +#define PCI_CONF_ADDRESS(bus, dev, func, reg) \ + (0x80000000 | (bus << 16) | (dev << 11) | (func << 8) | (reg & ~3)) + +static DEFINE_SPINLOCK(pci_config_lock); + +uint32_t pci_conf_read(uint32_t cf8, uint8_t offset, uint8_t bytes) +{ + unsigned long flags; + uint32_t value; + + BUG_ON((offset + bytes) > 4); + + spin_lock_irqsave(&pci_config_lock, flags); + + outl(cf8, 0xcf8); + + switch ( bytes ) + { + case 1: + value = inb(0xcfc + offset); + break; + case 2: + value = inw(0xcfc + offset); + break; + case 4: + value = inl(0xcfc + offset); + break; + default: + value = 0; + BUG(); + } + + spin_unlock_irqrestore(&pci_config_lock, flags); + + return value; +} + +void pci_conf_write(uint32_t cf8, uint8_t offset, uint8_t bytes, uint32_t data) +{ + unsigned long flags; + + BUG_ON((offset + bytes) > 4); + + spin_lock_irqsave(&pci_config_lock, flags); + + outl(cf8, 0xcf8); + + switch ( bytes ) + { + case 1: + outb((uint8_t)data, 0xcfc + offset); + break; + case 2: + outw((uint16_t)data, 0xcfc + offset); + break; + case 4: + outl(data, 0xcfc + offset); + break; + } + + spin_unlock_irqrestore(&pci_config_lock, flags); +} + +uint8_t pci_conf_read8( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg) +{ + BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255)); + return pci_conf_read(PCI_CONF_ADDRESS(bus, dev, func, reg), reg & 3, 1); +} + +uint16_t pci_conf_read16( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg) +{ + BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255)); + return pci_conf_read(PCI_CONF_ADDRESS(bus, dev, func, reg), reg & 2, 2); +} + +uint32_t pci_conf_read32( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg) +{ + BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255)); + return pci_conf_read(PCI_CONF_ADDRESS(bus, dev, func, reg), 0, 4); +} + +void pci_conf_write8( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg, + uint8_t data) +{ + BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255)); + pci_conf_write(PCI_CONF_ADDRESS(bus, dev, func, reg), reg & 3, 1, data); +} + +void pci_conf_write16( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg, + uint16_t data) +{ + BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255)); + pci_conf_write(PCI_CONF_ADDRESS(bus, dev, func, reg), reg & 2, 2, data); +} + +void pci_conf_write32( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg, + uint32_t data) +{ + BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255)); + pci_conf_write(PCI_CONF_ADDRESS(bus, dev, func, reg), 0, 4, data); +} + diff -Naurp xen/arch/x86/physdev.c xen-redhat/arch/x86/physdev.c --- xen/arch/x86/physdev.c +++ xen-redhat/arch/x86/physdev.c @@ -1,4 +1,3 @@ - #include <xen/config.h> #include <xen/init.h> #include <xen/lib.h> @@ -7,11 +6,13 @@ #include <xen/irq.h> #include <xen/event.h> #include <xen/guest_access.h> +#include <xen/iocap.h> #include <asm/current.h> -#include <asm/smpboot.h> +#include <asm/msi.h> #include <asm/hypercall.h> #include <public/xen.h> #include <public/physdev.h> +#include <asm/p2m.h> #ifndef COMPAT typedef long ret_t; @@ -24,10 +25,166 @@ int ioapic_guest_write( unsigned long physbase, unsigned int reg, u32 pval); +static int physdev_map_pirq(struct physdev_map_pirq *map) +{ + struct domain *d; + int vector, pirq, ret = 0; + struct msi_info _msi; + void *map_data = NULL; + + if ( !IS_PRIV(current->domain) ) 
+ return -EPERM; + + if ( !map ) + return -EINVAL; + + if ( map->domid == DOMID_SELF ) + d = rcu_lock_domain(current->domain); + else + d = rcu_lock_domain_by_id(map->domid); + + if ( d == NULL ) + { + ret = -ESRCH; + goto free_domain; + } + + /* Verify or get vector. */ + switch ( map->type ) + { + case MAP_PIRQ_TYPE_GSI: + if ( map->index < 0 || map->index >= NR_IRQS ) + { + dprintk(XENLOG_G_ERR, "dom%d: map invalid irq %d\n", + d->domain_id, map->index); + ret = -EINVAL; + goto free_domain; + } + vector = IO_APIC_VECTOR(map->index); + if ( !vector ) + { + dprintk(XENLOG_G_ERR, "dom%d: map irq with no vector %d\n", + d->domain_id, vector); + ret = -EINVAL; + goto free_domain; + } + break; + + case MAP_PIRQ_TYPE_MSI: + vector = map->index; + if ( vector == -1 ) + vector = assign_irq_vector(AUTO_ASSIGN); + + if ( vector < 0 || vector >= NR_VECTORS ) + { + dprintk(XENLOG_G_ERR, "dom%d: map irq with wrong vector %d\n", + d->domain_id, vector); + ret = -EINVAL; + goto free_domain; + } + + _msi.bus = map->bus; + _msi.devfn = map->devfn; + _msi.entry_nr = map->entry_nr; + _msi.table_base = map->table_base; + _msi.vector = vector; + map_data = &_msi; + break; + + default: + dprintk(XENLOG_G_ERR, "dom%d: wrong map_pirq type %x\n", + d->domain_id, map->type); + ret = -EINVAL; + goto free_domain; + } + + spin_lock(&pcidevs_lock); + /* Verify or get pirq. */ + spin_lock(&d->event_lock); + pirq = domain_vector_to_irq(d, vector); + if ( map->pirq < 0 ) + { + if ( pirq ) + { + dprintk(XENLOG_G_ERR, "dom%d: %d:%d already mapped to %d\n", + d->domain_id, map->index, map->pirq, + pirq); + if ( pirq < 0 ) + { + ret = -EBUSY; + goto done; + } + } + else + { + pirq = get_free_pirq(d, map->type, map->index); + if ( pirq < 0 ) + { + dprintk(XENLOG_G_ERR, "dom%d: no free pirq\n", d->domain_id); + ret = pirq; + goto done; + } + } + } + else + { + if ( pirq && pirq != map->pirq ) + { + dprintk(XENLOG_G_ERR, "dom%d: vector %d conflicts with irq %d\n", + d->domain_id, map->index, map->pirq); + ret = -EEXIST; + goto done; + } + else + pirq = map->pirq; + } + + ret = map_domain_pirq(d, pirq, vector, map->type, map_data); + if ( ret == 0 ) + map->pirq = pirq; + +done: + spin_unlock(&d->event_lock); + spin_unlock(&pcidevs_lock); + if ( (ret != 0) && (map->type == MAP_PIRQ_TYPE_MSI) && (map->index == -1) ) + free_irq_vector(vector); +free_domain: + rcu_unlock_domain(d); + return ret; +} + +static int physdev_unmap_pirq(struct physdev_unmap_pirq *unmap) +{ + struct domain *d; + int ret; + + if ( !IS_PRIV(current->domain) ) + return -EPERM; + + if ( unmap->domid == DOMID_SELF ) + d = rcu_lock_domain(current->domain); + else + d = rcu_lock_domain_by_id(unmap->domid); + + if ( d == NULL ) + return -ESRCH; + + spin_lock(&pcidevs_lock); + spin_lock(&d->event_lock); + ret = unmap_domain_pirq(d, unmap->pirq); + spin_unlock(&d->event_lock); + spin_unlock(&pcidevs_lock); + + rcu_unlock_domain(d); + + return ret; +} + ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE(void) arg) { int irq; ret_t ret; + struct vcpu *v = current; switch ( cmd ) { @@ -36,13 +193,13 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H ret = -EFAULT; if ( copy_from_guest(&eoi, arg, 1) != 0 ) break; - ret = pirq_guest_eoi(current->domain, eoi.irq); + ret = pirq_guest_eoi(v->domain, eoi.irq); break; } /* Legacy since 0x00030202. 
*/ case PHYSDEVOP_IRQ_UNMASK_NOTIFY: { - ret = pirq_guest_unmask(current->domain); + ret = pirq_guest_unmask(v->domain); break; } @@ -56,21 +213,67 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H if ( (irq < 0) || (irq >= NR_IRQS) ) break; irq_status_query.flags = 0; - if ( pirq_acktype(irq) != 0 ) - irq_status_query.flags |= XENIRQSTAT_needs_eoi; - if ( pirq_shared(irq) ) + /* + * Even edge-triggered or message-based IRQs can need masking from + * time to time. If teh guest is not dynamically checking for this + * via the new pirq_eoi_map mechanism, it must conservatively always + * execute the EOI hypercall. In practice, this only really makes a + * difference for maskable MSI sources, and if those are supported + * then dom0 is probably modern anyway. + */ + irq_status_query.flags |= XENIRQSTAT_needs_eoi; + if ( pirq_shared(v->domain, irq) ) irq_status_query.flags |= XENIRQSTAT_shared; ret = copy_to_guest(arg, &irq_status_query, 1) ? -EFAULT : 0; break; } + case PHYSDEVOP_set_device_msixtbl: { + struct physdev_device_msixtbl tbl; + + ret = -EFAULT; + if ( copy_from_guest(&tbl, arg, 1) != 0 ) + break; + + spin_lock(&pcidevs_lock); + ret = pci_set_device_msixtbl(tbl.bus, tbl.devfn, tbl.gtable); + spin_unlock(&pcidevs_lock); + + break; + } + + case PHYSDEVOP_map_pirq: { + struct physdev_map_pirq map; + + ret = -EFAULT; + if ( copy_from_guest(&map, arg, 1) != 0 ) + break; + + ret = physdev_map_pirq(&map); + + if ( copy_to_guest(arg, &map, 1) != 0 ) + ret = -EFAULT; + break; + } + + case PHYSDEVOP_unmap_pirq: { + struct physdev_unmap_pirq unmap; + + ret = -EFAULT; + if ( copy_from_guest(&unmap, arg, 1) != 0 ) + break; + + ret = physdev_unmap_pirq(&unmap); + break; + } + case PHYSDEVOP_apic_read: { struct physdev_apic apic; ret = -EFAULT; if ( copy_from_guest(&apic, arg, 1) != 0 ) break; ret = -EPERM; - if ( !IS_PRIV(current->domain) ) + if ( !IS_PRIV(v->domain) ) break; ret = ioapic_guest_read(apic.apic_physbase, apic.reg, &apic.value); if ( copy_to_guest(arg, &apic, 1) != 0 ) @@ -84,7 +287,7 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H if ( copy_from_guest(&apic, arg, 1) != 0 ) break; ret = -EPERM; - if ( !IS_PRIV(current->domain) ) + if ( !IS_PRIV(v->domain) ) break; ret = ioapic_guest_write(apic.apic_physbase, apic.reg, apic.value); break; @@ -98,7 +301,7 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H break; ret = -EPERM; - if ( !IS_PRIV(current->domain) ) + if ( !IS_PRIV(v->domain) ) break; irq = irq_op.irq; @@ -107,7 +310,16 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H break; irq_op.vector = assign_irq_vector(irq); - ret = copy_to_guest(arg, &irq_op, 1) ? 
-EFAULT : 0; + + spin_lock(&pcidevs_lock); + spin_lock(&dom0->event_lock); + ret = map_domain_pirq(dom0, irq_op.irq, irq_op.vector, + MAP_PIRQ_TYPE_GSI, NULL); + spin_unlock(&dom0->event_lock); + spin_unlock(&pcidevs_lock); + + if ( copy_to_guest(arg, &irq_op, 1) != 0 ) + ret = -EFAULT; break; } @@ -120,7 +332,7 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H if ( set_iopl.iopl > 3 ) break; ret = 0; - current->arch.iopl = set_iopl.iopl; + v->arch.iopl = set_iopl.iopl; break; } @@ -135,11 +347,37 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H break; ret = 0; #ifndef COMPAT - current->arch.iobmp = set_iobitmap.bitmap; + v->arch.iobmp = set_iobitmap.bitmap; #else - guest_from_compat_handle(current->arch.iobmp, set_iobitmap.bitmap); + guest_from_compat_handle(v->arch.iobmp, set_iobitmap.bitmap); #endif - current->arch.iobmp_limit = set_iobitmap.nr_ports; + v->arch.iobmp_limit = set_iobitmap.nr_ports; + break; + } + + case PHYSDEVOP_manage_pci_add: { + struct physdev_manage_pci manage_pci; + ret = -EPERM; + if ( !IS_PRIV(current->domain) ) + break; + ret = -EFAULT; + if ( copy_from_guest(&manage_pci, arg, 1) != 0 ) + break; + + ret = pci_add_device(manage_pci.bus, manage_pci.devfn); + break; + } + + case PHYSDEVOP_manage_pci_remove: { + struct physdev_manage_pci manage_pci; + ret = -EPERM; + if ( !IS_PRIV(current->domain) ) + break; + ret = -EFAULT; + if ( copy_from_guest(&manage_pci, arg, 1) != 0 ) + break; + + ret = pci_remove_device(manage_pci.bus, manage_pci.devfn); break; } diff -Naurp xen/arch/x86/platform_hypercall.c xen-redhat/arch/x86/platform_hypercall.c --- xen/arch/x86/platform_hypercall.c +++ xen-redhat/arch/x86/platform_hypercall.c @@ -34,10 +34,19 @@ DEFINE_SPINLOCK(xenpf_lock); # define copy_from_compat copy_from_guest # undef copy_to_compat # define copy_to_compat copy_to_guest +# undef guest_from_compat_handle +# define guest_from_compat_handle(x,y) ((x)=(y)) #else extern spinlock_t xenpf_lock; #endif +static DEFINE_PER_CPU(uint64_t, freq); + +static long cpu_frequency_change_helper(void *data) +{ + return cpu_frequency_change(this_cpu(freq)); +} + ret_t do_platform_op(XEN_GUEST_HANDLE(xen_platform_op_t) u_xenpf_op) { ret_t ret = 0; @@ -247,11 +256,82 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe } break; +#if defined (CONFIG_X86_64) && !defined (COMPAT) + case XENPF_stratus_call: + { + extern int do_stratus(xenpf_stratus_call_t*); + ret = (ret_t)do_stratus(&(op->u.stratus_call)); + if (copy_to_guest(u_xenpf_op, op, 1)) + ret = -EFAULT; + } + break; +#endif + + case XENPF_change_freq: + ret = -ENOSYS; + if ( cpufreq_controller != FREQCTL_dom0_kernel ) + break; + ret = -EINVAL; + if ( op->u.change_freq.flags || !cpu_online(op->u.change_freq.cpu) ) + break; + per_cpu(freq, op->u.change_freq.cpu) = op->u.change_freq.freq; + ret = continue_hypercall_on_cpu(op->u.change_freq.cpu, + cpu_frequency_change_helper, + NULL); + break; + + case XENPF_getidletime: + { + uint32_t cpu; + uint64_t idletime, now = NOW(); + struct vcpu *v; + struct xenctl_cpumap ctlmap; + cpumask_t cpumap; + XEN_GUEST_HANDLE(uint8_t) cpumap_bitmap; + XEN_GUEST_HANDLE(uint64_t) idletimes; + + ret = -ENOSYS; + if ( cpufreq_controller != FREQCTL_dom0_kernel ) + break; + + ctlmap.nr_cpus = op->u.getidletime.cpumap_nr_cpus; + guest_from_compat_handle(cpumap_bitmap, + op->u.getidletime.cpumap_bitmap); + ctlmap.bitmap.p = cpumap_bitmap.p; /* handle -> handle_64 conversion */ + xenctl_cpumap_to_cpumask(&cpumap, &ctlmap); + guest_from_compat_handle(idletimes, op->u.getidletime.idletime); + + for_each_cpu_mask ( cpu, cpumap ) + { 
+ if ( (v = idle_vcpu[cpu]) != NULL ) + { + idletime = v->runstate.time[RUNSTATE_running]; + if ( v->is_running ) + idletime += now - v->runstate.state_entry_time; + } + else + { + idletime = 0; + cpu_clear(cpu, cpumap); + } + + ret = -EFAULT; + if ( copy_to_guest_offset(idletimes, cpu, &idletime, 1) ) + goto out; + } + + op->u.getidletime.now = now; + cpumask_to_xenctl_cpumap(&ctlmap, &cpumap); + ret = copy_to_guest(u_xenpf_op, op, 1) ? -EFAULT : 0; + } + break; + default: ret = -ENOSYS; break; } + out: spin_unlock(&xenpf_lock); return ret; diff -Naurp xen/arch/x86/setup.c xen-redhat/arch/x86/setup.c --- xen/arch/x86/setup.c +++ xen-redhat/arch/x86/setup.c @@ -19,6 +19,7 @@ #include <xen/numa.h> #include <xen/rcupdate.h> #include <xen/vga.h> +#include <xen/dmi.h> #include <public/version.h> #ifdef CONFIG_COMPAT #include <compat/platform.h> @@ -44,7 +45,6 @@ #define maddr_to_bootstrap_virt(m) ((void *)(long)(m)) #endif -extern void dmi_scan_machine(void); extern void generic_apic_probe(void); extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn); @@ -109,6 +109,12 @@ extern void early_cpu_init(void); extern void vesa_init(void); extern void vesa_mtrr_init(void); +DEFINE_PER_CPU(struct desc_struct *, gdt_table) = boot_cpu_gdt_table; +#ifdef CONFIG_COMPAT +DEFINE_PER_CPU(struct desc_struct *, compat_gdt_table) + = boot_cpu_compat_gdt_table; +#endif + struct tss_struct init_tss[NR_CPUS]; char __attribute__ ((__section__(".bss.stack_aligned"))) cpu0_stack[STACK_SIZE]; @@ -314,41 +320,6 @@ static void __init move_memory( /* A temporary copy of the e820 map that we can mess with during bootstrap. */ static struct e820map __initdata boot_e820; -/* Reserve area (@s,@e) in the temporary bootstrap e820 map. */ -static int __init reserve_in_boot_e820(unsigned long s, unsigned long e) -{ - uint64_t rs, re; - int i; - - for ( i = 0; i < boot_e820.nr_map; i++ ) - { - /* Have we found the e820 region that includes the specified range? */ - rs = boot_e820.map[i].addr; - re = rs + boot_e820.map[i].size; - if ( (s >= rs) && (e <= re) ) - goto found; - } - - return 0; - - found: - /* Start fragment. */ - boot_e820.map[i].size = s - rs; - - /* End fragment. 
*/ - if ( e < re ) - { - memmove(&boot_e820.map[i+1], &boot_e820.map[i], - (boot_e820.nr_map-i) * sizeof(boot_e820.map[0])); - boot_e820.nr_map++; - i++; - boot_e820.map[i].addr = e; - boot_e820.map[i].size = re - e; - } - - return 1; -} - struct boot_video_info { u8 orig_x; /* 0x00 */ u8 orig_y; /* 0x01 */ @@ -411,6 +382,32 @@ static void __init parse_video_info(void } } +void __init kexec_reserve_area(struct e820map *e820) +{ + unsigned long kdump_start = kexec_crash_area.start; + unsigned long kdump_size = kexec_crash_area.size; + static int is_reserved = 0; + + kdump_size = (kdump_size + PAGE_SIZE - 1) & PAGE_MASK; + + if ( (kdump_start == 0) || (kdump_size == 0) || is_reserved ) + return; + + is_reserved = 1; + + if ( !reserve_e820_ram(e820, kdump_start, kdump_start + kdump_size) ) + { + printk("Kdump: DISABLED (failed to reserve %luMB (%lukB) at 0x%lx)" + "\n", kdump_size >> 20, kdump_size >> 10, kdump_start); + kexec_crash_area.start = kexec_crash_area.size = 0; + } + else + { + printk("Kdump: %luMB (%lukB) at 0x%lx\n", + kdump_size >> 20, kdump_size >> 10, kdump_start); + } +} + void init_done(void) { extern char __init_begin[], __init_end[]; @@ -483,6 +480,9 @@ void __init __start_xen(unsigned long mb set_current((struct vcpu *)0xfffff000); /* debug sanity */ idle_vcpu[0] = current; set_processor_id(0); /* needed early, for smp_processor_id() */ + if ( cpu_has_efer ) + rdmsrl(MSR_EFER, this_cpu(efer)); + asm volatile ( "mov %%cr4,%0" : "=r" (this_cpu(cr4)) ); smp_prepare_boot_cpu(); @@ -556,14 +556,6 @@ void __init __start_xen(unsigned long mb if ( ((unsigned long)cpu0_stack & (STACK_SIZE-1)) != 0 ) EARLY_FAIL("Misaligned CPU0 stack.\n"); - /* - * Since there are some stubs getting built on the stacks which use - * direct calls/jumps, the heap must be confined to the lower 2G so - * that those branches can reach their targets. - */ - if ( opt_xenheap_megabytes > 2048 ) - opt_xenheap_megabytes = 2048; - if ( e820_raw_nr != 0 ) { memmap_type = "Xen-e820"; @@ -582,7 +574,7 @@ void __init __start_xen(unsigned long mb else if ( mbi->flags & MBI_MEMMAP ) { memmap_type = "Multiboot-e820"; - while ( bytes < mbi->mmap_length ) + while ( (bytes < mbi->mmap_length) && (e820_raw_nr < E820MAX) ) { memory_map_t *map = __va(mbi->mmap_addr + bytes); @@ -633,47 +625,31 @@ void __init __start_xen(unsigned long mb EARLY_FAIL("Bootloader provided no memory information.\n"); } - /* Ensure that all E820 RAM regions are page-aligned and -sized. */ - for ( i = 0; i < e820_raw_nr; i++ ) - { - uint64_t s, e; - - if ( e820_raw[i].type != E820_RAM ) - continue; - s = PFN_UP(e820_raw[i].addr); - e = PFN_DOWN(e820_raw[i].addr + e820_raw[i].size); - e820_raw[i].size = 0; /* discarded later */ - if ( s < e ) - { - e820_raw[i].addr = s << PAGE_SHIFT; - e820_raw[i].size = (e - s) << PAGE_SHIFT; - } - } - /* Sanitise the raw E820 map to produce a final clean version. */ max_page = init_e820(memmap_type, e820_raw, &e820_raw_nr); +#ifdef CONFIG_X86_64 /* - * Create a temporary copy of the E820 map. Truncate it to above 16MB - * as anything below that is already mapped and has a statically-allocated - * purpose. + * On x86/64 we are able to account for the allocation bitmap + * (allocated in common/page_alloc.c:init_boot_allocator()) stealing + * from the Xen heap. Here we make the Xen heap appropriately larger. 
*/ + opt_xenheap_megabytes += (max_page / 8) >> 20; +#endif + + /* + * Since there are some stubs getting built on the stacks which use + * direct calls/jumps, the heap must be confined to the lower 2G so + * that those branches can reach their targets. + */ + if ( opt_xenheap_megabytes > 2048 ) + opt_xenheap_megabytes = 2048; + + /* Create a temporary copy of the E820 map. */ memcpy(&boot_e820, &e820, sizeof(e820)); - for ( i = 0; i < boot_e820.nr_map; i++ ) - { - uint64_t s, e, min = 16 << 20; /* 16MB */ - s = boot_e820.map[i].addr; - e = boot_e820.map[i].addr + boot_e820.map[i].size; - if ( s >= min ) - continue; - if ( e > min ) - { - boot_e820.map[i].addr = min; - boot_e820.map[i].size = e - min; - } - else - boot_e820.map[i].type = E820_RESERVED; - } + + /* Early kexec reservation (explicit static start address). */ + kexec_reserve_area(&boot_e820); /* * Iterate backwards over all superpage-aligned RAM regions. @@ -693,9 +669,10 @@ void __init __start_xen(unsigned long mb { uint64_t s, e, mask = (1UL << L2_PAGETABLE_SHIFT) - 1; - /* Superpage-aligned chunks up to BOOTSTRAP_DIRECTMAP_END, please. */ + /* Superpage-aligned chunks from 16MB to BOOTSTRAP_DIRECTMAP_END. */ s = (boot_e820.map[i].addr + mask) & ~mask; e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask; + s = max_t(uint64_t, s, 16 << 20); e = min_t(uint64_t, e, BOOTSTRAP_DIRECTMAP_END); if ( (boot_e820.map[i].type != E820_RAM) || (s >= e) ) continue; @@ -796,71 +773,61 @@ void __init __start_xen(unsigned long mb if ( !initial_images_start ) EARLY_FAIL("Not enough memory to relocate the dom0 kernel image.\n"); - reserve_in_boot_e820(initial_images_start, initial_images_end); + reserve_e820_ram(&boot_e820, initial_images_start, initial_images_end); - /* - * With modules (and Xen itself, on x86/64) relocated out of the way, we - * can now initialise the boot allocator with some memory. - */ + /* Initialise Xen heap and boot heap. */ xenheap_phys_start = init_boot_allocator(__pa(&_end)); xenheap_phys_end = opt_xenheap_megabytes << 20; #if defined(CONFIG_X86_64) if ( !xen_phys_start ) EARLY_FAIL("Not enough memory to relocate Xen.\n"); xenheap_phys_end += xen_phys_start; - reserve_in_boot_e820(xen_phys_start, - xen_phys_start + (opt_xenheap_megabytes<<20)); - init_boot_pages(1<<20, 16<<20); /* Initial seed: 15MB */ -#else - init_boot_pages(xenheap_phys_end, 16<<20); /* Initial seed: 4MB */ + reserve_e820_ram(&boot_e820, xen_phys_start, + xen_phys_start + (opt_xenheap_megabytes<<20)); #endif - if ( kexec_crash_area.size != 0 ) - { - unsigned long kdump_start = kexec_crash_area.start; - unsigned long kdump_size = kexec_crash_area.size; - - kdump_size = (kdump_size + PAGE_SIZE - 1) & PAGE_MASK; - - if ( !reserve_in_boot_e820(kdump_start, kdump_size) ) - { - printk("Kdump: DISABLED (failed to reserve %luMB (%lukB) at 0x%lx)" - "\n", kdump_size >> 20, kdump_size >> 10, kdump_start); - kexec_crash_area.start = kexec_crash_area.size = 0; - } - else - { - printk("Kdump: %luMB (%lukB) at 0x%lx\n", - kdump_size >> 20, kdump_size >> 10, kdump_start); - } - } + /* Late kexec reservation (dynamic start address). */ + kexec_reserve_area(&boot_e820); /* - * With the boot allocator now seeded, we can walk every RAM region and - * map it in its entirety (on x86/64, at least) and notify it to the + * With the boot allocator now initialised, we can walk every RAM region + * and map it in its entirety (on x86/64, at least) and notify it to the * boot allocator. 
*/ for ( i = 0; i < boot_e820.nr_map; i++ ) { - uint64_t s, e, map_e, mask = PAGE_SIZE - 1; + uint64_t s, e, map_s, map_e, mask = PAGE_SIZE - 1; /* Only page alignment required now. */ s = (boot_e820.map[i].addr + mask) & ~mask; e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask; +#if defined(CONFIG_X86_32) + s = max_t(uint64_t, s, xenheap_phys_end); +#else + s = max_t(uint64_t, s, 1<<20); +#endif if ( (boot_e820.map[i].type != E820_RAM) || (s >= e) ) continue; - /* Perform the mapping (truncated in 32-bit mode). */ + /* Need to create mappings above 16MB. */ + map_s = max_t(uint64_t, s, 16<<20); map_e = e; -#if defined(CONFIG_X86_32) +#if defined(CONFIG_X86_32) /* mappings are truncated on x86_32 */ map_e = min_t(uint64_t, map_e, BOOTSTRAP_DIRECTMAP_END); #endif - if ( s < map_e ) + + /* Pass mapped memory to allocator /before/ creating new mappings. */ + init_boot_pages(s, min_t(uint64_t, map_s, e)); + + /* Create new mappings /before/ passing memory to the allocator. */ + if ( map_s < map_e ) map_pages_to_xen( - (unsigned long)maddr_to_bootstrap_virt(s), - s >> PAGE_SHIFT, (map_e-s) >> PAGE_SHIFT, PAGE_HYPERVISOR); + (unsigned long)maddr_to_bootstrap_virt(map_s), + map_s >> PAGE_SHIFT, (map_e-map_s) >> PAGE_SHIFT, + PAGE_HYPERVISOR); - init_boot_pages(s, e); + /* Pass remainder of this memory chunk to the allocator. */ + init_boot_pages(map_s, e); } memguard_init(); @@ -988,6 +955,8 @@ void __init __start_xen(unsigned long mb if ( opt_nosmp ) max_cpus = 0; + iommu_setup(); + smp_prepare_cpus(max_cpus); /* @@ -1161,6 +1130,14 @@ void arch_get_xen_caps(xen_capabilities_ #endif } +int xen_in_range(paddr_t start, paddr_t end) +{ + start = max_t(paddr_t, start, xenheap_phys_start); + end = min_t(paddr_t, end, xenheap_phys_end); + + return start < end; +} + /* * Local variables: * mode: C diff -Naurp xen/arch/x86/smpboot.c xen-redhat/arch/x86/smpboot.c --- xen/arch/x86/smpboot.c +++ xen-redhat/arch/x86/smpboot.c @@ -50,6 +50,7 @@ #include <asm/div64.h> #include <asm/flushtlb.h> #include <asm/msr.h> +#include <asm/mtrr.h> #include <mach_apic.h> #include <mach_wakecpu.h> #include <smpboot_hooks.h> @@ -489,6 +490,9 @@ void __devinit start_secondary(void *unu set_processor_id(cpu); set_current(idle_vcpu[cpu]); this_cpu(curr_vcpu) = idle_vcpu[cpu]; + if ( cpu_has_efer ) + rdmsrl(MSR_EFER, this_cpu(efer)); + asm volatile ( "mov %%cr4,%0" : "=r" (this_cpu(cr4)) ); percpu_traps_init(); @@ -531,6 +535,7 @@ void __devinit start_secondary(void *unu /* We can take interrupts now: we're officially "up". */ local_irq_enable(); + mtrr_ap_init(); init_percpu_time(); @@ -543,40 +548,6 @@ extern struct { unsigned short ss; } stack_start; -#ifdef CONFIG_NUMA - -/* which logical CPUs are on which nodes */ -cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly = - { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE }; -/* which node each logical CPU is on */ -int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; -EXPORT_SYMBOL(cpu_2_node); - -/* set up a mapping between cpu and node. */ -static inline void map_cpu_to_node(int cpu, int node) -{ - printk("Mapping cpu %d to node %d\n", cpu, node); - cpu_set(cpu, node_2_cpu_mask[node]); - cpu_2_node[cpu] = node; -} - -/* undo a mapping between cpu and node. 
*/ -static inline void unmap_cpu_to_node(int cpu) -{ - int node; - - printk("Unmapping cpu %d from all nodes\n", cpu); - for (node = 0; node < MAX_NUMNODES; node ++) - cpu_clear(cpu, node_2_cpu_mask[node]); - cpu_2_node[cpu] = 0; -} -#else /* !CONFIG_NUMA */ - -#define map_cpu_to_node(cpu, node) ({}) -#define unmap_cpu_to_node(cpu) ({}) - -#endif /* CONFIG_NUMA */ - u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; static void map_cpu_to_logical_apicid(void) @@ -585,13 +556,11 @@ static void map_cpu_to_logical_apicid(vo int apicid = hard_smp_processor_id(); cpu_2_logical_apicid[cpu] = apicid; - map_cpu_to_node(cpu, apicid_to_node(apicid)); } static void unmap_cpu_to_logical_apicid(int cpu) { cpu_2_logical_apicid[cpu] = BAD_APICID; - unmap_cpu_to_node(cpu); } #if APIC_DEBUG @@ -838,10 +807,15 @@ static int __devinit do_boot_cpu(int api */ { unsigned long boot_error; + unsigned int order; int timeout; unsigned long start_eip; unsigned short nmi_high = 0, nmi_low = 0; struct vcpu *v; + struct desc_struct *gdt; +#ifdef __x86_64__ + struct page_info *page; +#endif ++cpucount; @@ -861,6 +835,41 @@ static int __devinit do_boot_cpu(int api /* Debug build: detect stack overflow by setting up a guard page. */ memguard_guard_stack(stack_start.esp); + gdt = per_cpu(gdt_table, cpu); + if (gdt == boot_cpu_gdt_table) { + order = get_order_from_pages(NR_RESERVED_GDT_PAGES); +#ifdef __x86_64__ +#ifdef CONFIG_COMPAT + page = alloc_domheap_pages(NULL, order, 0); + per_cpu(compat_gdt_table, cpu) = gdt = page_to_virt(page); + memcpy(gdt, boot_cpu_compat_gdt_table, + NR_RESERVED_GDT_PAGES * PAGE_SIZE); + gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu; +#endif + page = alloc_domheap_pages(NULL, order, 0); + per_cpu(gdt_table, cpu) = gdt = page_to_virt(page); +#else + per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(order); +#endif + memcpy(gdt, boot_cpu_gdt_table, + NR_RESERVED_GDT_PAGES * PAGE_SIZE); + BUILD_BUG_ON(NR_CPUS > 0x10000); + gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu; + } + +#ifdef __i386__ + if (!per_cpu(doublefault_tss, cpu)) { + per_cpu(doublefault_tss, cpu) = alloc_xenheap_page(); + memset(per_cpu(doublefault_tss, cpu), 0, PAGE_SIZE); + } +#endif + + if (!idt_tables[cpu]) { + idt_tables[cpu] = xmalloc_array(idt_entry_t, IDT_ENTRIES); + memcpy(idt_tables[cpu], idt_table, + IDT_ENTRIES*sizeof(idt_entry_t)); + } + /* * This grunge runs the startup process for * the targeted processor. 
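do_boot_cpu() above now hands every secondary CPU its own copy of the boot GDT (and IDT), with the per-CPU slot stamped with the CPU number. A stripped-down sketch of that copy-and-stamp pattern, using placeholder table sizes and slot indices rather than Xen's real layout:

/* Placeholder sizes/slots for illustration; only the copy-and-stamp idea
 * matches the patch. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>

#define GDT_ENTRIES   16     /* made up, not Xen's reserved GDT size */
#define PER_CPU_SLOT   7     /* made up slot holding the CPU number  */

struct desc { uint32_t a, b; };              /* low/high halves of an entry */

static struct desc boot_gdt[GDT_ENTRIES];    /* shared boot-time template */

static struct desc *alloc_cpu_gdt(unsigned int cpu)
{
    struct desc *gdt = malloc(sizeof(boot_gdt));

    if (gdt == NULL)
        return NULL;
    memcpy(gdt, boot_gdt, sizeof(boot_gdt));  /* start from the template */
    gdt[PER_CPU_SLOT].a = cpu;                /* stamp this CPU's identity */
    return gdt;
}

int main(void)
{
    struct desc *gdt = alloc_cpu_gdt(1);

    if (gdt != NULL) {
        printf("per-cpu slot holds %" PRIu32 "\n", gdt[PER_CPU_SLOT].a);
        free(gdt);
    }
    return 0;
}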
@@ -1121,6 +1130,7 @@ void __init smp_prepare_cpus(unsigned in smp_commenced_mask = cpumask_of_cpu(0); cpu_callin_map = cpumask_of_cpu(0); mb(); + mtrr_aps_sync_begin(); smp_boot_cpus(max_cpus); } @@ -1158,6 +1168,7 @@ void __init smp_cpus_done(unsigned int m #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); #endif + mtrr_aps_sync_end(); #ifndef CONFIG_HOTPLUG_CPU /* * Disable executability of the SMP trampoline: diff -Naurp xen/arch/x86/smp.c xen-redhat/arch/x86/smp.c --- xen/arch/x86/smp.c +++ xen-redhat/arch/x86/smp.c @@ -86,6 +86,12 @@ static inline void check_IPI_mask(cpumas ASSERT(!cpus_empty(cpumask)); } +void apic_wait_icr_idle(void) +{ + while ( apic_read( APIC_ICR ) & APIC_ICR_BUSY ) + cpu_relax(); +} + void send_IPI_mask_flat(cpumask_t cpumask, int vector) { unsigned long mask = cpus_addr(cpumask)[0]; diff -Naurp xen/arch/x86/srat.c xen-redhat/arch/x86/srat.c --- xen/arch/x86/srat.c +++ xen-redhat/arch/x86/srat.c @@ -17,6 +17,7 @@ #include <xen/nodemask.h> #include <xen/acpi.h> #include <xen/numa.h> +#include <asm/e820.h> #include <asm/page.h> static struct acpi_table_slit *acpi_slit; @@ -217,23 +218,39 @@ acpi_numa_memory_affinity_init(struct ac static int nodes_cover_memory(void) { int i; - u64 pxmram, e820ram; - pxmram = 0; - for_each_node_mask(i, nodes_parsed) { - u64 s = nodes[i].start >> PAGE_SHIFT; - u64 e = nodes[i].end >> PAGE_SHIFT; - pxmram += e - s; - } + for (i = 0; i < e820.nr_map; i++) { + int j, found; + unsigned long long start, end; - e820ram = max_page; - /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ - if ((long)(e820ram - pxmram) >= 1*1024*1024) { - printk(KERN_ERR "SRAT: PXMs only cover %"PRIu64"MB of your %" - PRIu64"MB e820 RAM. Not used.\n", - (pxmram << PAGE_SHIFT) >> 20, - (e820ram << PAGE_SHIFT) >> 20); - return 0; + if (e820.map[i].type != E820_RAM) { + continue; + } + + start = e820.map[i].addr; + end = e820.map[i].addr + e820.map[i].size - 1; + + do { + found = 0; + for_each_node_mask(j, nodes_parsed) + if (start < nodes[j].end + && end > nodes[j].start) { + if (start >= nodes[j].start) { + start = nodes[j].end; + found = 1; + } + if (end <= nodes[j].end) { + end = nodes[j].start; + found = 1; + } + } + } while (found && start < end); + + if (start < end) { + printk(KERN_ERR "SRAT: No PXM for e820 range: " + "%016Lx - %016Lx\n", start, end); + return 0; + } } return 1; } diff -Naurp xen/arch/x86/sysctl.c xen-redhat/arch/x86/sysctl.c --- xen/arch/x86/sysctl.c +++ xen-redhat/arch/x86/sysctl.c @@ -23,6 +23,10 @@ #include <asm/hvm/hvm.h> #include <asm/hvm/support.h> #include <asm/processor.h> +#include <asm/numa.h> +#include <xen/nodemask.h> + +#define get_xen_guest_handle(val, hnd) do { val = (hnd).p; } while (0) long arch_do_sysctl( struct xen_sysctl *sysctl, XEN_GUEST_HANDLE(xen_sysctl_t) u_sysctl) @@ -34,25 +38,57 @@ long arch_do_sysctl( case XEN_SYSCTL_physinfo: { + uint32_t i, max_array_ent; + xen_sysctl_physinfo_t *pi = &sysctl->u.physinfo; pi->threads_per_core = cpus_weight(cpu_sibling_map[0]); pi->cores_per_socket = cpus_weight(cpu_core_map[0]) / pi->threads_per_core; - pi->sockets_per_node = - num_online_cpus() / cpus_weight(cpu_core_map[0]); + pi->nr_nodes = num_online_nodes(); + + /* + * RHEL5 ABI compat: + * Newer userspace expects 'sockets_per_node' to actually + * contain 'nr_cpus' data. 
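The compat note above is about a single reused field: older RHEL5 tools read sockets_per_node literally, while newer tools expect the same slot to carry the online CPU count and fetch per-CPU node data via cpu_to_node instead. A worked example with a made-up topology, matching the interface_version check that follows:

/* Made-up topology: 2 nodes, 4 cores/socket, 2 threads/core, 16 CPUs online. */
#include <stdio.h>

int main(void)
{
    unsigned int nr_cpus = 16, nr_nodes = 2;
    unsigned int cores_per_socket = 4, threads_per_core = 2;

    /* Old ABI: the field really is sockets per node. */
    unsigned int legacy = nr_cpus /
        (nr_nodes * cores_per_socket * threads_per_core);

    /* New ABI: the same field is reused to report the CPU count. */
    printf("legacy sockets_per_node = %u, new-style value = %u\n",
           legacy, nr_cpus);
    return 0;
}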
+ */ + if (sysctl->interface_version > XEN_SYSCTL_INTERFACE_VERSION) + pi->sockets_per_node = (u32)num_online_cpus(); + else + pi->sockets_per_node = num_online_cpus() / + (pi->nr_nodes * pi->cores_per_socket * pi->threads_per_core); - pi->nr_nodes = 1; pi->total_pages = total_pages; pi->free_pages = avail_domheap_pages(); - pi->scrub_pages = avail_scrub_pages(); + pi->scrub_pages = 0; pi->cpu_khz = cpu_khz; memset(pi->hw_cap, 0, sizeof(pi->hw_cap)); memcpy(pi->hw_cap, boot_cpu_data.x86_capability, NCAPINTS*4); - ret = 0; - if ( copy_to_guest(u_sysctl, sysctl, 1) ) - ret = -EFAULT; + + max_array_ent = pi->max_cpu_id; + pi->max_cpu_id = last_cpu(cpu_online_map); + max_array_ent = min_t(uint32_t, max_array_ent, pi->max_cpu_id); + + ret = -EFAULT; + /* + * RHEL5 ABI compat: + * Only fill in extended NUMA info if a newer userspace + * is talking to us + */ + if (sysctl->interface_version > XEN_SYSCTL_INTERFACE_VERSION) + { + if ( !guest_handle_is_null(pi->cpu_to_node) ) + { + for ( i = 0; i <= max_array_ent; i++ ) + { + uint32_t node = cpu_online(i) ? cpu_to_node(i) : ~0u; + if ( copy_to_guest_offset(pi->cpu_to_node, i, &node, 1) ) + break; + } + } + } + ret = copy_to_guest(u_sysctl, sysctl, 1) ? -EFAULT : 0; } break; diff -Naurp xen/arch/x86/time.c xen-redhat/arch/x86/time.c --- xen/arch/x86/time.c +++ xen-redhat/arch/x86/time.c @@ -177,7 +177,6 @@ static u64 init_pit_and_calibrate_tsc(vo unsigned long count; /* Set PIT channel 0 to HZ Hz. */ -#define CLOCK_TICK_RATE 1193180 /* crystal freq (Hz) */ #define LATCH (((CLOCK_TICK_RATE)+(HZ/2))/HZ) outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ outb_p(LATCH & 0xff, PIT_CH0); /* LSB */ @@ -554,8 +553,7 @@ static void init_platform_timer(void) if ( (rc <= 0) && !init_cyclone(pts) && - !init_hpet(pts) && - !init_pmtimer(pts) ) + !init_hpet(pts) ) init_pit(pts); plt_mask = (u32)~0u >> (32 - pts->counter_bits); @@ -725,6 +723,37 @@ void update_domain_wallclock_time(struct spin_unlock(&wc_lock); } +int cpu_frequency_change(u64 freq) +{ + struct cpu_time *t = &this_cpu(cpu_time); + u64 curr_tsc; + + /* Sanity check: CPU frequency allegedly dropping below 1MHz? */ + if ( freq < 1000000u ) + { + gdprintk(XENLOG_WARNING, "Rejecting CPU frequency change " + "to %"PRIu64" Hz.\n", freq); + return -EINVAL; + } + + local_irq_disable(); + rdtscll(curr_tsc); + t->local_tsc_stamp = curr_tsc; + t->stime_master_stamp = read_platform_stime(); + /* TSC-extrapolated time may be bogus after frequency change. */ + /*t->stime_local_stamp = get_s_time();*/ + t->stime_local_stamp = t->stime_master_stamp; + set_time_scale(&t->tsc_scale, freq); + local_irq_enable(); + + /* A full epoch should pass before we check for deviation. */ + set_timer(&t->calibration_timer, NOW() + EPOCH); + if ( smp_processor_id() == 0 ) + platform_time_calibration(); + + return 0; +} + /* Set clock to <secs,usecs> after 00:00:00 UTC, 1 January, 1970. */ void do_settime(unsigned long secs, unsigned long nsecs, u64 system_time_base) { @@ -869,12 +898,14 @@ static void local_time_calibration(void error_factor, calibration_mul_frac, tsc_shift); #endif - /* Record new timestamp information. */ + /* Record new timestamp information, atomically w.r.t. interrupts. 
*/ + local_irq_disable(); t->tsc_scale.mul_frac = calibration_mul_frac; t->tsc_scale.shift = tsc_shift; t->local_tsc_stamp = curr_tsc; t->stime_local_stamp = curr_local_stime; t->stime_master_stamp = curr_master_stime; + local_irq_enable(); update_vcpu_system_time(current); @@ -974,6 +1005,50 @@ int time_resume(void) return 0; } +int dom0_pit_access(struct ioreq *ioreq) +{ + /* Is Xen using Channel 2? Then disallow direct dom0 access. */ + if ( plt_src.read_counter == read_pit_count ) + return 0; + + switch ( ioreq->addr ) + { + case PIT_CH2: + if ( ioreq->dir == IOREQ_READ ) + ioreq->data = inb(PIT_CH2); + else + outb(ioreq->data, PIT_CH2); + return 1; + + case PIT_MODE: + if ( ioreq->dir == IOREQ_READ ) + return 0; /* urk! */ + switch ( ioreq->data & 0xc0 ) + { + case 0xc0: /* Read Back */ + if ( ioreq->data & 0x08 ) /* Select Channel 2? */ + outb(ioreq->data & 0xf8, PIT_MODE); + if ( !(ioreq->data & 0x06) ) /* Select Channel 0/1? */ + return 1; /* no - we're done */ + /* Filter Channel 2 and reserved bit 0. */ + ioreq->data &= ~0x09; + return 0; /* emulate ch0/1 readback */ + case 0x80: /* Select Counter 2 */ + outb(ioreq->data, PIT_MODE); + return 1; + } + + case 0x61: + if ( ioreq->dir == IOREQ_READ ) + ioreq->data = inb(0x61); + else + outb((inb(0x61) & ~3) | (ioreq->data & 3), 0x61); + return 1; + } + + return 0; +} + /* * Local variables: * mode: C diff -Naurp xen/arch/x86/traps.c xen-redhat/arch/x86/traps.c --- xen/arch/x86/traps.c +++ xen-redhat/arch/x86/traps.c @@ -107,6 +107,8 @@ DECLARE_TRAP_HANDLER(spurious_interrupt_ long do_set_debugreg(int reg, unsigned long value); unsigned long do_get_debugreg(int reg); +void (*ioemul_handle_quirk)( + u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs); static int debug_stack_lines = 20; integer_param("debug_stack_lines", debug_stack_lines); @@ -602,28 +604,76 @@ static int emulate_forced_invalid_op(str : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "0" (a), "1" (b), "2" (c), "3" (d) ); - if ( regs->eax == 1 ) + if ( (regs->eax & 0x7fffffff) == 1 ) { /* Modify Feature Information. */ clear_bit(X86_FEATURE_VME, &d); + if ( !cpu_has_apic ) + clear_bit(X86_FEATURE_APIC, &d); clear_bit(X86_FEATURE_DE, &d); clear_bit(X86_FEATURE_PSE, &d); clear_bit(X86_FEATURE_PGE, &d); - if ( !supervisor_mode_kernel ) - clear_bit(X86_FEATURE_SEP, &d); + clear_bit(X86_FEATURE_MCE, &d); + clear_bit(X86_FEATURE_MCA, &d); if ( !IS_PRIV(current->domain) ) clear_bit(X86_FEATURE_MTRR, &d); + clear_bit(X86_FEATURE_PSE36, &d); } - else if ( regs->eax == 0x80000001 ) + switch ( (uint32_t)regs->eax ) { + case 1: + /* Modify Feature Information. */ + if ( !supervisor_mode_kernel ) + clear_bit(X86_FEATURE_SEP, &d); + clear_bit(X86_FEATURE_DS, &d); + clear_bit(X86_FEATURE_ACC, &d); + clear_bit(X86_FEATURE_PBE, &d); + + clear_bit(X86_FEATURE_DTES64 % 32, &c); + clear_bit(X86_FEATURE_MWAIT % 32, &c); + clear_bit(X86_FEATURE_DSCPL % 32, &c); + clear_bit(X86_FEATURE_SMXE % 32, &c); + clear_bit(X86_FEATURE_TM2 % 32, &c); + if ( is_pv_32bit_vcpu(current) ) + clear_bit(X86_FEATURE_CX16 % 32, &c); + clear_bit(X86_FEATURE_XTPR % 32, &c); + clear_bit(X86_FEATURE_PDCM % 32, &c); + clear_bit(X86_FEATURE_DCA % 32, &c); + clear_bit(X86_FEATURE_XSAVE % 32, &c); + set_bit(X86_FEATURE_HYPERVISOR % 32, &c); + break; + case 0x80000001: /* Modify Feature Information. 
*/ if ( is_pv_32bit_vcpu(current) ) + { clear_bit(X86_FEATURE_SYSCALL % 32, &d); + clear_bit(X86_FEATURE_LM % 32, &d); + clear_bit(X86_FEATURE_LAHF_LM % 32, &c); + } + clear_bit(X86_FEATURE_PAGE1GB % 32, &d); clear_bit(X86_FEATURE_RDTSCP % 32, &d); - } - else - { + + clear_bit(X86_FEATURE_OSVW % 32, &c); + clear_bit(X86_FEATURE_IBS % 32, &c); + clear_bit(X86_FEATURE_SKINIT % 32, &c); + clear_bit(X86_FEATURE_WDT % 32, &c); + clear_bit(X86_FEATURE_LWP % 32, &c); + clear_bit(X86_FEATURE_NODEID_MSR % 32, &c); + clear_bit(X86_FEATURE_TOPOEXT % 32, &c); + clear_bit(X86_FEATURE_PERFCTR_CORE % 32, &c); + clear_bit(X86_FEATURE_PERFCTR_NB % 32, &c); + break; + case 5: /* MONITOR/MWAIT */ + case 0xa: /* Architectural Performance Monitor Features */ + case 0x8000000a: /* SVM revision and features */ + case 0x8000001b: /* Instruction Based Sampling */ + case 0x8000001c: /* Light Weight Profiling */ + case 0x8000001e: /* Extended topology reporting */ + a = b = c = d = 0; + break; + default: (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d); + break; } regs->eax = a; @@ -641,6 +691,7 @@ asmlinkage int do_invalid_op(struct cpu_ struct bug_frame bug; struct bug_frame_str bug_str; char *filename, *predicate, *eip = (char *)regs->eip; + unsigned long fixup; int rc, id, lineno; DEBUGGER_trap_entry(TRAP_invalid_op, regs); @@ -711,6 +762,11 @@ asmlinkage int do_invalid_op(struct cpu_ predicate, filename, lineno); die: + if ( (fixup = search_exception_table(regs->eip)) != 0 ) + { + regs->eip = fixup; + return 0; + } DEBUGGER_trap_fatal(TRAP_invalid_op, regs); show_execution_state(regs); panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op); @@ -738,6 +794,15 @@ asmlinkage void do_machine_check(struct machine_check_vector(regs, regs->error_code); } +static void reserved_bit_page_fault( + unsigned long addr, struct cpu_user_regs *regs) +{ + printk("d%d:v%d: reserved bit in page table (ec=%04X)\n", + current->domain->domain_id, current->vcpu_id, regs->error_code); + show_page_walk(addr); + show_execution_state(regs); +} + void propagate_page_fault(unsigned long addr, u16 error_code) { struct trap_info *ti; @@ -761,10 +826,13 @@ void propagate_page_fault(unsigned long tb->flags |= TBF_INTERRUPT; if ( unlikely(null_trap_bounce(v, tb)) ) { - printk("Unhandled page fault in domain %d on VCPU %d (ec=%04X)\n", + printk("d%d:v%d: unhandled page fault (ec=%04X)\n", v->domain->domain_id, v->vcpu_id, error_code); show_page_walk(addr); } + + if ( unlikely(error_code & PFEC_reserved_bit) ) + reserved_bit_page_fault(addr, guest_cpu_user_regs()); } static int handle_gdt_ldt_mapping_fault( @@ -940,7 +1008,8 @@ static int fixup_page_fault(unsigned lon { if ( paging_mode_external(d) && guest_mode(regs) ) return paging_fault(addr, regs); - if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) ) + if ( !(regs->error_code & (PFEC_user_mode | PFEC_reserved_bit)) && + (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) ) return handle_gdt_ldt_mapping_fault( addr - GDT_LDT_VIRT_START, regs); return 0; @@ -950,7 +1019,8 @@ static int fixup_page_fault(unsigned lon guest_kernel_mode(v, regs) && /* Do not check if access-protection fault since the page may legitimately be not present in shadow page tables */ - ((regs->error_code & PFEC_write_access) == PFEC_write_access) && + ((regs->error_code & (PFEC_write_access|PFEC_reserved_bit)) == + PFEC_write_access) && ptwr_do_page_fault(v, addr, regs) ) return EXCRET_fault_fixed; @@ -990,6 +1060,8 @@ asmlinkage int do_page_fault(struct cpu_ if ( likely((fixup = 
search_exception_table(regs->eip)) != 0) ) { perfc_incr(copy_user_faults); + if ( unlikely(regs->error_code & PFEC_reserved_bit) ) + reserved_bit_page_fault(addr, regs); regs->eip = fixup; return 0; } @@ -1117,7 +1189,7 @@ static int read_descriptor(unsigned int } /* Has the guest requested sufficient permission for this I/O access? */ -static inline int guest_io_okay( +static int guest_io_okay( unsigned int port, unsigned int bytes, struct vcpu *v, struct cpu_user_regs *regs) { @@ -1159,19 +1231,126 @@ static inline int guest_io_okay( } /* Has the administrator granted sufficient permission for this I/O access? */ -static inline int admin_io_okay( +static int admin_io_okay( unsigned int port, unsigned int bytes, struct vcpu *v, struct cpu_user_regs *regs) { + /* + * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses. + * We never permit direct access to that register. + */ + if ( (port == 0xcf8) && (bytes == 4) ) + return 0; + return ioports_access_permitted(v->domain, port, port + bytes - 1); } -#define guest_inb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r) -#define guest_inw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r) -#define guest_inl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r) -#define guest_outb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r) -#define guest_outw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r) -#define guest_outl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r) +static uint32_t guest_io_read( + unsigned int port, unsigned int bytes, + struct vcpu *v, struct cpu_user_regs *regs) +{ + extern uint32_t pci_conf_read( + uint32_t cf8, uint8_t offset, uint8_t bytes); + + uint32_t data = 0; + unsigned int shift = 0; + + if ( admin_io_okay(port, bytes, v, regs) ) + { + switch ( bytes ) + { + case 1: return inb(port); + case 2: return inw(port); + case 4: return inl(port); + } + } + + while ( bytes != 0 ) + { + unsigned int size = 1; + uint32_t sub_data = 0xff; + + if ( (port == 0x42) || (port == 0x43) || (port == 0x61) ) + { + sub_data = pv_pit_handler(port, 0, 0); + } + else if ( (port == 0xcf8) && (bytes == 4) ) + { + size = 4; + sub_data = v->domain->arch.pci_cf8; + } + else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) ) + { + size = min(bytes, 4 - (port & 3)); + if ( size == 3 ) + size = 2; + sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size); + } + + if ( size == 4 ) + return sub_data; + + data |= (sub_data & ((1u << (size * 8)) - 1)) << shift; + shift += size * 8; + port += size; + bytes -= size; + } + + return data; +} + +static void guest_io_write( + unsigned int port, unsigned int bytes, uint32_t data, + struct vcpu *v, struct cpu_user_regs *regs) +{ + extern void pci_conf_write( + uint32_t cf8, uint8_t offset, uint8_t bytes, uint32_t data); + + if ( admin_io_okay(port, bytes, v, regs) ) + { + switch ( bytes ) { + case 1: + outb((uint8_t)data, port); + break; + case 2: + outw((uint16_t)data, port); + break; + case 4: + outl(data, port); + break; + } + return; + } + + while ( bytes != 0 ) + { + unsigned int size = 1; + + if ( (port == 0x42) || (port == 0x43) || (port == 0x61) ) + { + pv_pit_handler(port, (uint8_t)data, 1); + } + else if ( (port == 0xcf8) && (bytes == 4) ) + { + size = 4; + v->domain->arch.pci_cf8 = data; + } + else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) ) + { + size = min(bytes, 4 - (port & 3)); + if ( size == 3 ) + size = 2; + pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data); + } + + if ( size == 4 ) + return; + + port += size; + bytes -= size; + data >>= size * 8; + } +} /* I/O emulation 
support. Helper routines for, and type of, the stack stub.*/ void host_to_guest_gpr_switch(struct cpu_user_regs *) @@ -1198,6 +1377,12 @@ unsigned long guest_to_host_gpr_switch(u # define read_sreg(regs, sr) read_segment_register(sr) #endif +static int is_cpufreq_controller(struct domain *d) +{ + return ((cpufreq_controller == FREQCTL_dom0_kernel) && + (d->domain_id == 0)); +} + static int emulate_privileged_op(struct cpu_user_regs *regs) { struct vcpu *v = current; @@ -1217,7 +1402,7 @@ static int emulate_privileged_op(struct ? (*(u32 *)®s->reg = (val)) \ : (*(u16 *)®s->reg = (val))) unsigned long code_base, code_limit; - char io_emul_stub[16]; + char io_emul_stub[32]; void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1))); u32 l, h, eax, edx; @@ -1286,7 +1471,7 @@ static int emulate_privileged_op(struct /* REX prefix. */ if ( rex & 8 ) /* REX.W */ - op_bytes = 4; /* emulating only opcodes not supporting 64-bit operands */ + op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */ modrm_reg = (rex & 4) << 1; /* REX.R */ /* REX.X does not need to be decoded. */ modrm_rm = (rex & 1) << 3; /* REX.B */ @@ -1315,7 +1500,8 @@ static int emulate_privileged_op(struct { if ( !read_descriptor(data_sel, v, regs, &data_base, &data_limit, &ar, - _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) ) + _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL| + _SEGMENT_P) ) goto fail; if ( !(ar & _SEGMENT_S) || !(ar & _SEGMENT_P) || @@ -1354,73 +1540,47 @@ static int emulate_privileged_op(struct } #endif + port = (u16)regs->edx; + continue_io_string: switch ( opcode ) { case 0x6c: /* INSB */ op_bytes = 1; case 0x6d: /* INSW/INSL */ - if ( data_limit < op_bytes - 1 || - rd_ad(edi) > data_limit - (op_bytes - 1) || - !guest_io_okay((u16)regs->edx, op_bytes, v, regs) ) + if ( (data_limit < (op_bytes - 1)) || + (rd_ad(edi) > (data_limit - (op_bytes - 1))) || + !guest_io_okay(port, op_bytes, v, regs) ) goto fail; - port = (u16)regs->edx; - switch ( op_bytes ) - { - case 1: - /* emulate PIT counter 2 */ - data = (u8)(guest_inb_okay(port, v, regs) ? inb(port) : - ((port == 0x42 || port == 0x43 || port == 0x61) ? - pv_pit_handler(port, 0, 0) : ~0)); - break; - case 2: - data = (u16)(guest_inw_okay(port, v, regs) ? inw(port) : ~0); - break; - case 4: - data = (u32)(guest_inl_okay(port, v, regs) ? inl(port) : ~0); - break; - } - if ( (rc = copy_to_user((void *)data_base + rd_ad(edi), &data, op_bytes)) != 0 ) + data = guest_io_read(port, op_bytes, v, regs); + if ( (rc = copy_to_user((void *)data_base + rd_ad(edi), + &data, op_bytes)) != 0 ) { propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc, PFEC_write_access); return EXCRET_fault_fixed; } - wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes)); + wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF) + ? 
-op_bytes : op_bytes)); break; case 0x6e: /* OUTSB */ op_bytes = 1; case 0x6f: /* OUTSW/OUTSL */ - if ( data_limit < op_bytes - 1 || - rd_ad(esi) > data_limit - (op_bytes - 1) || - !guest_io_okay((u16)regs->edx, op_bytes, v, regs) ) + if ( (data_limit < (op_bytes - 1)) || + (rd_ad(esi) > (data_limit - (op_bytes - 1))) || + !guest_io_okay(port, op_bytes, v, regs) ) goto fail; - rc = copy_from_user(&data, (void *)data_base + rd_ad(esi), op_bytes); - if ( rc != 0 ) + if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi), + op_bytes)) != 0 ) { - propagate_page_fault(data_base + rd_ad(esi) + op_bytes - rc, 0); + propagate_page_fault(data_base + rd_ad(esi) + + op_bytes - rc, 0); return EXCRET_fault_fixed; } - port = (u16)regs->edx; - switch ( op_bytes ) - { - case 1: - if ( guest_outb_okay(port, v, regs) ) - outb((u8)data, port); - else if ( port == 0x42 || port == 0x43 || port == 0x61 ) - pv_pit_handler(port, data, 1); - break; - case 2: - if ( guest_outw_okay(port, v, regs) ) - outw((u16)data, port); - break; - case 4: - if ( guest_outl_okay(port, v, regs) ) - outl((u32)data, port); - break; - } - wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes)); + guest_io_write(port, op_bytes, data, v, regs); + wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF) + ? -op_bytes : op_bytes)); break; } @@ -1468,6 +1628,9 @@ static int emulate_privileged_op(struct /* Handy function-typed pointer to the stub. */ io_emul = (void *)io_emul_stub; + if ( ioemul_handle_quirk ) + ioemul_handle_quirk(opcode, &io_emul_stub[12], regs); + /* I/O Port and Interrupt Flag instructions. */ switch ( opcode ) { @@ -1479,31 +1642,17 @@ static int emulate_privileged_op(struct exec_in: if ( !guest_io_okay(port, op_bytes, v, regs) ) goto fail; - switch ( op_bytes ) + if ( admin_io_okay(port, op_bytes, v, regs) ) { - case 1: - if ( guest_inb_okay(port, v, regs) ) - io_emul(regs); - else if ( port == 0x42 || port == 0x43 || port == 0x61 ) - { - regs->eax &= ~0xffUL; - regs->eax |= pv_pit_handler(port, 0, 0); - } - else - regs->eax |= (u8)~0; - break; - case 2: - if ( guest_inw_okay(port, v, regs) ) - io_emul(regs); - else - regs->eax |= (u16)~0; - break; - case 4: - if ( guest_inl_okay(port, v, regs) ) - io_emul(regs); + io_emul(regs); + } + else + { + if ( op_bytes == 4 ) + regs->eax = 0; else - regs->eax = (u32)~0; - break; + regs->eax &= ~((1u << (op_bytes * 8)) - 1); + regs->eax |= guest_io_read(port, op_bytes, v, regs); } goto done; @@ -1521,22 +1670,11 @@ static int emulate_privileged_op(struct exec_out: if ( !guest_io_okay(port, op_bytes, v, regs) ) goto fail; - switch ( op_bytes ) + if ( admin_io_okay(port, op_bytes, v, regs) ) + io_emul(regs); + else { - case 1: - if ( guest_outb_okay(port, v, regs) ) - io_emul(regs); - else if ( port == 0x42 || port == 0x43 || port == 0x61 ) - pv_pit_handler(port, regs->eax, 1); - break; - case 2: - if ( guest_outw_okay(port, v, regs) ) - io_emul(regs); - break; - case 4: - if ( guest_outl_okay(port, v, regs) ) - io_emul(regs); - break; + guest_io_write(port, op_bytes, regs->eax, v, regs); } goto done; @@ -1674,10 +1812,9 @@ static int emulate_privileged_op(struct break; case 4: /* Write CR4 */ - if ( *reg != (read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE)) ) - gdprintk(XENLOG_WARNING, - "Attempt to change CR4 flags %08lx -> %08lx\n", - read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE), *reg); + v->arch.guest_context.ctrlreg[4] = pv_guest_cr4_fixup(*reg); + write_cr4(pv_guest_cr4_to_real_cr4( + v->arch.guest_context.ctrlreg[4])); break; default: @@ -1698,7 +1835,7 @@ 
static int emulate_privileged_op(struct eax = regs->eax; edx = regs->edx; res = ((u64)edx << 32) | eax; - switch ( regs->ecx ) + switch ( (u32)regs->ecx ) { #ifdef CONFIG_X86_64 case MSR_FS_BASE: @@ -1723,10 +1860,49 @@ static int emulate_privileged_op(struct v->arch.guest_context.gs_base_user = res; break; #endif + case MSR_K8_FIDVID_STATUS: + case MSR_K8_FIDVID_CTL: + case MSR_K8_PSTATE_LIMIT: + case MSR_K8_PSTATE_CTRL: + case MSR_K8_PSTATE_STATUS: + case MSR_K8_PSTATE0: + case MSR_K8_PSTATE1: + case MSR_K8_PSTATE2: + case MSR_K8_PSTATE3: + case MSR_K8_PSTATE4: + case MSR_K8_PSTATE5: + case MSR_K8_PSTATE6: + case MSR_K8_PSTATE7: + case MSR_K8_HWCR: + if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ) + goto fail; + if ( !is_cpufreq_controller(v->domain) ) + break; + if ( wrmsr_safe(regs->ecx, eax, edx) != 0 ) + goto fail; + break; + case MSR_IA32_MPERF: + case MSR_IA32_APERF: + if (( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) && + ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ) ) + goto fail; + if ( !is_cpufreq_controller(v->domain) ) + break; + if ( wrmsr_safe(regs->ecx, eax, edx) != 0 ) + goto fail; + break; + case MSR_IA32_PERF_CTL: + case MSR_IA32_THERM_CONTROL: + if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) + goto fail; + if ( !is_cpufreq_controller(v->domain) ) + break; + if ( wrmsr_safe(regs->ecx, eax, edx) != 0 ) + goto fail; + break; default: if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) ) break; - if ( (rdmsr_safe(regs->ecx, l, h) != 0) || (eax != l) || (edx != h) ) gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from " @@ -1736,8 +1912,12 @@ static int emulate_privileged_op(struct } break; + case 0x31: /* RDTSC */ + rdtsc(regs->eax, regs->edx); + break; + case 0x32: /* RDMSR */ - switch ( regs->ecx ) + switch ( (u32)regs->ecx ) { #ifdef CONFIG_X86_64 case MSR_FS_BASE: @@ -1759,10 +1939,48 @@ static int emulate_privileged_op(struct regs->edx = v->arch.guest_context.gs_base_user >> 32; break; #endif + case MSR_K8_FIDVID_CTL: + case MSR_K8_FIDVID_STATUS: + case MSR_K8_PSTATE_LIMIT: + case MSR_K8_PSTATE_CTRL: + case MSR_K8_PSTATE_STATUS: + case MSR_K8_PSTATE0: + case MSR_K8_PSTATE1: + case MSR_K8_PSTATE2: + case MSR_K8_PSTATE3: + case MSR_K8_PSTATE4: + case MSR_K8_PSTATE5: + case MSR_K8_PSTATE6: + case MSR_K8_PSTATE7: + if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ) + goto fail; + if ( !is_cpufreq_controller(v->domain) ) + { + regs->eax = regs->edx = 0; + break; + } + if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) != 0 ) + goto fail; + break; case MSR_EFER: if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) ) goto fail; break; + case MSR_IA32_MISC_ENABLE: + if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) ) + goto fail; + regs->eax &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL | + MSR_IA32_MISC_ENABLE_MONITOR_ENABLE); + regs->eax |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL | + MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | + MSR_IA32_MISC_ENABLE_XTPR_DISABLE; + break; + case MSR_IA32_THERM_CONTROL: + if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) + goto fail; + if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) ) + goto fail; + break; default: if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) ) { @@ -2063,13 +2281,13 @@ void set_task_gate(unsigned int n, unsig void set_tss_desc(unsigned int n, void *addr) { _set_tssldt_desc( - gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY, + per_cpu(gdt_table, n) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY, (unsigned long)addr, offsetof(struct tss_struct, __cacheline_filler) - 1, 9); #ifdef CONFIG_COMPAT _set_tssldt_desc( - compat_gdt_table + __TSS(n) - 
FIRST_RESERVED_GDT_ENTRY, + per_cpu(compat_gdt_table, n) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY, (unsigned long)addr, offsetof(struct tss_struct, __cacheline_filler) - 1, 11); diff -Naurp xen/arch/x86/x86_32/asm-offsets.c xen-redhat/arch/x86/x86_32/asm-offsets.c --- xen/arch/x86/x86_32/asm-offsets.c +++ xen-redhat/arch/x86/x86_32/asm-offsets.c @@ -114,4 +114,7 @@ void __dummy__(void) BLANK(); DEFINE(IRQSTAT_shift, LOG_2(sizeof(irq_cpustat_t))); + BLANK(); + + OFFSET(CPUINFO_ext_features, struct cpuinfo_x86, x86_capability[1]); } diff -Naurp xen/arch/x86/x86_32/mm.c xen-redhat/arch/x86/x86_32/mm.c --- xen/arch/x86/x86_32/mm.c +++ xen-redhat/arch/x86/x86_32/mm.c @@ -191,7 +191,7 @@ void __init subarch_init_memory(void) { /* Guest kernel runs in ring 0, not ring 1. */ struct desc_struct *d; - d = &gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY]; + d = &boot_cpu_gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY]; d[0].b &= ~_SEGMENT_DPL; d[1].b &= ~_SEGMENT_DPL; } diff -Naurp xen/arch/x86/x86_32/seg_fixup.c xen-redhat/arch/x86/x86_32/seg_fixup.c --- xen/arch/x86/x86_32/seg_fixup.c +++ xen-redhat/arch/x86/x86_32/seg_fixup.c @@ -42,7 +42,7 @@ #define O OPCODE_BYTE #define M HAS_MODRM -static unsigned char insn_decode[256] = { +static const u8 insn_decode[256] = { /* 0x00 - 0x0F */ O|M, O|M, O|M, O|M, X, X, X, X, O|M, O|M, O|M, O|M, X, X, X, X, @@ -69,7 +69,7 @@ static unsigned char insn_decode[256] = X, X, X, X, X, X, X, X, /* 0x80 - 0x8F */ O|M|1, O|M|4, O|M|1, O|M|1, O|M, O|M, O|M, O|M, - O|M, O|M, O|M, O|M, O|M, O|M, O|M, X, + O|M, O|M, O|M, O|M, O|M, X|M, O|M, O|M, /* 0x90 - 0x9F */ X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, @@ -89,17 +89,28 @@ static unsigned char insn_decode[256] = X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0xF0 - 0xFF */ - X, X, X, X, X, X, X, X, + X, X, X, X, X, X, O|M, O|M, X, X, X, X, X, X, O|M, O|M }; -static unsigned char twobyte_decode[256] = { +static const u8 float_decode[64] = { + O|M, O|M, O|M, O|M, O|M, O|M, O|M, O|M, /* 0xD8 */ + O|M, X, O|M, O|M, O|M, O|M, O|M, O|M, /* 0xD9 */ + O|M, O|M, O|M, O|M, O|M, O|M, O|M, O|M, /* 0xDA */ + O|M, X, O|M, O|M, X, O|M, X, O|M, /* 0xDB */ + O|M, O|M, O|M, O|M, O|M, O|M, O|M, O|M, /* 0xDC */ + O|M, O|M, O|M, O|M, O|M, X, O|M, O|M, /* 0xDD */ + O|M, O|M, O|M, O|M, O|M, O|M, O|M, O|M, /* 0xDE */ + O|M, X, O|M, O|M, O|M, O|M, O|M, O|M, /* 0xDF */ +}; + +static const u8 twobyte_decode[256] = { /* 0x00 - 0x0F */ X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x10 - 0x1F */ X, X, X, X, X, X, X, X, - X, X, X, X, X, X, X, X, + O|M, X, X, X, X, X, X, X, /* 0x20 - 0x2F */ X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, @@ -122,16 +133,16 @@ static unsigned char twobyte_decode[256] X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x90 - 0x9F */ - X, X, X, X, X, X, X, X, - X, X, X, X, X, X, X, X, + O|M, O|M, O|M, O|M, O|M, O|M, O|M, O|M, + O|M, O|M, O|M, O|M, O|M, O|M, O|M, O|M, /* 0xA0 - 0xAF */ - X, X, X, X, X, X, X, X, - X, X, X, X, X, X, X, X, + X, X, X, O|M, O|M|1, O|M, O|M, X, + X, X, X, O|M, O|M|1, O|M, X, O|M, /* 0xB0 - 0xBF */ - X, X, X, X, X, X, X, X, - X, X, X, X, X, X, X, X, + X, X, X, O|M, X, X, O|M, O|M, + X, X, O|M|1, O|M, O|M, O|M, O|M, O|M, /* 0xC0 - 0xCF */ - X, X, X, X, X, X, X, X, + O|M, O|M, X, O|M, X, X, X, O|M, X, X, X, X, X, X, X, X, /* 0xD0 - 0xDF */ X, X, X, X, X, X, X, X, @@ -155,22 +166,22 @@ static unsigned char twobyte_decode[256] */ int get_baselimit(u16 seg, unsigned long *base, unsigned long *limit) { - struct vcpu *d = current; - unsigned long *table, a, b; - int 
ldt = !!(seg & 4); - int idx = (seg >> 3) & 8191; + struct vcpu *curr = current; + uint32_t *table, a, b; + int ldt = !!(seg & 4); + int idx = (seg >> 3) & 8191; /* Get base and check limit. */ if ( ldt ) { - table = (unsigned long *)LDT_VIRT_START(d); - if ( idx >= d->arch.guest_context.ldt_ents ) + table = (uint32_t *)LDT_VIRT_START(curr); + if ( idx >= curr->arch.guest_context.ldt_ents ) goto fail; } else /* gdt */ { - table = (unsigned long *)GDT_VIRT_START(d); - if ( idx >= d->arch.guest_context.gdt_ents ) + table = (uint32_t *)GDT_VIRT_START(curr); + if ( idx >= curr->arch.guest_context.gdt_ents ) goto fail; } @@ -221,29 +232,29 @@ int linearise_address(u16 seg, unsigned int fixup_seg(u16 seg, unsigned long offset) { - struct vcpu *d = current; - unsigned long *table, a, b, base, limit; - int ldt = !!(seg & 4); - int idx = (seg >> 3) & 8191; + struct vcpu *curr = current; + uint32_t *table, a, b, base, limit; + int ldt = !!(seg & 4); + int idx = (seg >> 3) & 8191; /* Get base and check limit. */ if ( ldt ) { - table = (unsigned long *)LDT_VIRT_START(d); - if ( idx >= d->arch.guest_context.ldt_ents ) + table = (uint32_t *)LDT_VIRT_START(curr); + if ( idx >= curr->arch.guest_context.ldt_ents ) { dprintk(XENLOG_DEBUG, "Segment %04x out of LDT range (%ld)\n", - seg, d->arch.guest_context.ldt_ents); + seg, curr->arch.guest_context.ldt_ents); goto fail; } } else /* gdt */ { - table = (unsigned long *)GDT_VIRT_START(d); - if ( idx >= d->arch.guest_context.gdt_ents ) + table = (uint32_t *)GDT_VIRT_START(curr); + if ( idx >= curr->arch.guest_context.gdt_ents ) { dprintk(XENLOG_DEBUG, "Segment %04x out of GDT range (%ld)\n", - seg, d->arch.guest_context.gdt_ents); + seg, curr->arch.guest_context.gdt_ents); goto fail; } } @@ -261,7 +272,7 @@ int fixup_seg(u16 seg, unsigned long off _SEGMENT_G|_SEGMENT_CODE|_SEGMENT_DPL)) != (_SEGMENT_P|_SEGMENT_S|_SEGMENT_DB|_SEGMENT_G|_SEGMENT_DPL) ) { - dprintk(XENLOG_DEBUG, "Bad segment %08lx:%08lx\n", a, b); + dprintk(XENLOG_DEBUG, "Bad segment %08x:%08x\n", a, b); goto fail; } @@ -291,8 +302,7 @@ int fixup_seg(u16 seg, unsigned long off } } - dprintk(XENLOG_DEBUG, "None of the above! " - "(%08lx:%08lx, %08lx, %08lx, %08lx)\n", + dprintk(XENLOG_DEBUG, "None of the above! (%08x:%08x, %08x, %08x, %08x)\n", a, b, base, limit, base+limit); fail: @@ -315,18 +325,16 @@ int fixup_seg(u16 seg, unsigned long off */ int gpf_emulate_4gb(struct cpu_user_regs *regs) { - struct vcpu *d = current; - struct trap_info *ti; - struct trap_bounce *tb; - u8 modrm, mod, reg, rm, decode; - void *memreg; - unsigned long offset; - u8 disp8; - u32 disp32 = 0; + struct vcpu *curr = current; + u8 modrm, mod, rm, decode; + const u32 *base, *index = NULL; + unsigned long offset; + s8 disp8; + s32 disp32 = 0; u8 *eip; /* ptr to instruction start */ u8 *pb, b; /* ptr into instr. / current instr. byte */ - int gs_override = 0; - int twobyte = 0; + int gs_override = 0, scale = 0, opcode = -1; + const u8 *table = insn_decode; /* WARNING: We only work for ring-3 segments. */ if ( unlikely(vm86_mode(regs)) || unlikely(!ring_3(regs)) ) @@ -357,6 +365,12 @@ int gpf_emulate_4gb(struct cpu_user_regs goto fail; } + if ( opcode != -1 ) + { + opcode = (opcode << 8) | b; + break; + } + switch ( b ) { case 0x67: /* Address-size override */ @@ -375,6 +389,30 @@ int gpf_emulate_4gb(struct cpu_user_regs case 0x65: /* GS override */ gs_override = 1; break; + case 0x0f: /* Not really a prefix byte */ + table = twobyte_decode; + opcode = b; + break; + case 0xd8: /* Math coprocessor instructions. 
*/ + case 0xd9: + case 0xda: + case 0xdb: + case 0xdc: + case 0xdd: + case 0xde: + case 0xdf: + /* Float opcodes have a secondary opcode in the modrm byte. */ + table = float_decode; + if ( get_user(modrm, pb + 1) ) + { + dprintk(XENLOG_DEBUG, "Fault while extracting modrm byte\n"); + goto page_fault; + } + + opcode = (b << 8) | modrm; + b = ((b & 7) << 3) + ((modrm >> 3) & 7); + goto done_prefix; + default: /* Not a prefix byte */ goto done_prefix; } @@ -387,47 +425,28 @@ int gpf_emulate_4gb(struct cpu_user_regs goto fail; } - decode = insn_decode[b]; /* opcode byte */ + decode = table[b]; pb++; - if ( decode == 0 && b == 0x0f ) - { - twobyte = 1; - if ( get_user(b, pb) ) - { - dprintk(XENLOG_DEBUG, - "Fault while accessing byte %ld of instruction\n", - (long)(pb-eip)); - goto page_fault; - } - - if ( (pb - eip) >= 15 ) - { - dprintk(XENLOG_DEBUG, "Too many opcode bytes for a " - "legal instruction\n"); - goto fail; - } - - decode = twobyte_decode[b]; - pb++; - } - - if ( decode == 0 ) + if ( !(decode & OPCODE_BYTE) ) { - dprintk(XENLOG_DEBUG, "Unsupported %sopcode %02x\n", - twobyte ? "two byte " : "", b); + if (opcode == -1) + dprintk(XENLOG_DEBUG, "Unsupported opcode %02x\n", b); + else + dprintk(XENLOG_DEBUG, "Unsupported opcode %02x %02x\n", + opcode >> 8, opcode & 255); goto fail; } if ( !(decode & HAS_MODRM) ) { /* Must be a <disp32>, or bail. */ - if ( (decode & 7) != 4 ) + if ( (decode & INSN_SUFFIX_BYTES) != 4 ) goto fail; if ( get_user(offset, (u32 *)pb) ) { - dprintk(XENLOG_DEBUG, "Fault while extracting <disp32>.\n"); + dprintk(XENLOG_DEBUG, "Fault while extracting <moffs32>.\n"); goto page_fault; } pb += 4; @@ -448,29 +467,39 @@ int gpf_emulate_4gb(struct cpu_user_regs pb++; mod = (modrm >> 6) & 3; - reg = (modrm >> 3) & 7; rm = (modrm >> 0) & 7; if ( rm == 4 ) { - dprintk(XENLOG_DEBUG, "FIXME: Add decoding for the SIB byte.\n"); - goto fixme; + u8 sib; + + if ( get_user(sib, pb) ) + { + dprintk(XENLOG_DEBUG, "Fault while extracting sib byte\n"); + goto page_fault; + } + + pb++; + + rm = sib & 7; + if ( (sib & 0x38) != 0x20 ) + index = decode_register((sib >> 3) & 7, regs, 0); + scale = sib >> 6; } /* Decode R/M field. */ - memreg = decode_register(rm, regs, 0); + base = decode_register(rm, regs, 0); /* Decode Mod field. */ - switch ( modrm >> 6 ) + switch ( mod ) { case 0: - disp32 = 0; if ( rm == 5 ) /* disp32 rather than (EBP) */ { - memreg = NULL; + base = NULL; if ( get_user(disp32, (u32 *)pb) ) { - dprintk(XENLOG_DEBUG, "Fault while extracting <disp8>.\n"); + dprintk(XENLOG_DEBUG, "Fault while extracting <base32>.\n"); goto page_fault; } pb += 4; @@ -484,13 +513,13 @@ int gpf_emulate_4gb(struct cpu_user_regs goto page_fault; } pb++; - disp32 = (disp8 & 0x80) ? (disp8 | ~0xff) : disp8;; + disp32 = disp8; break; case 2: if ( get_user(disp32, (u32 *)pb) ) { - dprintk(XENLOG_DEBUG, "Fault while extracting <disp8>.\n"); + dprintk(XENLOG_DEBUG, "Fault while extracting <disp32>.\n"); goto page_fault; } pb += 4; @@ -502,8 +531,10 @@ int gpf_emulate_4gb(struct cpu_user_regs } offset = disp32; - if ( memreg != NULL ) - offset += *(u32 *)memreg; + if ( base != NULL ) + offset += *base; + if ( index != NULL ) + offset += *index << scale; skip_modrm: if ( !fixup_seg((u16)regs->gs, offset) ) @@ -513,10 +544,11 @@ int gpf_emulate_4gb(struct cpu_user_regs perfc_incr(seg_fixups); /* If requested, give a callback on otherwise unused vector 15. 
*/ - if ( VM_ASSIST(d->domain, VMASST_TYPE_4gb_segments_notify) ) + if ( VM_ASSIST(curr->domain, VMASST_TYPE_4gb_segments_notify) ) { - ti = &d->arch.guest_context.trap_ctxt[15]; - tb = &d->arch.trap_bounce; + struct trap_info *ti = &curr->arch.guest_context.trap_ctxt[15]; + struct trap_bounce *tb = &curr->arch.trap_bounce; + tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE; tb->error_code = pb - eip; tb->cs = ti->cs; @@ -527,13 +559,6 @@ int gpf_emulate_4gb(struct cpu_user_regs return EXCRET_fault_fixed; - fixme: - dprintk(XENLOG_DEBUG, "Undecodable instruction " - "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x " - "caused GPF(0) at %04x:%08x\n", - eip[0], eip[1], eip[2], eip[3], - eip[4], eip[5], eip[6], eip[7], - regs->cs, regs->eip); fail: return 0; diff -Naurp xen/arch/x86/x86_32/supervisor_mode_kernel.S xen-redhat/arch/x86/x86_32/supervisor_mode_kernel.S --- xen/arch/x86/x86_32/supervisor_mode_kernel.S +++ xen-redhat/arch/x86/x86_32/supervisor_mode_kernel.S @@ -100,15 +100,10 @@ ENTRY(fixup_ring0_guest_stack) # %gs:%esi now points to the guest stack before the # interrupt/exception occured. - /* - * Reverse the __TSS macro, giving us the CPU number. - * The TSS for this cpu is at init_tss + ( cpu * 128 ). - */ - str %ecx - shrl $3,%ecx # Calculate GDT index for TSS. - subl $(FIRST_RESERVED_GDT_ENTRY+8),%ecx # %ecx = 2*cpu. - shll $6,%ecx # Each TSS entry is 0x80 bytes - addl $init_tss,%ecx # but we have 2*cpu from above. + movl $PER_CPU_GDT_ENTRY*8,%ecx + lsll %ecx,%ecx + shll $7,%ecx # Each TSS entry is 0x80 bytes + addl $init_tss,%ecx # Load Xen stack from TSS. movw TSS_ss0(%ecx),%ax diff -Naurp xen/arch/x86/x86_32/traps.c xen-redhat/arch/x86/x86_32/traps.c --- xen/arch/x86/x86_32/traps.c +++ xen-redhat/arch/x86/x86_32/traps.c @@ -136,19 +136,20 @@ void show_page_walk(unsigned long addr) unmap_domain_page(l1t); } -#define DOUBLEFAULT_STACK_SIZE 2048 -static struct tss_struct doublefault_tss; -static unsigned char doublefault_stack[DOUBLEFAULT_STACK_SIZE]; - +DEFINE_PER_CPU(struct tss_struct *, doublefault_tss); +static unsigned char __attribute__ ((__section__ (".bss.page_aligned"))) + boot_cpu_doublefault_space[PAGE_SIZE]; asmlinkage void do_double_fault(void) { - struct tss_struct *tss = &doublefault_tss; - unsigned int cpu = ((tss->back_link>>3)-__FIRST_TSS_ENTRY)>>1; + struct tss_struct *tss; + unsigned int cpu; watchdog_disable(); console_force_unlock(); + asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) ); + /* Find information saved during fault and dump it to the console. */ tss = &init_tss[cpu]; printk("*** DOUBLE FAULT ***\n"); @@ -234,34 +235,36 @@ unsigned long do_iret(void) void __init percpu_traps_init(void) { - struct tss_struct *tss = &doublefault_tss; + struct tss_struct *tss = this_cpu(doublefault_tss); asmlinkage int hypercall(void); - if ( smp_processor_id() != 0 ) - return; + if ( !tss ) + { + /* The hypercall entry vector is only accessible from ring 1. */ + _set_gate(idt_table+HYPERCALL_VECTOR, 14, 1, &hypercall); - /* The hypercall entry vector is only accessible from ring 1. */ - _set_gate(idt_table+HYPERCALL_VECTOR, 14, 1, &hypercall); + tss = (void *)boot_cpu_doublefault_space; + this_cpu(doublefault_tss) = tss; + } /* * Make a separate task for double faults. This will get us debug output if * we blow the kernel stack. 
*/ - memset(tss, 0, sizeof(*tss)); tss->ds = __HYPERVISOR_DS; tss->es = __HYPERVISOR_DS; tss->ss = __HYPERVISOR_DS; - tss->esp = (unsigned long)&doublefault_stack[DOUBLEFAULT_STACK_SIZE]; + tss->esp = (unsigned long)tss + PAGE_SIZE; tss->__cr3 = __pa(idle_pg_table); tss->cs = __HYPERVISOR_CS; tss->eip = (unsigned long)do_double_fault; tss->eflags = 2; tss->bitmap = IOBMP_INVALID_OFFSET; _set_tssldt_desc( - gdt_table + __DOUBLEFAULT_TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY, + this_cpu(gdt_table) + DOUBLEFAULT_TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY, (unsigned long)tss, 235, 9); - set_task_gate(TRAP_double_fault, __DOUBLEFAULT_TSS_ENTRY<<3); + set_task_gate(TRAP_double_fault, DOUBLEFAULT_TSS_ENTRY << 3); } void init_int80_direct_trap(struct vcpu *v) diff -Naurp xen/arch/x86/x86_64/asm-offsets.c xen-redhat/arch/x86/x86_64/asm-offsets.c --- xen/arch/x86/x86_64/asm-offsets.c +++ xen-redhat/arch/x86/x86_64/asm-offsets.c @@ -124,4 +124,7 @@ void __dummy__(void) #endif DEFINE(IRQSTAT_shift, LOG_2(sizeof(irq_cpustat_t))); + BLANK(); + + OFFSET(CPUINFO_ext_features, struct cpuinfo_x86, x86_capability[1]); } diff -Naurp xen/arch/x86/x86_64/compat/mm.c xen-redhat/arch/x86/x86_64/compat/mm.c --- xen/arch/x86/x86_64/compat/mm.c +++ xen-redhat/arch/x86/x86_64/compat/mm.c @@ -298,9 +298,8 @@ int compat_mmuext_op(XEN_GUEST_HANDLE(mm BUG_ON(left == arg1); BUG_ON(left > count); - guest_handle_add_offset(nat_ops, count - left); - BUG_ON(left + i < count); - guest_handle_add_offset(cmp_uops, (signed int)(count - left - i)); + guest_handle_add_offset(nat_ops, i - left); + guest_handle_subtract_offset(cmp_uops, left); left = 1; BUG_ON(!hypercall_xlat_continuation(&left, 0x01, nat_ops, cmp_uops)); BUG_ON(left != arg1); diff -Naurp xen/arch/x86/x86_64/Makefile xen-redhat/arch/x86/x86_64/Makefile --- xen/arch/x86/x86_64/Makefile +++ xen-redhat/arch/x86/x86_64/Makefile @@ -1,4 +1,5 @@ subdir-y += compat +subdir-y += stratus obj-y += entry.o obj-y += gpr_switch.o diff -Naurp xen/arch/x86/x86_64/mm.c xen-redhat/arch/x86/x86_64/mm.c --- xen/arch/x86/x86_64/mm.c +++ xen-redhat/arch/x86/x86_64/mm.c @@ -428,7 +428,7 @@ int check_descriptor(const struct domain unsigned int domain_clamp_alloc_bitsize(struct domain *d, unsigned int bits) { - if ( d == NULL ) + if ( (d == NULL) || (d->arch.physaddr_bitsize == 0) ) return bits; return min(d->arch.physaddr_bitsize, bits); } diff -Naurp xen/arch/x86/x86_64/stratus/host.c xen-redhat/arch/x86/x86_64/stratus/host.c --- xen/arch/x86/x86_64/stratus/host.c +++ xen-redhat/arch/x86/x86_64/stratus/host.c @@ -0,0 +1,107 @@ +//#include "cc_os_defines.h" +//#include "host.h" +#include <asm/io.h> +#include <asm/system.h> +#include <asm/fixmap.h> +#include <asm/apicdef.h> +#include <xen/spinlock.h> + +unsigned int +OS_READ_REG_UINT32( + unsigned int * Reg + ) +{ + return readl(Reg); +} + +void +OS_WRITE_REG_UINT32( + unsigned int * Reg, + unsigned int Value + ) +{ + writel(Value,Reg); +} + + +// - misc apic defines +#define DELIVERY_PENDING 0x00001000 +#define DESTINATION_MASK 0xFF000000 +#define DESTINATION_SHIFT 24 +#define DELIVERY_MODE_MASK 0x00000700 +#define DELIVER_SMI 0x00000200 +#define DELIVER_NMI 0x00000400 +#define DELIVER_INIT 0x00000500 +#define DELIVER_STARTUP 0x00000600 +#define PHYSICAL_DESTINATION 0x00000000 +#define LOGICAL_DESTINATION 0x00000800 +#define EDGE_TRIGGERED 0x00000000 +#define LEVEL_ASSERT 0x00004000 +#define INT_MASKED 0x00010000 +#define ICR_SHORTHAND_MASK 0x000C0000 +#define ICR_USE_DEST_FIELD 0x00000000 +#define ICR_SELF 0x00040000 +#define ICR_ALL_INCL_SELF 
0x00080000 +#define ICR_ALL_EXCL_SELF 0x000C0000 + + +#define APIC_REG_UINT32(Base,ByteOffset) \ + (((unsigned int *)(Base))[(ByteOffset)/sizeof(unsigned int)]) +#define LU_ID_REGISTER 0x00000020 +#define LU_INT_CMD_LOW 0x00000300 +#define LU_INT_CMD_HIGH 0x00000310 +#define LU_INT_VECTOR_1 0x00000360 + +// - used to poll until the apic is not busy +#define STALL_WHILE_APIC_BUSY(ApicBase) \ + do { \ + while (OS_READ_REG_UINT32( \ + &APIC_REG_UINT32((ApicBase),LU_INT_CMD_LOW)) & \ + DELIVERY_PENDING) \ + ; \ + } while(0) + +#define APIC_SMI_TO_PHYS_DEST (DELIVER_SMI | PHYSICAL_DESTINATION | \ + ICR_USE_DEST_FIELD | EDGE_TRIGGERED) + +#define SMI_DEST_ALL 0xffffffff +#define SMI_DEST_SELF 0xfffffffe + +// - get apic processor id +#define APIC_PROC_ID(ApicBase) \ + ((OS_READ_REG_UINT32(&APIC_REG_UINT32((ApicBase),LU_ID_REGISTER)) \ + & 0x0F000000) >> 24) + +unsigned int HostGetProcId(void) +{ + return APIC_PROC_ID(APIC_BASE); +} + +void host_request_smi(unsigned int dest) +{ + unsigned char ProcId; + unsigned long flags; + + local_irq_save(flags); + + STALL_WHILE_APIC_BUSY(APIC_BASE); + + switch (dest) { + case SMI_DEST_ALL: + OS_WRITE_REG_UINT32(&APIC_REG_UINT32(APIC_BASE,LU_INT_CMD_HIGH), + (unsigned int)(0xff << DESTINATION_SHIFT)); + break; + case SMI_DEST_SELF: + default: + ProcId = (unsigned char)(dest == SMI_DEST_SELF ? HostGetProcId() : dest); + OS_WRITE_REG_UINT32(&APIC_REG_UINT32(APIC_BASE,LU_INT_CMD_HIGH), + ProcId << DESTINATION_SHIFT); + } + + OS_WRITE_REG_UINT32(&APIC_REG_UINT32(APIC_BASE,LU_INT_CMD_LOW), + APIC_SMI_TO_PHYS_DEST); + + STALL_WHILE_APIC_BUSY(APIC_BASE); + + local_irq_restore(flags); +} diff -Naurp xen/arch/x86/x86_64/stratus/Makefile xen-redhat/arch/x86/x86_64/stratus/Makefile --- xen/arch/x86/x86_64/stratus/Makefile +++ xen-redhat/arch/x86/x86_64/stratus/Makefile @@ -0,0 +1 @@ +obj-y += stratus.o host.o diff -Naurp xen/arch/x86/x86_64/stratus/stratus.c xen-redhat/arch/x86/x86_64/stratus/stratus.c --- xen/arch/x86/x86_64/stratus/stratus.c +++ xen-redhat/arch/x86/x86_64/stratus/stratus.c @@ -0,0 +1,211 @@ +#include <xen/errno.h> +#include <xen/lib.h> +#include <xen/smp.h> +#include <xen/sched.h> +#include <xen/dmi.h> +#include <asm/uaccess.h> +#include <asm/io.h> + +#ifdef __XEN_COMPAT_H +#undef __XEN_COMPAT_H +#endif +#include <public/platform.h> + +extern void host_request_smi(unsigned int dest); + +static long cc_cr4(xenpf_stratus_call_t *cc_call) { + int rw = cc_call->u.cr4.rw; + unsigned long cr4; + + if (rw) { // Write + return -ENOSYS; + } else { // Read + asm("movq %%cr4,%0" : "=r" (cr4)); + cc_call->u.cr4.cr4 = cr4; + } + + return 0; +} + +static long cc_cpuid(xenpf_stratus_call_t *cc_call) { + cpuid( cc_call->u.cpuid.op, + &cc_call->u.cpuid.eax, + &cc_call->u.cpuid.ebx, + &cc_call->u.cpuid.ecx, + &cc_call->u.cpuid.edx ); + + return 0; +} + +static long cc_rw_msr(xenpf_stratus_call_t *cc_call) { + if (cc_call->u.msr.rw == 0) { + // Read + rdmsrl(cc_call->u.msr.msr, cc_call->u.msr.val); + } else { + wrmsrl(cc_call->u.msr.msr, cc_call->u.msr.val); + } + + return 0; +} + +static long cc_lapic_id(xenpf_stratus_call_t *cc_call) { + cc_call->u.ls.id = GET_APIC_ID(apic_read(APIC_ID)); + return 0; +} + +#define DUMP_VECTOR_PHYS (0xf00) +#define HOST_BIOS_VECTOR_PHYS (0xff0) +#define HOST_BIOS_VECTOR_SIZE (0x10) + + +static long cc_rw_hbv(xenpf_stratus_call_t *cc_call) { + int rw = cc_call->u.rw.rw; + int size = cc_call->u.rw.size; + unsigned long where = cc_call->u.rw.where; + + if (((unsigned long)where + (unsigned long)size) > + HOST_BIOS_VECTOR_SIZE) { + return 
-EFAULT; + } + + where += (unsigned long)__va(HOST_BIOS_VECTOR_PHYS); + + if (rw) { // Write + if (copy_from_user((void*)where, cc_call->u.rw.data, size)) { + return -EFAULT; + } + } else { // Read + if (copy_to_user(cc_call->u.rw.data, (void*)where, size)) { + return -EFAULT; + } + } + + return 0; +} + +static long cc_rw_dumpvec(xenpf_stratus_call_t *cc_call) { + int rw = cc_call->u.rw.rw; + int size = cc_call->u.rw.size; + void *where = __va(DUMP_VECTOR_PHYS); + + if (size > sizeof(int)) { + return -EINVAL; + } + + if (rw) { // Write + if (copy_from_user((void*)where, cc_call->u.rw.data, size)) { + return -EFAULT; + } + } else { // Read + if (copy_to_user(cc_call->u.rw.data, (void*)where, size)) { + return -EFAULT; + } + } + + return 0; +} + +static long cc_rw_region(xenpf_stratus_call_t *cc_call) { + switch (cc_call->u.rw.region) { + case RW_HBV: + return cc_rw_hbv(cc_call); + break; + case RW_DUMPVEC: + return cc_rw_dumpvec(cc_call); + break; + default: + return -EINVAL; + } +} + +static long cc_smi(xenpf_stratus_call_t *cc_call) { + host_request_smi(cc_call->u.smi.dest); + return 0; +} + +static long cc_hbv_memset(xenpf_stratus_call_t *cc_call) { + int size = cc_call->u.hbv_m.size; + + if (size > HOST_BIOS_VECTOR_SIZE) + size = HOST_BIOS_VECTOR_SIZE; + + memset(__va(HOST_BIOS_VECTOR_PHYS), cc_call->u.hbv_m.val, cc_call->u.hbv_m.size); + + return 0; +} + +static int locked_out = 1; + +static int found_stratus(struct dmi_system_id *d) +{ + printk("Stratus platform detected.\n"); + return 0; +} + +#define NO_MATCH { DMI_NONE, NULL} +#define MATCH DMI_MATCH + +static struct dmi_system_id stratus_platform[] = { + { found_stratus, "Stratus Platform", { + MATCH(DMI_BOARD_VENDOR, "Stratus"), + NO_MATCH, NO_MATCH, NO_MATCH + } }, + { NULL, NULL, } +}; + +static int check_stratus_dmi(void) { + // Run dmi scan looking for Stratus Vendor string. + if (dmi_check_system(stratus_platform)) + return 0; + + return 1; +} + +long do_stratus(xenpf_stratus_call_t *call) { + long ret = -EINVAL; + + if (!IS_PRIV(current->domain)) + return -EPERM; + + if (call->cmd == CC_VALIDATE_PLATFORM) + locked_out = check_stratus_dmi(); + + if (locked_out) + return -EPERM; + + switch (call->cmd) { + case CC_TRIGGER_SMI: + ret = cc_smi(call); + break; + case CC_HBV_MEMSET: + ret = cc_hbv_memset(call); + break; + case CC_RW_REGION: + ret = cc_rw_region(call); + break; + case CC_LAPIC_ID: + ret = cc_lapic_id(call); + break; + case CC_CR4: + ret = cc_cr4(call); + break; + case CC_CPUID: + ret = cc_cpuid(call); + break; + case CC_RW_MSR: + ret = cc_rw_msr(call); + break; + case CC_VALIDATE_PLATFORM: + ret = 0; // If we made it here, we are on a Stratus box. + break; + default: + printk("%s:line %d, unknown command %d\n", __func__, + __LINE__, call->cmd); + break; + } + + call->ret = ret; + + return ret; +} + diff -Naurp xen/arch/x86/x86_64/traps.c xen-redhat/arch/x86/x86_64/traps.c --- xen/arch/x86/x86_64/traps.c +++ xen-redhat/arch/x86/x86_64/traps.c @@ -147,15 +147,14 @@ void show_page_walk(unsigned long addr) asmlinkage void double_fault(void); asmlinkage void do_double_fault(struct cpu_user_regs *regs) { - unsigned int cpu, tr; - - asm ( "str %0" : "=r" (tr) ); - cpu = ((tr >> 3) - __FIRST_TSS_ENTRY) >> 2; + unsigned int cpu; watchdog_disable(); console_force_unlock(); + asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) ); + /* Find information saved during fault and dump it to the console. 
*/ printk("*** DOUBLE FAULT ***\n"); print_xen_info(); diff -Naurp xen/arch/x86/x86_emulate.c xen-redhat/arch/x86/x86_emulate.c --- xen/arch/x86/x86_emulate.c +++ xen-redhat/arch/x86/x86_emulate.c @@ -103,8 +103,8 @@ static uint8_t opcode_table[256] = { ImplicitOps, ImplicitOps, DstReg|SrcMem|ModRM, DstReg|SrcMem16|ModRM|Mov, 0, 0, 0, 0, /* 0x68 - 0x6F */ - ImplicitOps|Mov, DstMem|SrcImm|ModRM|Mov, - ImplicitOps|Mov, DstMem|SrcImmByte|ModRM|Mov, + ImplicitOps|Mov, DstReg|SrcImm|ModRM|Mov, + ImplicitOps|Mov, DstReg|SrcImmByte|ModRM|Mov, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, /* 0x70 - 0x77 */ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, @@ -1207,34 +1207,37 @@ x86_emulate( case 0x69: /* imul imm16/32 */ case 0x6b: /* imul imm8 */ { - unsigned long reg = *(long *)decode_register(modrm_reg, &_regs, 0); + unsigned long src1; /* ModR/M source operand */ + if ( ea.type == OP_REG ) + src1 = *ea.reg; + else if ( (rc = ops->read(ea.mem.seg, ea.mem.off, + &src1, op_bytes, ctxt)) ) + goto done; _regs.eflags &= ~(EFLG_OF|EFLG_CF); switch ( dst.bytes ) { case 2: dst.val = ((uint32_t)(int16_t)src.val * - (uint32_t)(int16_t)reg); + (uint32_t)(int16_t)src1); if ( (int16_t)dst.val != (uint32_t)dst.val ) _regs.eflags |= EFLG_OF|EFLG_CF; break; #ifdef __x86_64__ case 4: dst.val = ((uint64_t)(int32_t)src.val * - (uint64_t)(int32_t)reg); + (uint64_t)(int32_t)src1); if ( (int32_t)dst.val != dst.val ) _regs.eflags |= EFLG_OF|EFLG_CF; break; #endif default: { - unsigned long m[2] = { src.val, reg }; + unsigned long m[2] = { src.val, src1 }; if ( imul_dbl(m) ) _regs.eflags |= EFLG_OF|EFLG_CF; dst.val = m[0]; break; } } - dst.type = OP_REG; - dst.reg = decode_register(modrm_reg, &_regs, 0); break; } @@ -1863,7 +1866,7 @@ x86_emulate( break; case 0x9e: /* sahf */ - *(uint8_t *)_regs.eflags = (((uint8_t *)&_regs.eax)[1] & 0xd7) | 0x02; + *(uint8_t *)&_regs.eflags = (((uint8_t *)&_regs.eax)[1] & 0xd7) | 0x02; break; case 0x9f: /* lahf */ diff -Naurp xen/arch/x86/x86_emulate.c.orig xen-redhat/arch/x86/x86_emulate.c.orig --- xen/arch/x86/x86_emulate.c.orig +++ xen-redhat/arch/x86/x86_emulate.c.orig @@ -0,0 +1,2428 @@ +/****************************************************************************** + * x86_emulate.c + * + * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. + * + * Copyright (c) 2005-2007 Keir Fraser + * Copyright (c) 2005-2007 XenSource Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef __XEN__ +#include <stddef.h> +#include <stdint.h> +#include <public/xen.h> +#else +#include <xen/config.h> +#include <xen/types.h> +#include <xen/lib.h> +#include <asm/regs.h> +#undef cmpxchg +#endif +#include <asm-x86/x86_emulate.h> + +/* Operand sizes: 8-bit operands or specified/overridden size. */ +#define ByteOp (1<<0) /* 8-bit operands. */ +/* Destination operand type. 
*/ +#define DstBitBase (0<<1) /* Memory operand, bit string. */ +#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ +#define DstReg (2<<1) /* Register operand. */ +#define DstMem (3<<1) /* Memory operand. */ +#define DstMask (3<<1) +/* Source operand type. */ +#define SrcNone (0<<3) /* No source operand. */ +#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ +#define SrcReg (1<<3) /* Register operand. */ +#define SrcMem (2<<3) /* Memory operand. */ +#define SrcMem16 (3<<3) /* Memory operand (16-bit). */ +#define SrcImm (4<<3) /* Immediate operand. */ +#define SrcImmByte (5<<3) /* 8-bit sign-extended immediate operand. */ +#define SrcMask (7<<3) +/* Generic ModRM decode. */ +#define ModRM (1<<6) +/* Destination is only written; never read. */ +#define Mov (1<<7) + +static uint8_t opcode_table[256] = { + /* 0x00 - 0x07 */ + ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, + ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM, + ByteOp|DstReg|SrcImm, DstReg|SrcImm, 0, 0, + /* 0x08 - 0x0F */ + ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, + ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM, + ByteOp|DstReg|SrcImm, DstReg|SrcImm, 0, 0, + /* 0x10 - 0x17 */ + ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, + ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM, + ByteOp|DstReg|SrcImm, DstReg|SrcImm, 0, 0, + /* 0x18 - 0x1F */ + ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, + ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM, + ByteOp|DstReg|SrcImm, DstReg|SrcImm, 0, 0, + /* 0x20 - 0x27 */ + ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, + ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM, + ByteOp|DstReg|SrcImm, DstReg|SrcImm, 0, ImplicitOps, + /* 0x28 - 0x2F */ + ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, + ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM, + ByteOp|DstReg|SrcImm, DstReg|SrcImm, 0, ImplicitOps, + /* 0x30 - 0x37 */ + ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, + ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM, + ByteOp|DstReg|SrcImm, DstReg|SrcImm, 0, ImplicitOps, + /* 0x38 - 0x3F */ + ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, + ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM, + ByteOp|DstReg|SrcImm, DstReg|SrcImm, 0, ImplicitOps, + /* 0x40 - 0x4F */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0x50 - 0x5F */ + ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, + ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, + ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, + ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, + /* 0x60 - 0x67 */ + ImplicitOps, ImplicitOps, DstReg|SrcMem|ModRM, DstReg|SrcMem16|ModRM|Mov, + 0, 0, 0, 0, + /* 0x68 - 0x6F */ + ImplicitOps|Mov, DstMem|SrcImm|ModRM|Mov, + ImplicitOps|Mov, DstMem|SrcImmByte|ModRM|Mov, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0x70 - 0x77 */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0x78 - 0x7F */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0x80 - 0x87 */ + ByteOp|DstMem|SrcImm|ModRM, DstMem|SrcImm|ModRM, + ByteOp|DstMem|SrcImm|ModRM, DstMem|SrcImmByte|ModRM, + ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, + ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, + /* 0x88 - 0x8F */ + 
ByteOp|DstMem|SrcReg|ModRM|Mov, DstMem|SrcReg|ModRM|Mov, + ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, + 0, DstReg|SrcNone|ModRM, 0, DstMem|SrcNone|ModRM|Mov, + /* 0x90 - 0x97 */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0x98 - 0x9F */ + ImplicitOps, ImplicitOps, 0, 0, 0, 0, ImplicitOps, ImplicitOps, + /* 0xA0 - 0xA7 */ + ByteOp|ImplicitOps|Mov, ImplicitOps|Mov, + ByteOp|ImplicitOps|Mov, ImplicitOps|Mov, + ByteOp|ImplicitOps|Mov, ImplicitOps|Mov, 0, 0, + /* 0xA8 - 0xAF */ + ByteOp|DstReg|SrcImm, DstReg|SrcImm, + ByteOp|ImplicitOps|Mov, ImplicitOps|Mov, + ByteOp|ImplicitOps|Mov, ImplicitOps|Mov, 0, 0, + /* 0xB0 - 0xB7 */ + ByteOp|DstReg|SrcImm|Mov, ByteOp|DstReg|SrcImm|Mov, + ByteOp|DstReg|SrcImm|Mov, ByteOp|DstReg|SrcImm|Mov, + ByteOp|DstReg|SrcImm|Mov, ByteOp|DstReg|SrcImm|Mov, + ByteOp|DstReg|SrcImm|Mov, ByteOp|DstReg|SrcImm|Mov, + /* 0xB8 - 0xBF */ + DstReg|SrcImm|Mov, DstReg|SrcImm|Mov, DstReg|SrcImm|Mov, DstReg|SrcImm|Mov, + DstReg|SrcImm|Mov, DstReg|SrcImm|Mov, DstReg|SrcImm|Mov, DstReg|SrcImm|Mov, + /* 0xC0 - 0xC7 */ + ByteOp|DstMem|SrcImm|ModRM, DstMem|SrcImmByte|ModRM, + ImplicitOps, ImplicitOps, + 0, 0, ByteOp|DstMem|SrcImm|ModRM|Mov, DstMem|SrcImm|ModRM|Mov, + /* 0xC8 - 0xCF */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xD0 - 0xD7 */ + ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM, + ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0xD8 - 0xDF */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE0 - 0xE7 */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0xE8 - 0xEF */ + ImplicitOps, ImplicitOps, 0, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0xF0 - 0xF7 */ + 0, 0, 0, 0, + 0, ImplicitOps, ByteOp|DstMem|SrcNone|ModRM, DstMem|SrcNone|ModRM, + /* 0xF8 - 0xFF */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ByteOp|DstMem|SrcNone|ModRM, DstMem|SrcNone|ModRM +}; + +static uint8_t twobyte_table[256] = { + /* 0x00 - 0x07 */ + 0, 0, 0, 0, 0, ImplicitOps, 0, 0, + /* 0x08 - 0x0F */ + ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps|ModRM, 0, 0, + /* 0x10 - 0x17 */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x18 - 0x1F */ + ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, + ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, + /* 0x20 - 0x27 */ + ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, + 0, 0, 0, 0, + /* 0x28 - 0x2F */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x30 - 0x37 */ + ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, + /* 0x38 - 0x3F */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x40 - 0x47 */ + DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, + DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, + DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, + DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, + /* 0x48 - 0x4F */ + DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, + DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, + DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, + DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, + /* 0x50 - 0x5F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x60 - 0x6F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x70 - 0x7F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x80 - 0x87 */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0x88 - 0x8F */ + 
ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0x90 - 0x97 */ + ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov, + ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov, + ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov, + ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov, + /* 0x98 - 0x9F */ + ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov, + ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov, + ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov, + ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov, + /* 0xA0 - 0xA7 */ + 0, 0, 0, DstBitBase|SrcReg|ModRM, 0, 0, 0, 0, + /* 0xA8 - 0xAF */ + 0, 0, 0, DstBitBase|SrcReg|ModRM, 0, 0, 0, DstReg|SrcMem|ModRM, + /* 0xB0 - 0xB7 */ + ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, + 0, DstBitBase|SrcReg|ModRM, + 0, 0, ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem16|ModRM|Mov, + /* 0xB8 - 0xBF */ + 0, 0, DstBitBase|SrcImmByte|ModRM, DstBitBase|SrcReg|ModRM, + DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM, + ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem16|ModRM|Mov, + /* 0xC0 - 0xC7 */ + ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, 0, + 0, 0, 0, ImplicitOps|ModRM, + /* 0xC8 - 0xCF */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0xD0 - 0xDF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE0 - 0xEF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xF0 - 0xFF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* Type, address-of, and value of an instruction's operand. */ +struct operand { + enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; + unsigned int bytes; + unsigned long val, orig_val; + union { + /* OP_REG: Pointer to register field. */ + unsigned long *reg; + /* OP_MEM: Segment and offset. */ + struct { + enum x86_segment seg; + unsigned long off; + } mem; + }; +}; + +/* EFLAGS bit definitions. */ +#define EFLG_OF (1<<11) +#define EFLG_DF (1<<10) +#define EFLG_IF (1<<9) +#define EFLG_SF (1<<7) +#define EFLG_ZF (1<<6) +#define EFLG_AF (1<<4) +#define EFLG_PF (1<<2) +#define EFLG_CF (1<<0) + +/* Exception definitions. */ +#define EXC_DE 0 +#define EXC_BR 5 +#define EXC_UD 6 +#define EXC_GP 13 + +/* + * Instruction emulation: + * Most instructions are emulated directly via a fragment of inline assembly + * code. This allows us to save/restore EFLAGS and thus very easily pick up + * any modified flags. + */ + +#if defined(__x86_64__) +#define _LO32 "k" /* force 32-bit operand */ +#define _STK "%%rsp" /* stack pointer */ +#elif defined(__i386__) +#define _LO32 "" /* force 32-bit operand */ +#define _STK "%%esp" /* stack pointer */ +#endif + +/* + * These EFLAGS bits are restored from saved value during emulation, and + * any changes are written back to the saved value after emulation. + */ +#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) + +/* Before executing instruction: restore necessary bits in EFLAGS. 
*/ +#define _PRE_EFLAGS(_sav, _msk, _tmp) \ +/* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \ +"movl %"_sav",%"_LO32 _tmp"; " \ +"push %"_tmp"; " \ +"push %"_tmp"; " \ +"movl %"_msk",%"_LO32 _tmp"; " \ +"andl %"_LO32 _tmp",("_STK"); " \ +"pushf; " \ +"notl %"_LO32 _tmp"; " \ +"andl %"_LO32 _tmp",("_STK"); " \ +"andl %"_LO32 _tmp","STR(BITS_PER_LONG/4)"("_STK"); " \ +"pop %"_tmp"; " \ +"orl %"_LO32 _tmp",("_STK"); " \ +"popf; " \ +"pop %"_sav"; " + +/* After executing instruction: write-back necessary bits in EFLAGS. */ +#define _POST_EFLAGS(_sav, _msk, _tmp) \ +/* _sav |= EFLAGS & _msk; */ \ +"pushf; " \ +"pop %"_tmp"; " \ +"andl %"_msk",%"_LO32 _tmp"; " \ +"orl %"_LO32 _tmp",%"_sav"; " + +/* Raw emulation: instruction has two explicit operands. */ +#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy)\ +do{ unsigned long _tmp; \ + switch ( (_dst).bytes ) \ + { \ + case 2: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","4","2") \ + _op"w %"_wx"3,%1; " \ + _POST_EFLAGS("0","4","2") \ + : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ + : _wy ((_src).val), "i" (EFLAGS_MASK), \ + "m" (_eflags), "m" ((_dst).val) ); \ + break; \ + case 4: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","4","2") \ + _op"l %"_lx"3,%1; " \ + _POST_EFLAGS("0","4","2") \ + : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ + : _ly ((_src).val), "i" (EFLAGS_MASK), \ + "m" (_eflags), "m" ((_dst).val) ); \ + break; \ + case 8: \ + __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy); \ + break; \ + } \ +} while (0) +#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy)\ +do{ unsigned long _tmp; \ + switch ( (_dst).bytes ) \ + { \ + case 1: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","4","2") \ + _op"b %"_bx"3,%1; " \ + _POST_EFLAGS("0","4","2") \ + : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ + : _by ((_src).val), "i" (EFLAGS_MASK), \ + "m" (_eflags), "m" ((_dst).val) ); \ + break; \ + default: \ + __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy);\ + break; \ + } \ +} while (0) +/* Source operand is byte-sized and may be restricted to just %cl. */ +#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ + __emulate_2op(_op, _src, _dst, _eflags, \ + "b", "c", "b", "c", "b", "c", "b", "c") +/* Source operand is byte, word, long or quad sized. */ +#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ + __emulate_2op(_op, _src, _dst, _eflags, \ + "b", "q", "w", "r", _LO32, "r", "", "r") +/* Source operand is word, long or quad sized. */ +#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ + __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ + "w", "r", _LO32, "r", "", "r") + +/* Instruction has only one explicit operand (no source operand). 
*/ +#define emulate_1op(_op,_dst,_eflags) \ +do{ unsigned long _tmp; \ + switch ( (_dst).bytes ) \ + { \ + case 1: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","3","2") \ + _op"b %1; " \ + _POST_EFLAGS("0","3","2") \ + : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ + : "i" (EFLAGS_MASK), "m" (_eflags), "m" ((_dst).val) ); \ + break; \ + case 2: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","3","2") \ + _op"w %1; " \ + _POST_EFLAGS("0","3","2") \ + : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ + : "i" (EFLAGS_MASK), "m" (_eflags), "m" ((_dst).val) ); \ + break; \ + case 4: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","3","2") \ + _op"l %1; " \ + _POST_EFLAGS("0","3","2") \ + : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ + : "i" (EFLAGS_MASK), "m" (_eflags), "m" ((_dst).val) ); \ + break; \ + case 8: \ + __emulate_1op_8byte(_op, _dst, _eflags); \ + break; \ + } \ +} while (0) + +/* Emulate an instruction with quadword operands (x86/64 only). */ +#if defined(__x86_64__) +#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ +do{ __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","4","2") \ + _op"q %"_qx"3,%1; " \ + _POST_EFLAGS("0","4","2") \ + : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ + : _qy ((_src).val), "i" (EFLAGS_MASK), \ + "m" (_eflags), "m" ((_dst).val) ); \ +} while (0) +#define __emulate_1op_8byte(_op, _dst, _eflags) \ +do{ __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","3","2") \ + _op"q %1; " \ + _POST_EFLAGS("0","3","2") \ + : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ + : "i" (EFLAGS_MASK), "m" (_eflags), "m" ((_dst).val) ); \ +} while (0) +#elif defined(__i386__) +#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) +#define __emulate_1op_8byte(_op, _dst, _eflags) +#endif /* __i386__ */ + +/* Fetch next part of the instruction being emulated. */ +#define insn_fetch_bytes(_size) \ +({ unsigned long _x, _eip = _regs.eip; \ + if ( !mode_64bit() ) _eip = (uint32_t)_eip; /* ignore upper dword */ \ + _regs.eip += (_size); /* real hardware doesn't truncate */ \ + generate_exception_if((uint8_t)(_regs.eip - ctxt->regs->eip) > 15, \ + EXC_GP); \ + rc = ops->insn_fetch(x86_seg_cs, _eip, &_x, (_size), ctxt); \ + if ( rc ) goto done; \ + _x; \ +}) +#define insn_fetch_type(_type) ((_type)insn_fetch_bytes(sizeof(_type))) + +#define _truncate_ea(ea, byte_width) \ +({ unsigned long __ea = (ea); \ + unsigned int _width = (byte_width); \ + ((_width == sizeof(unsigned long)) ? __ea : \ + (__ea & ((1UL << (_width << 3)) - 1))); \ +}) +#define truncate_ea(ea) _truncate_ea((ea), ad_bytes) + +#define mode_64bit() (def_ad_bytes == 8) + +#define fail_if(p) \ +do { \ + rc = (p) ? X86EMUL_UNHANDLEABLE : X86EMUL_OKAY; \ + if ( rc ) goto done; \ +} while (0) + +/* In future we will be able to generate arbitrary exceptions. */ +#define generate_exception_if(p, e) fail_if(p) + +/* To be done... */ +#define mode_ring0() (0) +#define mode_iopl() (0) + +/* Given byte has even parity (even number of 1s)? */ +static int even_parity(uint8_t v) +{ + __asm__ ( "test %%al,%%al; setp %%al" + : "=a" (v) : "0" (v) ); + return v; +} + +/* Update address held in a register, based on addressing mode. 
*/ +#define _register_address_increment(reg, inc, byte_width) \ +do { \ + int _inc = (inc); /* signed type ensures sign extension to long */ \ + unsigned int _width = (byte_width); \ + if ( _width == sizeof(unsigned long) ) \ + (reg) += _inc; \ + else if ( mode_64bit() ) \ + (reg) = ((reg) + _inc) & ((1UL << (_width << 3)) - 1); \ + else \ + (reg) = ((reg) & ~((1UL << (_width << 3)) - 1)) | \ + (((reg) + _inc) & ((1UL << (_width << 3)) - 1)); \ +} while (0) +#define register_address_increment(reg, inc) \ + _register_address_increment((reg), (inc), ad_bytes) + +#define sp_pre_dec(dec) ({ \ + _register_address_increment(_regs.esp, -(dec), ctxt->sp_size/8); \ + _truncate_ea(_regs.esp, ctxt->sp_size/8); \ +}) +#define sp_post_inc(inc) ({ \ + unsigned long __esp = _truncate_ea(_regs.esp, ctxt->sp_size/8); \ + _register_address_increment(_regs.esp, (inc), ctxt->sp_size/8); \ + __esp; \ +}) + +#define jmp_rel(rel) \ +do { \ + _regs.eip += (int)(rel); \ + if ( !mode_64bit() ) \ + _regs.eip = ((op_bytes == 2) \ + ? (uint16_t)_regs.eip : (uint32_t)_regs.eip); \ +} while (0) + +static int __handle_rep_prefix( + struct cpu_user_regs *int_regs, + struct cpu_user_regs *ext_regs, + int ad_bytes) +{ + unsigned long ecx = ((ad_bytes == 2) ? (uint16_t)int_regs->ecx : + (ad_bytes == 4) ? (uint32_t)int_regs->ecx : + int_regs->ecx); + + if ( ecx-- == 0 ) + { + ext_regs->eip = int_regs->eip; + return 1; + } + + if ( ad_bytes == 2 ) + *(uint16_t *)&int_regs->ecx = ecx; + else if ( ad_bytes == 4 ) + int_regs->ecx = (uint32_t)ecx; + else + int_regs->ecx = ecx; + int_regs->eip = ext_regs->eip; + return 0; +} + +#define handle_rep_prefix() \ +do { \ + if ( rep_prefix && __handle_rep_prefix(&_regs, ctxt->regs, ad_bytes) ) \ + goto done; \ +} while (0) + +/* + * Unsigned multiplication with double-word result. + * IN: Multiplicand=m[0], Multiplier=m[1] + * OUT: Return CF/OF (overflow status); Result=m[1]:m[0] + */ +static int mul_dbl(unsigned long m[2]) +{ + int rc; + asm ( "mul %4; seto %b2" + : "=a" (m[0]), "=d" (m[1]), "=q" (rc) + : "0" (m[0]), "1" (m[1]), "2" (0) ); + return rc; +} + +/* + * Signed multiplication with double-word result. + * IN: Multiplicand=m[0], Multiplier=m[1] + * OUT: Return CF/OF (overflow status); Result=m[1]:m[0] + */ +static int imul_dbl(unsigned long m[2]) +{ + int rc; + asm ( "imul %4; seto %b2" + : "=a" (m[0]), "=d" (m[1]), "=q" (rc) + : "0" (m[0]), "1" (m[1]), "2" (0) ); + return rc; +} + +/* + * Unsigned division of double-word dividend. + * IN: Dividend=u[1]:u[0], Divisor=v + * OUT: Return 1: #DE + * Return 0: Quotient=u[0], Remainder=u[1] + */ +static int div_dbl(unsigned long u[2], unsigned long v) +{ + if ( (v == 0) || (u[1] >= v) ) + return 1; + asm ( "div %4" + : "=a" (u[0]), "=d" (u[1]) + : "0" (u[0]), "1" (u[1]), "r" (v) ); + return 0; +} + +/* + * Signed division of double-word dividend. + * IN: Dividend=u[1]:u[0], Divisor=v + * OUT: Return 1: #DE + * Return 0: Quotient=u[0], Remainder=u[1] + * NB. We don't use idiv directly as it's moderately hard to work out + * ahead of time whether it will #DE, which we cannot allow to happen. + */ +static int idiv_dbl(unsigned long u[2], unsigned long v) +{ + int negu = (long)u[1] < 0, negv = (long)v < 0; + + /* u = abs(u) */ + if ( negu ) + { + u[1] = ~u[1]; + if ( (u[0] = -u[0]) == 0 ) + u[1]++; + } + + /* abs(u) / abs(v) */ + if ( div_dbl(u, negv ? -v : v) ) + return 1; + + /* Remainder has same sign as dividend. It cannot overflow. */ + if ( negu ) + u[1] = -u[1]; + + /* Quotient is overflowed if sign bit is set. 
*/ + if ( negu ^ negv ) + { + if ( (long)u[0] >= 0 ) + u[0] = -u[0]; + else if ( (u[0] << 1) != 0 ) /* == 0x80...0 is okay */ + return 1; + } + else if ( (long)u[0] < 0 ) + return 1; + + return 0; +} + +static int +test_cc( + unsigned int condition, unsigned int flags) +{ + int rc = 0; + + switch ( (condition & 15) >> 1 ) + { + case 0: /* o */ + rc |= (flags & EFLG_OF); + break; + case 1: /* b/c/nae */ + rc |= (flags & EFLG_CF); + break; + case 2: /* z/e */ + rc |= (flags & EFLG_ZF); + break; + case 3: /* be/na */ + rc |= (flags & (EFLG_CF|EFLG_ZF)); + break; + case 4: /* s */ + rc |= (flags & EFLG_SF); + break; + case 5: /* p/pe */ + rc |= (flags & EFLG_PF); + break; + case 7: /* le/ng */ + rc |= (flags & EFLG_ZF); + /* fall through */ + case 6: /* l/nge */ + rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); + break; + } + + /* Odd condition identifiers (lsb == 1) have inverted sense. */ + return (!!rc ^ (condition & 1)); +} + +void * +decode_register( + uint8_t modrm_reg, struct cpu_user_regs *regs, int highbyte_regs) +{ + void *p; + + switch ( modrm_reg ) + { + case 0: p = &regs->eax; break; + case 1: p = &regs->ecx; break; + case 2: p = &regs->edx; break; + case 3: p = &regs->ebx; break; + case 4: p = (highbyte_regs ? + ((unsigned char *)&regs->eax + 1) : + (unsigned char *)&regs->esp); break; + case 5: p = (highbyte_regs ? + ((unsigned char *)&regs->ecx + 1) : + (unsigned char *)&regs->ebp); break; + case 6: p = (highbyte_regs ? + ((unsigned char *)&regs->edx + 1) : + (unsigned char *)&regs->esi); break; + case 7: p = (highbyte_regs ? + ((unsigned char *)&regs->ebx + 1) : + (unsigned char *)&regs->edi); break; +#if defined(__x86_64__) + case 8: p = &regs->r8; break; + case 9: p = &regs->r9; break; + case 10: p = &regs->r10; break; + case 11: p = &regs->r11; break; + case 12: p = &regs->r12; break; + case 13: p = &regs->r13; break; + case 14: p = &regs->r14; break; + case 15: p = &regs->r15; break; +#endif + default: p = NULL; break; + } + + return p; +} + +int +x86_emulate( + struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + /* Shadow copy of register state. Committed on successful emulation. */ + struct cpu_user_regs _regs = *ctxt->regs; + + uint8_t b, d, sib, sib_index, sib_base, twobyte = 0, rex_prefix = 0; + uint8_t modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0; + unsigned int op_bytes, def_op_bytes, ad_bytes, def_ad_bytes; + unsigned int lock_prefix = 0, rep_prefix = 0; + int override_seg = -1, rc = X86EMUL_OKAY; + struct operand src, dst; + + /* Data operand effective address (usually computed from ModRM). */ + struct operand ea; + + /* Default is a memory operand relative to segment DS. */ + ea.type = OP_MEM; + ea.mem.seg = x86_seg_ds; + ea.mem.off = 0; + + op_bytes = def_op_bytes = ad_bytes = def_ad_bytes = ctxt->addr_size/8; + if ( op_bytes == 8 ) + { + op_bytes = def_op_bytes = 4; +#ifndef __x86_64__ + return X86EMUL_UNHANDLEABLE; +#endif + } + + /* Prefix bytes. */ + for ( ; ; ) + { + switch ( b = insn_fetch_type(uint8_t) ) + { + case 0x66: /* operand-size override */ + op_bytes = def_op_bytes ^ 6; + break; + case 0x67: /* address-size override */ + ad_bytes = def_ad_bytes ^ (mode_64bit() ? 
12 : 6); + break; + case 0x2e: /* CS override */ + override_seg = x86_seg_cs; + break; + case 0x3e: /* DS override */ + override_seg = x86_seg_ds; + break; + case 0x26: /* ES override */ + override_seg = x86_seg_es; + break; + case 0x64: /* FS override */ + override_seg = x86_seg_fs; + break; + case 0x65: /* GS override */ + override_seg = x86_seg_gs; + break; + case 0x36: /* SS override */ + override_seg = x86_seg_ss; + break; + case 0xf0: /* LOCK */ + lock_prefix = 1; + break; + case 0xf2: /* REPNE/REPNZ */ + case 0xf3: /* REP/REPE/REPZ */ + rep_prefix = 1; + break; + case 0x40 ... 0x4f: /* REX */ + if ( !mode_64bit() ) + goto done_prefixes; + rex_prefix = b; + continue; + default: + goto done_prefixes; + } + + /* Any legacy prefix after a REX prefix nullifies its effect. */ + rex_prefix = 0; + } + done_prefixes: + + if ( rex_prefix & 8 ) /* REX.W */ + op_bytes = 8; + + /* Opcode byte(s). */ + d = opcode_table[b]; + if ( d == 0 ) + { + /* Two-byte opcode? */ + if ( b == 0x0f ) + { + twobyte = 1; + b = insn_fetch_type(uint8_t); + d = twobyte_table[b]; + } + + /* Unrecognised? */ + if ( d == 0 ) + goto cannot_emulate; + } + + /* Lock prefix is allowed only on RMW instructions. */ + generate_exception_if((d & Mov) && lock_prefix, EXC_GP); + + /* ModRM and SIB bytes. */ + if ( d & ModRM ) + { + modrm = insn_fetch_type(uint8_t); + modrm_mod = (modrm & 0xc0) >> 6; + modrm_reg = ((rex_prefix & 4) << 1) | ((modrm & 0x38) >> 3); + modrm_rm = modrm & 0x07; + + if ( modrm_mod == 3 ) + { + modrm_rm |= (rex_prefix & 1) << 3; + ea.type = OP_REG; + ea.reg = decode_register( + modrm_rm, &_regs, (d & ByteOp) && (rex_prefix == 0)); + } + else if ( ad_bytes == 2 ) + { + /* 16-bit ModR/M decode. */ + switch ( modrm_rm ) + { + case 0: + ea.mem.off = _regs.ebx + _regs.esi; + break; + case 1: + ea.mem.off = _regs.ebx + _regs.edi; + break; + case 2: + ea.mem.seg = x86_seg_ss; + ea.mem.off = _regs.ebp + _regs.esi; + break; + case 3: + ea.mem.seg = x86_seg_ss; + ea.mem.off = _regs.ebp + _regs.edi; + break; + case 4: + ea.mem.off = _regs.esi; + break; + case 5: + ea.mem.off = _regs.edi; + break; + case 6: + if ( modrm_mod == 0 ) + break; + ea.mem.seg = x86_seg_ss; + ea.mem.off = _regs.ebp; + break; + case 7: + ea.mem.off = _regs.ebx; + break; + } + switch ( modrm_mod ) + { + case 0: + if ( modrm_rm == 6 ) + ea.mem.off = insn_fetch_type(int16_t); + break; + case 1: + ea.mem.off += insn_fetch_type(int8_t); + break; + case 2: + ea.mem.off += insn_fetch_type(int16_t); + break; + } + ea.mem.off = truncate_ea(ea.mem.off); + } + else + { + /* 32/64-bit ModR/M decode. */ + if ( modrm_rm == 4 ) + { + sib = insn_fetch_type(uint8_t); + sib_index = ((sib >> 3) & 7) | ((rex_prefix << 2) & 8); + sib_base = (sib & 7) | ((rex_prefix << 3) & 8); + if ( sib_index != 4 ) + ea.mem.off = *(long*)decode_register(sib_index, &_regs, 0); + ea.mem.off <<= (sib >> 6) & 3; + if ( (modrm_mod == 0) && ((sib_base & 7) == 5) ) + ea.mem.off += insn_fetch_type(int32_t); + else if ( sib_base == 4 ) + { + ea.mem.seg = x86_seg_ss; + ea.mem.off += _regs.esp; + if ( !twobyte && (b == 0x8f) ) + /* POP <rm> computes its EA post increment. */ + ea.mem.off += ((mode_64bit() && (op_bytes == 4)) + ? 
8 : op_bytes); + } + else if ( sib_base == 5 ) + { + ea.mem.seg = x86_seg_ss; + ea.mem.off += _regs.ebp; + } + else + ea.mem.off += *(long*)decode_register(sib_base, &_regs, 0); + } + else + { + modrm_rm |= (rex_prefix & 1) << 3; + ea.mem.off = *(long *)decode_register(modrm_rm, &_regs, 0); + if ( (modrm_rm == 5) && (modrm_mod != 0) ) + ea.mem.seg = x86_seg_ss; + } + switch ( modrm_mod ) + { + case 0: + if ( (modrm_rm & 7) != 5 ) + break; + ea.mem.off = insn_fetch_type(int32_t); + if ( !mode_64bit() ) + break; + /* Relative to RIP of next instruction. Argh! */ + ea.mem.off += _regs.eip; + if ( (d & SrcMask) == SrcImm ) + ea.mem.off += (d & ByteOp) ? 1 : + ((op_bytes == 8) ? 4 : op_bytes); + else if ( (d & SrcMask) == SrcImmByte ) + ea.mem.off += 1; + else if ( ((b == 0xf6) || (b == 0xf7)) && + ((modrm_reg & 7) <= 1) ) + /* Special case in Grp3: test has immediate operand. */ + ea.mem.off += (d & ByteOp) ? 1 + : ((op_bytes == 8) ? 4 : op_bytes); + break; + case 1: + ea.mem.off += insn_fetch_type(int8_t); + break; + case 2: + ea.mem.off += insn_fetch_type(int32_t); + break; + } + ea.mem.off = truncate_ea(ea.mem.off); + } + } + + if ( override_seg != -1 ) + ea.mem.seg = override_seg; + + /* Special instructions do their own operand decoding. */ + if ( (d & DstMask) == ImplicitOps ) + goto special_insn; + + /* Decode and fetch the source operand: register, memory or immediate. */ + switch ( d & SrcMask ) + { + case SrcNone: + break; + case SrcReg: + src.type = OP_REG; + if ( d & ByteOp ) + { + src.reg = decode_register(modrm_reg, &_regs, (rex_prefix == 0)); + src.val = *(uint8_t *)src.reg; + src.bytes = 1; + } + else + { + src.reg = decode_register(modrm_reg, &_regs, 0); + switch ( (src.bytes = op_bytes) ) + { + case 2: src.val = *(uint16_t *)src.reg; break; + case 4: src.val = *(uint32_t *)src.reg; break; + case 8: src.val = *(uint64_t *)src.reg; break; + } + } + break; + case SrcMem16: + ea.bytes = 2; + goto srcmem_common; + case SrcMem: + ea.bytes = (d & ByteOp) ? 1 : op_bytes; + srcmem_common: + src = ea; + if ( src.type == OP_REG ) + { + switch ( src.bytes ) + { + case 1: src.val = *(uint8_t *)src.reg; break; + case 2: src.val = *(uint16_t *)src.reg; break; + case 4: src.val = *(uint32_t *)src.reg; break; + case 8: src.val = *(uint64_t *)src.reg; break; + } + } + else if ( (rc = ops->read(src.mem.seg, src.mem.off, + &src.val, src.bytes, ctxt)) ) + goto done; + break; + case SrcImm: + src.type = OP_IMM; + src.bytes = (d & ByteOp) ? 1 : op_bytes; + if ( src.bytes == 8 ) src.bytes = 4; + /* NB. Immediates are sign-extended as necessary. */ + switch ( src.bytes ) + { + case 1: src.val = insn_fetch_type(int8_t); break; + case 2: src.val = insn_fetch_type(int16_t); break; + case 4: src.val = insn_fetch_type(int32_t); break; + } + break; + case SrcImmByte: + src.type = OP_IMM; + src.bytes = 1; + src.val = insn_fetch_type(int8_t); + break; + } + + /* Decode and fetch the destination operand: register or memory. 
*/ + switch ( d & DstMask ) + { + case DstReg: + dst.type = OP_REG; + if ( d & ByteOp ) + { + dst.reg = decode_register(modrm_reg, &_regs, (rex_prefix == 0)); + dst.val = *(uint8_t *)dst.reg; + dst.bytes = 1; + } + else + { + dst.reg = decode_register(modrm_reg, &_regs, 0); + switch ( (dst.bytes = op_bytes) ) + { + case 2: dst.val = *(uint16_t *)dst.reg; break; + case 4: dst.val = *(uint32_t *)dst.reg; break; + case 8: dst.val = *(uint64_t *)dst.reg; break; + } + } + break; + case DstBitBase: + if ( ((d & SrcMask) == SrcImmByte) || (ea.type == OP_REG) ) + { + src.val &= (op_bytes << 3) - 1; + } + else + { + /* + * EA += BitOffset DIV op_bytes*8 + * BitOffset = BitOffset MOD op_bytes*8 + * DIV truncates towards negative infinity. + * MOD always produces a positive result. + */ + if ( op_bytes == 2 ) + src.val = (int16_t)src.val; + else if ( op_bytes == 4 ) + src.val = (int32_t)src.val; + if ( (long)src.val < 0 ) + { + unsigned long byte_offset; + byte_offset = op_bytes + (((-src.val-1) >> 3) & ~(op_bytes-1)); + ea.mem.off -= byte_offset; + src.val = (byte_offset << 3) + src.val; + } + else + { + ea.mem.off += (src.val >> 3) & ~(op_bytes - 1); + src.val &= (op_bytes << 3) - 1; + } + } + /* Becomes a normal DstMem operation from here on. */ + d = (d & ~DstMask) | DstMem; + case DstMem: + ea.bytes = (d & ByteOp) ? 1 : op_bytes; + dst = ea; + if ( dst.type == OP_REG ) + { + switch ( dst.bytes ) + { + case 1: dst.val = *(uint8_t *)dst.reg; break; + case 2: dst.val = *(uint16_t *)dst.reg; break; + case 4: dst.val = *(uint32_t *)dst.reg; break; + case 8: dst.val = *(uint64_t *)dst.reg; break; + } + } + else if ( !(d & Mov) ) /* optimisation - avoid slow emulated read */ + { + if ( (rc = ops->read(dst.mem.seg, dst.mem.off, + &dst.val, dst.bytes, ctxt)) ) + goto done; + dst.orig_val = dst.val; + } + break; + } + + /* LOCK prefix allowed only on instructions with memory destination. */ + generate_exception_if(lock_prefix && (dst.type != OP_MEM), EXC_GP); + + if ( twobyte ) + goto twobyte_insn; + + switch ( b ) + { + case 0x04 ... 0x05: /* add imm,%%eax */ + dst.reg = (unsigned long *)&_regs.eax; + dst.val = _regs.eax; + case 0x00 ... 0x03: add: /* add */ + emulate_2op_SrcV("add", src, dst, _regs.eflags); + break; + + case 0x0c ... 0x0d: /* or imm,%%eax */ + dst.reg = (unsigned long *)&_regs.eax; + dst.val = _regs.eax; + case 0x08 ... 0x0b: or: /* or */ + emulate_2op_SrcV("or", src, dst, _regs.eflags); + break; + + case 0x14 ... 0x15: /* adc imm,%%eax */ + dst.reg = (unsigned long *)&_regs.eax; + dst.val = _regs.eax; + case 0x10 ... 0x13: adc: /* adc */ + emulate_2op_SrcV("adc", src, dst, _regs.eflags); + break; + + case 0x1c ... 0x1d: /* sbb imm,%%eax */ + dst.reg = (unsigned long *)&_regs.eax; + dst.val = _regs.eax; + case 0x18 ... 0x1b: sbb: /* sbb */ + emulate_2op_SrcV("sbb", src, dst, _regs.eflags); + break; + + case 0x24 ... 0x25: /* and imm,%%eax */ + dst.reg = (unsigned long *)&_regs.eax; + dst.val = _regs.eax; + case 0x20 ... 0x23: and: /* and */ + emulate_2op_SrcV("and", src, dst, _regs.eflags); + break; + + case 0x2c ... 0x2d: /* sub imm,%%eax */ + dst.reg = (unsigned long *)&_regs.eax; + dst.val = _regs.eax; + case 0x28 ... 0x2b: sub: /* sub */ + emulate_2op_SrcV("sub", src, dst, _regs.eflags); + break; + + case 0x34 ... 0x35: /* xor imm,%%eax */ + dst.reg = (unsigned long *)&_regs.eax; + dst.val = _regs.eax; + case 0x30 ... 0x33: xor: /* xor */ + emulate_2op_SrcV("xor", src, dst, _regs.eflags); + break; + + case 0x3c ... 
0x3d: /* cmp imm,%%eax */ + dst.reg = (unsigned long *)&_regs.eax; + dst.val = _regs.eax; + case 0x38 ... 0x3b: cmp: /* cmp */ + emulate_2op_SrcV("cmp", src, dst, _regs.eflags); + break; + + case 0x62: /* bound */ { + unsigned long src_val2; + int lb, ub, idx; + generate_exception_if(mode_64bit() || (src.type != OP_MEM), EXC_UD); + if ( (rc = ops->read(src.mem.seg, src.mem.off + op_bytes, + &src_val2, op_bytes, ctxt)) ) + goto done; + ub = (op_bytes == 2) ? (int16_t)src_val2 : (int32_t)src_val2; + lb = (op_bytes == 2) ? (int16_t)src.val : (int32_t)src.val; + idx = (op_bytes == 2) ? (int16_t)dst.val : (int32_t)dst.val; + generate_exception_if((idx < lb) || (idx > ub), EXC_BR); + dst.type = OP_NONE; + break; + } + + case 0x63: /* movsxd (x86/64) / arpl (x86/32) */ + if ( mode_64bit() ) + { + /* movsxd */ + if ( src.type == OP_REG ) + src.val = *(int32_t *)src.reg; + else if ( (rc = ops->read(src.mem.seg, src.mem.off, + &src.val, 4, ctxt)) ) + goto done; + dst.val = (int32_t)src.val; + } + else + { + /* arpl */ + uint16_t src_val = dst.val; + dst = src; + _regs.eflags &= ~EFLG_ZF; + _regs.eflags |= ((src_val & 3) > (dst.val & 3)) ? EFLG_ZF : 0; + if ( _regs.eflags & EFLG_ZF ) + dst.val = (dst.val & ~3) | (src_val & 3); + else + dst.type = OP_NONE; + } + break; + + case 0x69: /* imul imm16/32 */ + case 0x6b: /* imul imm8 */ { + unsigned long reg = *(long *)decode_register(modrm_reg, &_regs, 0); + _regs.eflags &= ~(EFLG_OF|EFLG_CF); + switch ( dst.bytes ) + { + case 2: + dst.val = ((uint32_t)(int16_t)src.val * + (uint32_t)(int16_t)reg); + if ( (int16_t)dst.val != (uint32_t)dst.val ) + _regs.eflags |= EFLG_OF|EFLG_CF; + break; +#ifdef __x86_64__ + case 4: + dst.val = ((uint64_t)(int32_t)src.val * + (uint64_t)(int32_t)reg); + if ( (int32_t)dst.val != dst.val ) + _regs.eflags |= EFLG_OF|EFLG_CF; + break; +#endif + default: { + unsigned long m[2] = { src.val, reg }; + if ( imul_dbl(m) ) + _regs.eflags |= EFLG_OF|EFLG_CF; + dst.val = m[0]; + break; + } + } + dst.type = OP_REG; + dst.reg = decode_register(modrm_reg, &_regs, 0); + break; + } + + case 0x82: /* Grp1 (x86/32 only) */ + generate_exception_if(mode_64bit(), EXC_UD); + case 0x80: case 0x81: case 0x83: /* Grp1 */ + switch ( modrm_reg & 7 ) + { + case 0: goto add; + case 1: goto or; + case 2: goto adc; + case 3: goto sbb; + case 4: goto and; + case 5: goto sub; + case 6: goto xor; + case 7: goto cmp; + } + break; + + case 0xa8 ... 0xa9: /* test imm,%%eax */ + dst.reg = (unsigned long *)&_regs.eax; + dst.val = _regs.eax; + case 0x84 ... 0x85: test: /* test */ + emulate_2op_SrcV("test", src, dst, _regs.eflags); + break; + + case 0x86 ... 0x87: xchg: /* xchg */ + /* Write back the register source. */ + switch ( dst.bytes ) + { + case 1: *(uint8_t *)src.reg = (uint8_t)dst.val; break; + case 2: *(uint16_t *)src.reg = (uint16_t)dst.val; break; + case 4: *src.reg = (uint32_t)dst.val; break; /* 64b reg: zero-extend */ + case 8: *src.reg = dst.val; break; + } + /* Write back the memory destination with implicit LOCK prefix. */ + dst.val = src.val; + lock_prefix = 1; + break; + + case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ + generate_exception_if((modrm_reg & 7) != 0, EXC_UD); + case 0x88 ... 0x8b: /* mov */ + dst.val = src.val; + break; + + case 0x8d: /* lea */ + dst.val = ea.mem.off; + break; + + case 0x8f: /* pop (sole member of Grp1a) */ + generate_exception_if((modrm_reg & 7) != 0, EXC_UD); + /* 64-bit mode: POP defaults to a 64-bit operand. 
*/ + if ( mode_64bit() && (dst.bytes == 4) ) + dst.bytes = 8; + if ( (rc = ops->read(x86_seg_ss, sp_post_inc(dst.bytes), + &dst.val, dst.bytes, ctxt)) != 0 ) + goto done; + break; + + case 0xb0 ... 0xb7: /* mov imm8,r8 */ + dst.reg = decode_register( + (b & 7) | ((rex_prefix & 1) << 3), &_regs, (rex_prefix == 0)); + dst.val = src.val; + break; + + case 0xb8 ... 0xbf: /* mov imm{16,32,64},r{16,32,64} */ + if ( dst.bytes == 8 ) /* Fetch more bytes to obtain imm64 */ + src.val = ((uint32_t)src.val | + ((uint64_t)insn_fetch_type(uint32_t) << 32)); + dst.reg = decode_register( + (b & 7) | ((rex_prefix & 1) << 3), &_regs, 0); + dst.val = src.val; + break; + + case 0xc0 ... 0xc1: grp2: /* Grp2 */ + switch ( modrm_reg & 7 ) + { + case 0: /* rol */ + emulate_2op_SrcB("rol", src, dst, _regs.eflags); + break; + case 1: /* ror */ + emulate_2op_SrcB("ror", src, dst, _regs.eflags); + break; + case 2: /* rcl */ + emulate_2op_SrcB("rcl", src, dst, _regs.eflags); + break; + case 3: /* rcr */ + emulate_2op_SrcB("rcr", src, dst, _regs.eflags); + break; + case 4: /* sal/shl */ + case 6: /* sal/shl */ + emulate_2op_SrcB("sal", src, dst, _regs.eflags); + break; + case 5: /* shr */ + emulate_2op_SrcB("shr", src, dst, _regs.eflags); + break; + case 7: /* sar */ + emulate_2op_SrcB("sar", src, dst, _regs.eflags); + break; + } + break; + + case 0xd0 ... 0xd1: /* Grp2 */ + src.val = 1; + goto grp2; + + case 0xd2 ... 0xd3: /* Grp2 */ + src.val = _regs.ecx; + goto grp2; + + case 0xf6 ... 0xf7: /* Grp3 */ + switch ( modrm_reg & 7 ) + { + case 0 ... 1: /* test */ + /* Special case in Grp3: test has an immediate source operand. */ + src.type = OP_IMM; + src.bytes = (d & ByteOp) ? 1 : op_bytes; + if ( src.bytes == 8 ) src.bytes = 4; + switch ( src.bytes ) + { + case 1: src.val = insn_fetch_type(int8_t); break; + case 2: src.val = insn_fetch_type(int16_t); break; + case 4: src.val = insn_fetch_type(int32_t); break; + } + goto test; + case 2: /* not */ + dst.val = ~dst.val; + break; + case 3: /* neg */ + emulate_1op("neg", dst, _regs.eflags); + break; + case 4: /* mul */ + src = dst; + dst.type = OP_REG; + dst.reg = (unsigned long *)&_regs.eax; + dst.val = *dst.reg; + _regs.eflags &= ~(EFLG_OF|EFLG_CF); + switch ( src.bytes ) + { + case 1: + dst.val *= src.val; + if ( (uint8_t)dst.val != (uint16_t)dst.val ) + _regs.eflags |= EFLG_OF|EFLG_CF; + break; + case 2: + dst.val *= src.val; + if ( (uint16_t)dst.val != (uint32_t)dst.val ) + _regs.eflags |= EFLG_OF|EFLG_CF; + *(uint16_t *)&_regs.edx = dst.val >> 16; + break; +#ifdef __x86_64__ + case 4: + dst.val *= src.val; + if ( (uint32_t)dst.val != dst.val ) + _regs.eflags |= EFLG_OF|EFLG_CF; + _regs.edx = (uint32_t)(dst.val >> 32); + break; +#endif + default: { + unsigned long m[2] = { src.val, dst.val }; + if ( mul_dbl(m) ) + _regs.eflags |= EFLG_OF|EFLG_CF; + _regs.edx = m[1]; + dst.val = m[0]; + break; + } + } + break; + case 5: /* imul */ + src = dst; + dst.type = OP_REG; + dst.reg = (unsigned long *)&_regs.eax; + dst.val = *dst.reg; + _regs.eflags &= ~(EFLG_OF|EFLG_CF); + switch ( src.bytes ) + { + case 1: + dst.val = ((uint16_t)(int8_t)src.val * + (uint16_t)(int8_t)dst.val); + if ( (int8_t)dst.val != (uint16_t)dst.val ) + _regs.eflags |= EFLG_OF|EFLG_CF; + break; + case 2: + dst.val = ((uint32_t)(int16_t)src.val * + (uint32_t)(int16_t)dst.val); + if ( (int16_t)dst.val != (uint32_t)dst.val ) + _regs.eflags |= EFLG_OF|EFLG_CF; + *(uint16_t *)&_regs.edx = dst.val >> 16; + break; +#ifdef __x86_64__ + case 4: + dst.val = ((uint64_t)(int32_t)src.val * + 
(uint64_t)(int32_t)dst.val); + if ( (int32_t)dst.val != dst.val ) + _regs.eflags |= EFLG_OF|EFLG_CF; + _regs.edx = (uint32_t)(dst.val >> 32); + break; +#endif + default: { + unsigned long m[2] = { src.val, dst.val }; + if ( imul_dbl(m) ) + _regs.eflags |= EFLG_OF|EFLG_CF; + _regs.edx = m[1]; + dst.val = m[0]; + break; + } + } + break; + case 6: /* div */ { + unsigned long u[2], v; + src = dst; + dst.type = OP_REG; + dst.reg = (unsigned long *)&_regs.eax; + switch ( src.bytes ) + { + case 1: + u[0] = (uint16_t)_regs.eax; + u[1] = 0; + v = (uint8_t)src.val; + generate_exception_if( + div_dbl(u, v) || ((uint8_t)u[0] != (uint16_t)u[0]), + EXC_DE); + dst.val = (uint8_t)u[0]; + ((uint8_t *)&_regs.eax)[1] = u[1]; + break; + case 2: + u[0] = ((uint32_t)_regs.edx << 16) | (uint16_t)_regs.eax; + u[1] = 0; + v = (uint16_t)src.val; + generate_exception_if( + div_dbl(u, v) || ((uint16_t)u[0] != (uint32_t)u[0]), + EXC_DE); + dst.val = (uint16_t)u[0]; + *(uint16_t *)&_regs.edx = u[1]; + break; +#ifdef __x86_64__ + case 4: + u[0] = (_regs.edx << 32) | (uint32_t)_regs.eax; + u[1] = 0; + v = (uint32_t)src.val; + generate_exception_if( + div_dbl(u, v) || ((uint32_t)u[0] != u[0]), + EXC_DE); + dst.val = (uint32_t)u[0]; + _regs.edx = (uint32_t)u[1]; + break; +#endif + default: + u[0] = _regs.eax; + u[1] = _regs.edx; + v = src.val; + generate_exception_if(div_dbl(u, v), EXC_DE); + dst.val = u[0]; + _regs.edx = u[1]; + break; + } + break; + } + case 7: /* idiv */ { + unsigned long u[2], v; + src = dst; + dst.type = OP_REG; + dst.reg = (unsigned long *)&_regs.eax; + switch ( src.bytes ) + { + case 1: + u[0] = (int16_t)_regs.eax; + u[1] = ((long)u[0] < 0) ? ~0UL : 0UL; + v = (int8_t)src.val; + generate_exception_if( + idiv_dbl(u, v) || ((int8_t)u[0] != (int16_t)u[0]), + EXC_DE); + dst.val = (int8_t)u[0]; + ((int8_t *)&_regs.eax)[1] = u[1]; + break; + case 2: + u[0] = (int32_t)((_regs.edx << 16) | (uint16_t)_regs.eax); + u[1] = ((long)u[0] < 0) ? ~0UL : 0UL; + v = (int16_t)src.val; + generate_exception_if( + idiv_dbl(u, v) || ((int16_t)u[0] != (int32_t)u[0]), + EXC_DE); + dst.val = (int16_t)u[0]; + *(int16_t *)&_regs.edx = u[1]; + break; +#ifdef __x86_64__ + case 4: + u[0] = (_regs.edx << 32) | (uint32_t)_regs.eax; + u[1] = ((long)u[0] < 0) ? ~0UL : 0UL; + v = (int32_t)src.val; + generate_exception_if( + idiv_dbl(u, v) || ((int32_t)u[0] != u[0]), + EXC_DE); + dst.val = (int32_t)u[0]; + _regs.edx = (uint32_t)u[1]; + break; +#endif + default: + u[0] = _regs.eax; + u[1] = _regs.edx; + v = src.val; + generate_exception_if(idiv_dbl(u, v), EXC_DE); + dst.val = u[0]; + _regs.edx = u[1]; + break; + } + break; + } + default: + goto cannot_emulate; + } + break; + + case 0xfe: /* Grp4 */ + generate_exception_if((modrm_reg & 7) >= 2, EXC_UD); + case 0xff: /* Grp5 */ + switch ( modrm_reg & 7 ) + { + case 0: /* inc */ + emulate_1op("inc", dst, _regs.eflags); + break; + case 1: /* dec */ + emulate_1op("dec", dst, _regs.eflags); + break; + case 2: /* call (near) */ + case 4: /* jmp (near) */ + if ( ((op_bytes = dst.bytes) != 8) && mode_64bit() ) + { + dst.bytes = op_bytes = 8; + if ( dst.type == OP_REG ) + dst.val = *dst.reg; + else if ( (rc = ops->read(dst.mem.seg, dst.mem.off, + &dst.val, 8, ctxt)) != 0 ) + goto done; + } + src.val = _regs.eip; + _regs.eip = dst.val; + if ( (modrm_reg & 7) == 2 ) + goto push; /* call */ + break; + case 6: /* push */ + /* 64-bit mode: PUSH defaults to a 64-bit operand. 
*/ + if ( mode_64bit() && (dst.bytes == 4) ) + { + dst.bytes = 8; + if ( dst.type == OP_REG ) + dst.val = *dst.reg; + else if ( (rc = ops->read(dst.mem.seg, dst.mem.off, + &dst.val, 8, ctxt)) != 0 ) + goto done; + } + if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes), + dst.val, dst.bytes, ctxt)) != 0 ) + goto done; + dst.type = OP_NONE; + break; + case 7: + generate_exception_if(1, EXC_UD); + default: + goto cannot_emulate; + } + break; + } + + writeback: + switch ( dst.type ) + { + case OP_REG: + /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ + switch ( dst.bytes ) + { + case 1: *(uint8_t *)dst.reg = (uint8_t)dst.val; break; + case 2: *(uint16_t *)dst.reg = (uint16_t)dst.val; break; + case 4: *dst.reg = (uint32_t)dst.val; break; /* 64b: zero-ext */ + case 8: *dst.reg = dst.val; break; + } + break; + case OP_MEM: + if ( !(d & Mov) && (dst.orig_val == dst.val) ) + /* nothing to do */; + else if ( lock_prefix ) + rc = ops->cmpxchg( + dst.mem.seg, dst.mem.off, dst.orig_val, + dst.val, dst.bytes, ctxt); + else + rc = ops->write( + dst.mem.seg, dst.mem.off, dst.val, dst.bytes, ctxt); + if ( rc != 0 ) + goto done; + default: + break; + } + + /* Commit shadow register state. */ + _regs.eflags &= ~EF_RF; + *ctxt->regs = _regs; + + done: + return rc; + + special_insn: + dst.type = OP_NONE; + + /* + * The only implicit-operands instructions allowed a LOCK prefix are + * CMPXCHG{8,16}B, MOV CRn, MOV DRn. + */ + generate_exception_if(lock_prefix && + ((b < 0x20) || (b > 0x23)) && /* MOV CRn/DRn */ + (b != 0xc7), /* CMPXCHG{8,16}B */ + EXC_GP); + + if ( twobyte ) + goto twobyte_special_insn; + + switch ( b ) + { + case 0x27: /* daa */ { + uint8_t al = _regs.eax; + unsigned long eflags = _regs.eflags; + generate_exception_if(mode_64bit(), EXC_UD); + _regs.eflags &= ~(EFLG_CF|EFLG_AF); + if ( ((al & 0x0f) > 9) || (eflags & EFLG_AF) ) + { + *(uint8_t *)&_regs.eax += 6; + _regs.eflags |= EFLG_AF; + } + if ( (al > 0x99) || (eflags & EFLG_CF) ) + { + *(uint8_t *)&_regs.eax += 0x60; + _regs.eflags |= EFLG_CF; + } + _regs.eflags &= ~(EFLG_SF|EFLG_ZF|EFLG_PF); + _regs.eflags |= ((uint8_t)_regs.eax == 0) ? EFLG_ZF : 0; + _regs.eflags |= (( int8_t)_regs.eax < 0) ? EFLG_SF : 0; + _regs.eflags |= even_parity(_regs.eax) ? EFLG_PF : 0; + break; + } + + case 0x2f: /* das */ { + uint8_t al = _regs.eax; + unsigned long eflags = _regs.eflags; + generate_exception_if(mode_64bit(), EXC_UD); + _regs.eflags &= ~(EFLG_CF|EFLG_AF); + if ( ((al & 0x0f) > 9) || (eflags & EFLG_AF) ) + { + _regs.eflags |= EFLG_AF; + if ( (al < 6) || (eflags & EFLG_CF) ) + _regs.eflags |= EFLG_CF; + *(uint8_t *)&_regs.eax -= 6; + } + if ( (al > 0x99) || (eflags & EFLG_CF) ) + { + *(uint8_t *)&_regs.eax -= 0x60; + _regs.eflags |= EFLG_CF; + } + _regs.eflags &= ~(EFLG_SF|EFLG_ZF|EFLG_PF); + _regs.eflags |= ((uint8_t)_regs.eax == 0) ? EFLG_ZF : 0; + _regs.eflags |= (( int8_t)_regs.eax < 0) ? EFLG_SF : 0; + _regs.eflags |= even_parity(_regs.eax) ? EFLG_PF : 0; + break; + } + + case 0x37: /* aaa */ + case 0x3f: /* aas */ + generate_exception_if(mode_64bit(), EXC_UD); + _regs.eflags &= ~EFLG_CF; + if ( ((uint8_t)_regs.eax > 9) || (_regs.eflags & EFLG_AF) ) + { + ((uint8_t *)&_regs.eax)[0] += (b == 0x37) ? 6 : -6; + ((uint8_t *)&_regs.eax)[1] += (b == 0x37) ? 1 : -1; + _regs.eflags |= EFLG_CF | EFLG_AF; + } + ((uint8_t *)&_regs.eax)[0] &= 0x0f; + break; + + case 0x40 ... 
0x4f: /* inc/dec reg */ + dst.type = OP_REG; + dst.reg = decode_register(b & 7, &_regs, 0); + dst.bytes = op_bytes; + dst.val = *dst.reg; + if ( b & 8 ) + emulate_1op("dec", dst, _regs.eflags); + else + emulate_1op("inc", dst, _regs.eflags); + break; + + case 0x50 ... 0x57: /* push reg */ + src.val = *(unsigned long *)decode_register( + (b & 7) | ((rex_prefix & 1) << 3), &_regs, 0); + goto push; + + case 0x58 ... 0x5f: /* pop reg */ + dst.type = OP_REG; + dst.reg = decode_register( + (b & 7) | ((rex_prefix & 1) << 3), &_regs, 0); + dst.bytes = op_bytes; + if ( mode_64bit() && (dst.bytes == 4) ) + dst.bytes = 8; + if ( (rc = ops->read(x86_seg_ss, sp_post_inc(dst.bytes), + &dst.val, dst.bytes, ctxt)) != 0 ) + goto done; + break; + + case 0x60: /* pusha */ { + int i; + unsigned long regs[] = { + _regs.eax, _regs.ecx, _regs.edx, _regs.ebx, + _regs.esp, _regs.ebp, _regs.esi, _regs.edi }; + generate_exception_if(mode_64bit(), EXC_UD); + for ( i = 0; i < 8; i++ ) + if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes), + regs[i], op_bytes, ctxt)) != 0 ) + goto done; + break; + } + + case 0x61: /* popa */ { + int i; + unsigned long dummy_esp, *regs[] = { + (unsigned long *)&_regs.edi, (unsigned long *)&_regs.esi, + (unsigned long *)&_regs.ebp, (unsigned long *)&dummy_esp, + (unsigned long *)&_regs.ebx, (unsigned long *)&_regs.edx, + (unsigned long *)&_regs.ecx, (unsigned long *)&_regs.eax }; + generate_exception_if(mode_64bit(), EXC_UD); + for ( i = 0; i < 8; i++ ) + if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes), + regs[i], op_bytes, ctxt)) != 0 ) + goto done; + break; + } + + case 0x68: /* push imm{16,32,64} */ + src.val = ((op_bytes == 2) + ? (int32_t)insn_fetch_type(int16_t) + : insn_fetch_type(int32_t)); + goto push; + + case 0x6a: /* push imm8 */ + src.val = insn_fetch_type(int8_t); + push: + d |= Mov; /* force writeback */ + dst.type = OP_MEM; + dst.bytes = op_bytes; + if ( mode_64bit() && (dst.bytes == 4) ) + dst.bytes = 8; + dst.val = src.val; + dst.mem.seg = x86_seg_ss; + dst.mem.off = sp_pre_dec(dst.bytes); + break; + + case 0x6c ... 0x6d: /* ins %dx,%es:%edi */ + handle_rep_prefix(); + generate_exception_if(!mode_iopl(), EXC_GP); + dst.type = OP_MEM; + dst.bytes = !(b & 1) ? 1 : (op_bytes == 8) ? 4 : op_bytes; + dst.mem.seg = x86_seg_es; + dst.mem.off = truncate_ea(_regs.edi); + fail_if(ops->read_io == NULL); + if ( (rc = ops->read_io((uint16_t)_regs.edx, dst.bytes, + &dst.val, ctxt)) != 0 ) + goto done; + register_address_increment( + _regs.edi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes); + break; + + case 0x6e ... 0x6f: /* outs %esi,%dx */ + handle_rep_prefix(); + generate_exception_if(!mode_iopl(), EXC_GP); + dst.bytes = !(b & 1) ? 1 : (op_bytes == 8) ? 4 : op_bytes; + if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi), + &dst.val, dst.bytes, ctxt)) != 0 ) + goto done; + fail_if(ops->write_io == NULL); + if ( (rc = ops->write_io((uint16_t)_regs.edx, dst.bytes, + dst.val, ctxt)) != 0 ) + goto done; + register_address_increment( + _regs.esi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes); + break; + + case 0x70 ... 0x7f: /* jcc (short) */ { + int rel = insn_fetch_type(int8_t); + if ( test_cc(b, _regs.eflags) ) + jmp_rel(rel); + break; + } + + case 0x90: /* nop / xchg %%r8,%%rax */ + if ( !(rex_prefix & 1) ) + break; /* nop */ + + case 0x91 ... 
0x97: /* xchg reg,%%rax */
+ src.type = dst.type = OP_REG;
+ src.bytes = dst.bytes = op_bytes;
+ src.reg = (unsigned long *)&_regs.eax;
+ src.val = *src.reg;
+ dst.reg = decode_register(
+ (b & 7) | ((rex_prefix & 1) << 3), &_regs, 0);
+ dst.val = *dst.reg;
+ goto xchg;
+
+ case 0x98: /* cbw/cwde/cdqe */
+ switch ( op_bytes )
+ {
+ case 2: *(int16_t *)&_regs.eax = (int8_t)_regs.eax; break; /* cbw */
+ case 4: _regs.eax = (uint32_t)(int16_t)_regs.eax; break; /* cwde */
+ case 8: _regs.eax = (int32_t)_regs.eax; break; /* cdqe */
+ }
+ break;
+
+ case 0x99: /* cwd/cdq/cqo */
+ switch ( op_bytes )
+ {
+ case 2:
+ *(int16_t *)&_regs.edx = ((int16_t)_regs.eax < 0) ? -1 : 0;
+ break;
+ case 4:
+ _regs.edx = (uint32_t)(((int32_t)_regs.eax < 0) ? -1 : 0);
+ break;
+ case 8:
+ _regs.edx = (_regs.eax < 0) ? -1 : 0;
+ break;
+ }
+ break;
+
+ case 0x9e: /* sahf */
+ *(uint8_t *)&_regs.eflags = (((uint8_t *)&_regs.eax)[1] & 0xd7) | 0x02;
+ break;
+
+ case 0x9f: /* lahf */
+ ((uint8_t *)&_regs.eax)[1] = (_regs.eflags & 0xd7) | 0x02;
+ break;
+
+ case 0xa0 ... 0xa1: /* mov mem.offs,{%al,%ax,%eax,%rax} */
+ /* Source EA is not encoded via ModRM. */
+ dst.type = OP_REG;
+ dst.reg = (unsigned long *)&_regs.eax;
+ dst.bytes = (d & ByteOp) ? 1 : op_bytes;
+ if ( (rc = ops->read(ea.mem.seg, insn_fetch_bytes(ad_bytes),
+ &dst.val, dst.bytes, ctxt)) != 0 )
+ goto done;
+ break;
+
+ case 0xa2 ... 0xa3: /* mov {%al,%ax,%eax,%rax},mem.offs */
+ /* Destination EA is not encoded via ModRM. */
+ dst.type = OP_MEM;
+ dst.mem.seg = ea.mem.seg;
+ dst.mem.off = insn_fetch_bytes(ad_bytes);
+ dst.bytes = (d & ByteOp) ? 1 : op_bytes;
+ dst.val = (unsigned long)_regs.eax;
+ break;
+
+ case 0xa4 ... 0xa5: /* movs */
+ handle_rep_prefix();
+ dst.type = OP_MEM;
+ dst.bytes = (d & ByteOp) ? 1 : op_bytes;
+ dst.mem.seg = x86_seg_es;
+ dst.mem.off = truncate_ea(_regs.edi);
+ if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi),
+ &dst.val, dst.bytes, ctxt)) != 0 )
+ goto done;
+ register_address_increment(
+ _regs.esi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
+ register_address_increment(
+ _regs.edi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
+ break;
+
+ case 0xaa ... 0xab: /* stos */
+ handle_rep_prefix();
+ dst.type = OP_MEM;
+ dst.bytes = (d & ByteOp) ? 1 : op_bytes;
+ dst.mem.seg = x86_seg_es;
+ dst.mem.off = truncate_ea(_regs.edi);
+ dst.val = _regs.eax;
+ register_address_increment(
+ _regs.edi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
+ break;
+
+ case 0xac ... 0xad: /* lods */
+ handle_rep_prefix();
+ dst.type = OP_REG;
+ dst.bytes = (d & ByteOp) ? 1 : op_bytes;
+ dst.reg = (unsigned long *)&_regs.eax;
+ if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi),
+ &dst.val, dst.bytes, ctxt)) != 0 )
+ goto done;
+ register_address_increment(
+ _regs.esi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
+ break;
+
+ case 0xc2: /* ret imm16 (near) */
+ case 0xc3: /* ret (near) */ {
+ int offset = (b == 0xc2) ? insn_fetch_type(uint16_t) : 0;
+ op_bytes = mode_64bit() ? 8 : op_bytes;
+ if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes + offset),
+ &dst.val, op_bytes, ctxt)) != 0 )
+ goto done;
+ _regs.eip = dst.val;
+ break;
+ }
+
+ case 0xd4: /* aam */ {
+ unsigned int base = insn_fetch_type(uint8_t);
+ uint8_t al = _regs.eax;
+ generate_exception_if(mode_64bit(), EXC_UD);
+ generate_exception_if(base == 0, EXC_DE);
+ *(uint16_t *)&_regs.eax = ((al / base) << 8) | (al % base);
+ _regs.eflags &= ~(EFLG_SF|EFLG_ZF|EFLG_PF);
+ _regs.eflags |= ((uint8_t)_regs.eax == 0) ?
EFLG_ZF : 0; + _regs.eflags |= (( int8_t)_regs.eax < 0) ? EFLG_SF : 0; + _regs.eflags |= even_parity(_regs.eax) ? EFLG_PF : 0; + break; + } + + case 0xd5: /* aad */ { + unsigned int base = insn_fetch_type(uint8_t); + uint16_t ax = _regs.eax; + generate_exception_if(mode_64bit(), EXC_UD); + *(uint16_t *)&_regs.eax = (uint8_t)(ax + ((ax >> 8) * base)); + _regs.eflags &= ~(EFLG_SF|EFLG_ZF|EFLG_PF); + _regs.eflags |= ((uint8_t)_regs.eax == 0) ? EFLG_ZF : 0; + _regs.eflags |= (( int8_t)_regs.eax < 0) ? EFLG_SF : 0; + _regs.eflags |= even_parity(_regs.eax) ? EFLG_PF : 0; + break; + } + + case 0xd6: /* salc */ + generate_exception_if(mode_64bit(), EXC_UD); + *(uint8_t *)&_regs.eax = (_regs.eflags & EFLG_CF) ? 0xff : 0x00; + break; + + case 0xd7: /* xlat */ { + unsigned long al = (uint8_t)_regs.eax; + if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.ebx + al), + &al, 1, ctxt)) != 0 ) + goto done; + *(uint8_t *)&_regs.eax = al; + break; + } + + case 0xe0 ... 0xe2: /* loop{,z,nz} */ { + int rel = insn_fetch_type(int8_t); + int do_jmp = !(_regs.eflags & EFLG_ZF); /* loopnz */ + if ( b == 0xe1 ) + do_jmp = !do_jmp; /* loopz */ + else if ( b == 0xe2 ) + do_jmp = 1; /* loop */ + switch ( ad_bytes ) + { + case 2: + do_jmp &= --(*(uint16_t *)&_regs.ecx) != 0; + break; + case 4: + do_jmp &= --(*(uint32_t *)&_regs.ecx) != 0; + _regs.ecx = (uint32_t)_regs.ecx; /* zero extend in x86/64 mode */ + break; + default: /* case 8: */ + do_jmp &= --_regs.ecx != 0; + break; + } + if ( do_jmp ) + jmp_rel(rel); + break; + } + + case 0xe3: /* jcxz/jecxz (short) */ { + int rel = insn_fetch_type(int8_t); + if ( (ad_bytes == 2) ? !(uint16_t)_regs.ecx : + (ad_bytes == 4) ? !(uint32_t)_regs.ecx : !_regs.ecx ) + jmp_rel(rel); + break; + } + + case 0xe4: /* in imm8,%al */ + case 0xe5: /* in imm8,%eax */ + case 0xe6: /* out %al,imm8 */ + case 0xe7: /* out %eax,imm8 */ + case 0xec: /* in %dx,%al */ + case 0xed: /* in %dx,%eax */ + case 0xee: /* out %al,%dx */ + case 0xef: /* out %eax,%dx */ { + unsigned int port = ((b < 0xe8) + ? insn_fetch_type(uint8_t) + : (uint16_t)_regs.edx); + generate_exception_if(!mode_iopl(), EXC_GP); + op_bytes = !(b & 1) ? 1 : (op_bytes == 8) ? 4 : op_bytes; + if ( b & 2 ) + { + /* out */ + fail_if(ops->write_io == NULL); + rc = ops->write_io(port, op_bytes, _regs.eax, ctxt); + + } + else + { + /* in */ + dst.type = OP_REG; + dst.bytes = op_bytes; + dst.reg = (unsigned long *)&_regs.eax; + fail_if(ops->read_io == NULL); + rc = ops->read_io(port, dst.bytes, &dst.val, ctxt); + } + if ( rc != 0 ) + goto done; + break; + } + + case 0xe8: /* call (near) */ { + int rel = (((op_bytes == 2) && !mode_64bit()) + ? (int32_t)insn_fetch_type(int16_t) + : insn_fetch_type(int32_t)); + op_bytes = mode_64bit() ? 8 : op_bytes; + src.val = _regs.eip; + jmp_rel(rel); + goto push; + } + + case 0xe9: /* jmp (near) */ { + int rel = (((op_bytes == 2) && !mode_64bit()) + ? 
(int32_t)insn_fetch_type(int16_t) + : insn_fetch_type(int32_t)); + jmp_rel(rel); + break; + } + + case 0xeb: /* jmp (short) */ + jmp_rel(insn_fetch_type(int8_t)); + break; + + case 0xf5: /* cmc */ + _regs.eflags ^= EFLG_CF; + break; + + case 0xf8: /* clc */ + _regs.eflags &= ~EFLG_CF; + break; + + case 0xf9: /* stc */ + _regs.eflags |= EFLG_CF; + break; + + case 0xfa: /* cli */ + generate_exception_if(!mode_iopl(), EXC_GP); + fail_if(ops->write_rflags == NULL); + if ( (rc = ops->write_rflags(_regs.eflags & ~EFLG_IF, ctxt)) != 0 ) + goto done; + break; + + case 0xfb: /* sti */ + generate_exception_if(!mode_iopl(), EXC_GP); + fail_if(ops->write_rflags == NULL); + if ( (rc = ops->write_rflags(_regs.eflags | EFLG_IF, ctxt)) != 0 ) + goto done; + break; + + case 0xfc: /* cld */ + _regs.eflags &= ~EFLG_DF; + break; + + case 0xfd: /* std */ + _regs.eflags |= EFLG_DF; + break; + } + goto writeback; + + twobyte_insn: + switch ( b ) + { + case 0x40 ... 0x4f: /* cmovcc */ + dst.val = src.val; + if ( !test_cc(b, _regs.eflags) ) + dst.type = OP_NONE; + break; + + case 0x90 ... 0x9f: /* setcc */ + dst.val = test_cc(b, _regs.eflags); + break; + + case 0xb0 ... 0xb1: /* cmpxchg */ + /* Save real source value, then compare EAX against destination. */ + src.orig_val = src.val; + src.val = _regs.eax; + emulate_2op_SrcV("cmp", src, dst, _regs.eflags); + /* Always write back. The question is: where to? */ + d |= Mov; + if ( _regs.eflags & EFLG_ZF ) + { + /* Success: write back to memory. */ + dst.val = src.orig_val; + } + else + { + /* Failure: write the value we saw to EAX. */ + dst.type = OP_REG; + dst.reg = (unsigned long *)&_regs.eax; + } + break; + + case 0xa3: bt: /* bt */ + emulate_2op_SrcV_nobyte("bt", src, dst, _regs.eflags); + break; + + case 0xb3: btr: /* btr */ + emulate_2op_SrcV_nobyte("btr", src, dst, _regs.eflags); + break; + + case 0xab: bts: /* bts */ + emulate_2op_SrcV_nobyte("bts", src, dst, _regs.eflags); + break; + + case 0xaf: /* imul */ + _regs.eflags &= ~(EFLG_OF|EFLG_CF); + switch ( dst.bytes ) + { + case 2: + dst.val = ((uint32_t)(int16_t)src.val * + (uint32_t)(int16_t)dst.val); + if ( (int16_t)dst.val != (uint32_t)dst.val ) + _regs.eflags |= EFLG_OF|EFLG_CF; + break; +#ifdef __x86_64__ + case 4: + dst.val = ((uint64_t)(int32_t)src.val * + (uint64_t)(int32_t)dst.val); + if ( (int32_t)dst.val != dst.val ) + _regs.eflags |= EFLG_OF|EFLG_CF; + break; +#endif + default: { + unsigned long m[2] = { src.val, dst.val }; + if ( imul_dbl(m) ) + _regs.eflags |= EFLG_OF|EFLG_CF; + dst.val = m[0]; + break; + } + } + break; + + case 0xb6: /* movzx rm8,r{16,32,64} */ + /* Recompute DstReg as we may have decoded AH/BH/CH/DH. */ + dst.reg = decode_register(modrm_reg, &_regs, 0); + dst.bytes = op_bytes; + dst.val = (uint8_t)src.val; + break; + + case 0xbc: /* bsf */ { + int zf; + asm ( "bsf %2,%0; setz %b1" + : "=r" (dst.val), "=q" (zf) + : "r" (src.val), "1" (0) ); + _regs.eflags &= ~EFLG_ZF; + _regs.eflags |= zf ? EFLG_ZF : 0; + break; + } + + case 0xbd: /* bsr */ { + int zf; + asm ( "bsr %2,%0; setz %b1" + : "=r" (dst.val), "=q" (zf) + : "r" (src.val), "1" (0) ); + _regs.eflags &= ~EFLG_ZF; + _regs.eflags |= zf ? 
EFLG_ZF : 0;
+ break;
+ }
+
+ case 0xb7: /* movzx rm16,r{16,32,64} */
+ dst.val = (uint16_t)src.val;
+ break;
+
+ case 0xbb: btc: /* btc */
+ emulate_2op_SrcV_nobyte("btc", src, dst, _regs.eflags);
+ break;
+
+ case 0xba: /* Grp8 */
+ switch ( modrm_reg & 7 )
+ {
+ case 4: goto bt;
+ case 5: goto bts;
+ case 6: goto btr;
+ case 7: goto btc;
+ default: generate_exception_if(1, EXC_UD);
+ }
+ break;
+
+ case 0xbe: /* movsx rm8,r{16,32,64} */
+ /* Recompute DstReg as we may have decoded AH/BH/CH/DH. */
+ dst.reg = decode_register(modrm_reg, &_regs, 0);
+ dst.bytes = op_bytes;
+ dst.val = (int8_t)src.val;
+ break;
+
+ case 0xbf: /* movsx rm16,r{16,32,64} */
+ dst.val = (int16_t)src.val;
+ break;
+
+ case 0xc0 ... 0xc1: /* xadd */
+ /* Write back the register source. */
+ switch ( dst.bytes )
+ {
+ case 1: *(uint8_t *)src.reg = (uint8_t)dst.val; break;
+ case 2: *(uint16_t *)src.reg = (uint16_t)dst.val; break;
+ case 4: *src.reg = (uint32_t)dst.val; break; /* 64b reg: zero-extend */
+ case 8: *src.reg = dst.val; break;
+ }
+ goto add;
+ }
+ goto writeback;
+
+ twobyte_special_insn:
+ switch ( b )
+ {
+ case 0x06: /* clts */
+ generate_exception_if(!mode_ring0(), EXC_GP);
+ fail_if((ops->read_cr == NULL) || (ops->write_cr == NULL));
+ if ( (rc = ops->read_cr(0, &dst.val, ctxt)) ||
+ (rc = ops->write_cr(0, dst.val&~8, ctxt)) )
+ goto done;
+ break;
+
+ case 0x08: /* invd */
+ case 0x09: /* wbinvd */
+ generate_exception_if(!mode_ring0(), EXC_GP);
+ fail_if(ops->wbinvd == NULL);
+ if ( (rc = ops->wbinvd(ctxt)) != 0 )
+ goto done;
+ break;
+
+ case 0x0d: /* GrpP (prefetch) */
+ case 0x18: /* Grp16 (prefetch/nop) */
+ case 0x19 ... 0x1f: /* nop (amd-defined) */
+ break;
+
+ case 0x20: /* mov cr,reg */
+ case 0x21: /* mov dr,reg */
+ case 0x22: /* mov reg,cr */
+ case 0x23: /* mov reg,dr */
+ generate_exception_if(!mode_ring0(), EXC_GP);
+ modrm_rm |= (rex_prefix & 1) << 3;
+ modrm_reg |= lock_prefix << 3;
+ if ( b & 2 )
+ {
+ /* Write to CR/DR. */
+ src.val = *(unsigned long *)decode_register(modrm_rm, &_regs, 0);
+ if ( !mode_64bit() )
+ src.val = (uint32_t)src.val;
+ rc = ((b & 1)
+ ? (ops->write_dr
+ ? ops->write_dr(modrm_reg, src.val, ctxt)
+ : X86EMUL_UNHANDLEABLE)
+ : (ops->write_cr
+ ? ops->write_cr(modrm_reg, src.val, ctxt)
+ : X86EMUL_UNHANDLEABLE));
+ }
+ else
+ {
+ /* Read from CR/DR. */
+ dst.type = OP_REG;
+ dst.bytes = mode_64bit() ? 8 : 4;
+ dst.reg = decode_register(modrm_rm, &_regs, 0);
+ rc = ((b & 1)
+ ? (ops->read_dr
+ ? ops->read_dr(modrm_reg, &dst.val, ctxt)
+ : X86EMUL_UNHANDLEABLE)
+ : (ops->read_cr
+ ? ops->read_cr(modrm_reg, &dst.val, ctxt)
+ : X86EMUL_UNHANDLEABLE));
+ }
+ if ( rc != 0 )
+ goto done;
+ break;
+
+ case 0x30: /* wrmsr */ {
+ uint64_t val = ((uint64_t)_regs.edx << 32) | (uint32_t)_regs.eax;
+ generate_exception_if(!mode_ring0(), EXC_GP);
+ fail_if(ops->write_msr == NULL);
+ if ( (rc = ops->write_msr((uint32_t)_regs.ecx, val, ctxt)) != 0 )
+ goto done;
+ break;
+ }
+
+ case 0x32: /* rdmsr */ {
+ uint64_t val;
+ generate_exception_if(!mode_ring0(), EXC_GP);
+ fail_if(ops->read_msr == NULL);
+ if ( (rc = ops->read_msr((uint32_t)_regs.ecx, &val, ctxt)) != 0 )
+ goto done;
+ _regs.edx = (uint32_t)(val >> 32);
+ _regs.eax = (uint32_t)(val >> 0);
+ break;
+ }
+
+ case 0x80 ... 0x8f: /* jcc (near) */ {
+ int rel = (((op_bytes == 2) && !mode_64bit())
+ ?
(int32_t)insn_fetch_type(int16_t) + : insn_fetch_type(int32_t)); + if ( test_cc(b, _regs.eflags) ) + jmp_rel(rel); + break; + } + + case 0xc7: /* Grp9 (cmpxchg8b) */ +#if defined(__i386__) + { + unsigned long old_lo, old_hi; + generate_exception_if((modrm_reg & 7) != 1, EXC_UD); + if ( (rc = ops->read(ea.mem.seg, ea.mem.off+0, &old_lo, 4, ctxt)) || + (rc = ops->read(ea.mem.seg, ea.mem.off+4, &old_hi, 4, ctxt)) ) + goto done; + if ( (old_lo != _regs.eax) || (old_hi != _regs.edx) ) + { + _regs.eax = old_lo; + _regs.edx = old_hi; + _regs.eflags &= ~EFLG_ZF; + } + else if ( ops->cmpxchg8b == NULL ) + { + rc = X86EMUL_UNHANDLEABLE; + goto done; + } + else + { + if ( (rc = ops->cmpxchg8b(ea.mem.seg, ea.mem.off, old_lo, old_hi, + _regs.ebx, _regs.ecx, ctxt)) != 0 ) + goto done; + _regs.eflags |= EFLG_ZF; + } + break; + } +#elif defined(__x86_64__) + { + unsigned long old, new; + generate_exception_if((modrm_reg & 7) != 1, EXC_UD); + if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &old, 8, ctxt)) != 0 ) + goto done; + if ( ((uint32_t)(old>>0) != (uint32_t)_regs.eax) || + ((uint32_t)(old>>32) != (uint32_t)_regs.edx) ) + { + _regs.eax = (uint32_t)(old>>0); + _regs.edx = (uint32_t)(old>>32); + _regs.eflags &= ~EFLG_ZF; + } + else + { + new = (_regs.ecx<<32)|(uint32_t)_regs.ebx; + if ( (rc = ops->cmpxchg(ea.mem.seg, ea.mem.off, old, + new, 8, ctxt)) != 0 ) + goto done; + _regs.eflags |= EFLG_ZF; + } + break; + } +#endif + + case 0xc8 ... 0xcf: /* bswap */ + dst.type = OP_REG; + dst.reg = decode_register( + (b & 7) | ((rex_prefix & 1) << 3), &_regs, 0); + switch ( dst.bytes = op_bytes ) + { + default: /* case 2: */ + /* Undefined behaviour. Writes zero on all tested CPUs. */ + dst.val = 0; + break; + case 4: +#ifdef __x86_64__ + __asm__ ( "bswap %k0" : "=r" (dst.val) : "0" (*dst.reg) ); + break; + case 8: +#endif + __asm__ ( "bswap %0" : "=r" (dst.val) : "0" (*dst.reg) ); + break; + } + break; + } + goto writeback; + + cannot_emulate: +#if 0 + gdprintk(XENLOG_DEBUG, "Instr:"); + for ( ea.mem.off = ctxt->regs->eip; ea.mem.off < _regs.eip; ea.mem.off++ ) + { + unsigned long x; + ops->insn_fetch(x86_seg_cs, ea.mem.off, &x, 1, ctxt); + printk(" %02x", (uint8_t)x); + } + printk("\n"); +#endif + return X86EMUL_UNHANDLEABLE; +} diff -Naurp xen/common/domain.c xen-redhat/common/domain.c --- xen/common/domain.c +++ xen-redhat/common/domain.c @@ -30,6 +30,24 @@ #include <public/vcpu.h> #include <acm/acm_hooks.h> +/* opt_dom0_vcpus_pin: If true, dom0 VCPUs are pinned. */ +unsigned int opt_dom0_vcpus_pin = 1; +boolean_param("dom0_vcpus_pin", opt_dom0_vcpus_pin); + +enum cpufreq_controller cpufreq_controller = FREQCTL_dom0_kernel; +static void __init setup_cpufreq_option(char *str) +{ + if ( !strcmp(str, "dom0-kernel") ) + { + cpufreq_controller = FREQCTL_dom0_kernel; + opt_dom0_vcpus_pin = 1; + } else if ( !strcmp(str, "off") || !strcmp(str, "none") ) { + cpufreq_controller = FREQCTL_none; + opt_dom0_vcpus_pin = 0; + } +} +custom_param("cpufreq", setup_cpufreq_option); + /* Protect updates/reads (resp.) of domain_list and domain_hash. 
*/ DEFINE_SPINLOCK(domlist_update_lock); DEFINE_RCU_READ_LOCK(domlist_read_lock); @@ -63,6 +81,8 @@ struct domain *alloc_domain(domid_t domi spin_lock_init(&d->shutdown_lock); INIT_LIST_HEAD(&d->page_list); INIT_LIST_HEAD(&d->xenpage_list); + /* HV */ + atomic_set(&d->hard_virt, 0); return d; } @@ -189,6 +209,9 @@ struct domain *domain_create( if ( domcr_flags & DOMCRF_hvm ) d->is_hvm = 1; + if ( (domid == 0) && opt_dom0_vcpus_pin ) + d->is_pinned = 1; + rangeset_domain_initialise(d); if ( !is_idle_domain(d) ) @@ -238,7 +261,7 @@ struct domain *domain_create( return d; fail: - d->is_dying = 1; + d->is_dying = DOMDYING_dead; atomic_set(&d->refcnt, DOMAIN_DESTROYED); if ( init_status & INIT_arch ) arch_domain_destroy(d); @@ -298,26 +321,38 @@ struct domain *rcu_lock_domain_by_id(dom } -void domain_kill(struct domain *d) +int domain_kill(struct domain *d) { - domain_pause(d); + int rc = 0; - /* Already dying? Then bail. */ - if ( test_and_set_bool(d->is_dying) ) + if ( d == current->domain ) + return -EINVAL; + + /* Protected by domctl_lock. */ + switch ( d->is_dying ) { - domain_unpause(d); - return; + case DOMDYING_alive: + domain_pause(d); + d->is_dying = DOMDYING_dying; + evtchn_destroy(d); + gnttab_release_mappings(d); + /* fallthrough */ + case DOMDYING_dying: + rc = domain_relinquish_resources(d); + if ( rc != 0 ) + { + BUG_ON(rc != -EAGAIN); + break; + } + d->is_dying = DOMDYING_dead; + put_domain(d); + send_guest_global_virq(dom0, VIRQ_DOM_EXC); + /* fallthrough */ + case DOMDYING_dead: + break; } - evtchn_destroy(d); - gnttab_release_mappings(d); - domain_relinquish_resources(d); - put_domain(d); - - /* Kick page scrubbing after domain_relinquish_resources(). */ - page_scrub_kick(); - - send_guest_global_virq(dom0, VIRQ_DOM_EXC); + return rc; } diff -Naurp xen/common/domctl.c xen-redhat/common/domctl.c --- xen/common/domctl.c +++ xen-redhat/common/domctl.c @@ -43,7 +43,8 @@ void cpumask_to_xenctl_cpumap( bitmap_long_to_byte(bytemap, cpus_addr(*cpumask), NR_CPUS); - copy_to_guest(xenctl_cpumap->bitmap, bytemap, copy_bytes); + if ( copy_bytes != 0 ) + copy_to_guest(xenctl_cpumap->bitmap, bytemap, copy_bytes); for ( i = copy_bytes; i < guest_bytes; i++ ) copy_to_guest_offset(xenctl_cpumap->bitmap, i, &zero, 1); @@ -55,15 +56,20 @@ void xenctl_cpumap_to_cpumask( unsigned int guest_bytes, copy_bytes; uint8_t bytemap[(NR_CPUS + 7) / 8]; + if ( guest_handle_is_null(xenctl_cpumap->bitmap) ) + return; + guest_bytes = (xenctl_cpumap->nr_cpus + 7) / 8; copy_bytes = min_t(unsigned int, guest_bytes, sizeof(bytemap)); - cpus_clear(*cpumask); - - if ( guest_handle_is_null(xenctl_cpumap->bitmap) ) - return; + memset(bytemap, 0, sizeof(bytemap)); - copy_from_guest(bytemap, xenctl_cpumap->bitmap, copy_bytes); + if ( copy_bytes != 0 ) + { + copy_from_guest(bytemap, xenctl_cpumap->bitmap, copy_bytes); + if ( (xenctl_cpumap->nr_cpus & 7) && (guest_bytes <= sizeof(bytemap)) ) + bytemap[guest_bytes-1] &= ~(0xff << (xenctl_cpumap->nr_cpus & 7)); + } bitmap_byte_to_long(cpus_addr(*cpumask), bytemap, NR_CPUS); } @@ -114,10 +120,10 @@ void getdomaininfo(struct domain *d, str info->cpu_time = cpu_time; info->flags = flags | - (d->is_dying ? XEN_DOMINF_dying : 0) | - (d->is_shut_down ? XEN_DOMINF_shutdown : 0) | - (d->is_paused_by_controller ? XEN_DOMINF_paused : 0) | - (d->debugger_attached ? XEN_DOMINF_debugged : 0) | + ((d->is_dying == DOMDYING_dead) ? XEN_DOMINF_dying : 0) | + (d->is_shut_down ? XEN_DOMINF_shutdown : 0) | + (d->is_paused_by_controller ? XEN_DOMINF_paused : 0) | + (d->debugger_attached ? 
XEN_DOMINF_debugged : 0) | d->shutdown_code << XEN_DOMINF_shutdownshift; if ( is_hvm_domain(d) ) @@ -188,7 +194,8 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc if ( op->interface_version != XEN_DOMCTL_INTERFACE_VERSION ) return -EACCES; - spin_lock(&domctl_lock); + if ( !spin_trylock(&domctl_lock) ) + return hypercall_create_continuation(__HYPERVISOR_domctl, "h", u_domctl); switch ( op->cmd ) { @@ -222,13 +229,15 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc if ( (c.nat = xmalloc(struct vcpu_guest_context)) == NULL ) goto svc_out; - if ( !IS_COMPAT(v->domain) ) - ret = copy_from_guest(c.nat, op->u.vcpucontext.ctxt, 1); #ifdef CONFIG_COMPAT + if ( !is_pv_32on64_vcpu(v) ) + ret = copy_from_guest(c.nat, op->u.vcpucontext.ctxt, 1); else ret = copy_from_guest(c.cmp, guest_handle_cast(op->u.vcpucontext.ctxt, void), 1); +#else + ret = copy_from_guest(c.nat, op->u.vcpucontext.ctxt, 1); #endif ret = ret ? -EFAULT : 0; @@ -397,10 +406,7 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc { ret = -EINVAL; if ( d != current->domain ) - { - domain_kill(d); - ret = 0; - } + ret = domain_kill(d); rcu_unlock_domain(d); } } @@ -527,12 +533,14 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc if ( v != current ) vcpu_unpause(v); - if ( !IS_COMPAT(v->domain) ) - ret = copy_to_guest(op->u.vcpucontext.ctxt, c.nat, 1); #ifdef CONFIG_COMPAT + if ( !is_pv_32on64_vcpu(v) ) + ret = copy_to_guest(op->u.vcpucontext.ctxt, c.nat, 1); else ret = copy_to_guest(guest_handle_cast(op->u.vcpucontext.ctxt, void), c.cmp, 1); +#else + ret = copy_to_guest(op->u.vcpucontext.ctxt, c.nat, 1); #endif if ( copy_to_guest(u_domctl, op, 1) || ret ) diff -Naurp xen/common/event_channel.c xen-redhat/common/event_channel.c --- xen/common/event_channel.c +++ xen-redhat/common/event_channel.c @@ -118,7 +118,7 @@ static long evtchn_alloc_unbound(evtchn_ if ( (d = rcu_lock_domain_by_id(dom)) == NULL ) return -ESRCH; - spin_lock(&d->evtchn_lock); + spin_lock(&d->event_lock); if ( (port = get_free_port(d)) < 0 ) ERROR_EXIT(port); @@ -131,7 +131,7 @@ static long evtchn_alloc_unbound(evtchn_ alloc->port = port; out: - spin_unlock(&d->evtchn_lock); + spin_unlock(&d->event_lock); rcu_unlock_domain(d); @@ -159,14 +159,14 @@ static long evtchn_bind_interdomain(evtc /* Avoid deadlock by first acquiring lock of domain with smaller id. 
*/ if ( ld < rd ) { - spin_lock(&ld->evtchn_lock); - spin_lock(&rd->evtchn_lock); + spin_lock(&ld->event_lock); + spin_lock(&rd->event_lock); } else { if ( ld != rd ) - spin_lock(&rd->evtchn_lock); - spin_lock(&ld->evtchn_lock); + spin_lock(&rd->event_lock); + spin_lock(&ld->event_lock); } if ( (lport = get_free_port(ld)) < 0 ) @@ -197,9 +197,9 @@ static long evtchn_bind_interdomain(evtc bind->local_port = lport; out: - spin_unlock(&ld->evtchn_lock); + spin_unlock(&ld->event_lock); if ( ld != rd ) - spin_unlock(&rd->evtchn_lock); + spin_unlock(&rd->event_lock); rcu_unlock_domain(rd); @@ -225,7 +225,7 @@ static long evtchn_bind_virq(evtchn_bind ((v = d->vcpu[vcpu]) == NULL) ) return -ENOENT; - spin_lock(&d->evtchn_lock); + spin_lock(&d->event_lock); if ( v->virq_to_evtchn[virq] != 0 ) ERROR_EXIT(-EEXIST); @@ -241,7 +241,7 @@ static long evtchn_bind_virq(evtchn_bind v->virq_to_evtchn[virq] = bind->port = port; out: - spin_unlock(&d->evtchn_lock); + spin_unlock(&d->event_lock); return rc; } @@ -258,7 +258,7 @@ static long evtchn_bind_ipi(evtchn_bind_ (d->vcpu[vcpu] == NULL) ) return -ENOENT; - spin_lock(&d->evtchn_lock); + spin_lock(&d->event_lock); if ( (port = get_free_port(d)) < 0 ) ERROR_EXIT(port); @@ -270,7 +270,7 @@ static long evtchn_bind_ipi(evtchn_bind_ bind->port = port; out: - spin_unlock(&d->evtchn_lock); + spin_unlock(&d->event_lock); return rc; } @@ -289,7 +289,7 @@ static long evtchn_bind_pirq(evtchn_bind if ( !irq_access_permitted(d, pirq) ) return -EPERM; - spin_lock(&d->evtchn_lock); + spin_lock(&d->event_lock); if ( d->pirq_to_evtchn[pirq] != 0 ) ERROR_EXIT(-EEXIST); @@ -314,7 +314,7 @@ static long evtchn_bind_pirq(evtchn_bind bind->port = port; out: - spin_unlock(&d->evtchn_lock); + spin_unlock(&d->event_lock); return rc; } @@ -329,7 +329,7 @@ static long __evtchn_close(struct domain long rc = 0; again: - spin_lock(&d1->evtchn_lock); + spin_lock(&d1->event_lock); if ( !port_is_valid(d1, port1) ) { @@ -357,8 +357,8 @@ static long __evtchn_close(struct domain break; case ECS_PIRQ: - if ( (rc = pirq_guest_unbind(d1, chn1->u.pirq)) == 0 ) - d1->pirq_to_evtchn[chn1->u.pirq] = 0; + pirq_guest_unbind(d1, chn1->u.pirq); + d1->pirq_to_evtchn[chn1->u.pirq] = 0; break; case ECS_VIRQ: @@ -381,12 +381,12 @@ static long __evtchn_close(struct domain if ( d1 < d2 ) { - spin_lock(&d2->evtchn_lock); + spin_lock(&d2->event_lock); } else if ( d1 != d2 ) { - spin_unlock(&d1->evtchn_lock); - spin_lock(&d2->evtchn_lock); + spin_unlock(&d1->event_lock); + spin_lock(&d2->event_lock); goto again; } } @@ -426,11 +426,11 @@ static long __evtchn_close(struct domain if ( d2 != NULL ) { if ( d1 != d2 ) - spin_unlock(&d2->evtchn_lock); + spin_unlock(&d2->event_lock); put_domain(d2); } - spin_unlock(&d1->evtchn_lock); + spin_unlock(&d1->event_lock); return rc; } @@ -449,11 +449,11 @@ long evtchn_send(unsigned int lport) struct vcpu *rvcpu; int rport, ret = 0; - spin_lock(&ld->evtchn_lock); + spin_lock(&ld->event_lock); if ( unlikely(!port_is_valid(ld, lport)) ) { - spin_unlock(&ld->evtchn_lock); + spin_unlock(&ld->event_lock); return -EINVAL; } @@ -462,7 +462,7 @@ long evtchn_send(unsigned int lport) /* Guest cannot send via a Xen-attached event channel. 
*/ if ( unlikely(lchn->consumer_is_xen) ) { - spin_unlock(&ld->evtchn_lock); + spin_unlock(&ld->event_lock); return -EINVAL; } @@ -495,7 +495,7 @@ long evtchn_send(unsigned int lport) ret = -EINVAL; } - spin_unlock(&ld->evtchn_lock); + spin_unlock(&ld->event_lock); return ret; } @@ -517,7 +517,7 @@ void evtchn_set_pending(struct vcpu *v, return; if ( !test_bit (port, __shared_info_addr(d, s, evtchn_mask)) && - !test_and_set_bit(port / BITS_PER_GUEST_LONG(d), + !test_and_set_bit(port / BITS_PER_EVTCHN_WORD(d), vcpu_info_addr(v, evtchn_pending_sel)) ) { vcpu_mark_events_pending(v); @@ -604,7 +604,7 @@ static long evtchn_status(evtchn_status_ if ( (d = rcu_lock_domain_by_id(dom)) == NULL ) return -ESRCH; - spin_lock(&d->evtchn_lock); + spin_lock(&d->event_lock); if ( !port_is_valid(d, port) ) { @@ -647,7 +647,7 @@ static long evtchn_status(evtchn_status_ status->vcpu = chn->notify_vcpu_id; out: - spin_unlock(&d->evtchn_lock); + spin_unlock(&d->event_lock); rcu_unlock_domain(d); return rc; } @@ -662,7 +662,7 @@ long evtchn_bind_vcpu(unsigned int port, if ( (vcpu_id >= ARRAY_SIZE(d->vcpu)) || (d->vcpu[vcpu_id] == NULL) ) return -ENOENT; - spin_lock(&d->evtchn_lock); + spin_lock(&d->event_lock); if ( !port_is_valid(d, port) ) { @@ -698,7 +698,7 @@ long evtchn_bind_vcpu(unsigned int port, } out: - spin_unlock(&d->evtchn_lock); + spin_unlock(&d->event_lock); return rc; } @@ -710,11 +710,11 @@ static long evtchn_unmask(evtchn_unmask_ int port = unmask->port; struct vcpu *v; - spin_lock(&d->evtchn_lock); + spin_lock(&d->event_lock); if ( unlikely(!port_is_valid(d, port)) ) { - spin_unlock(&d->evtchn_lock); + spin_unlock(&d->event_lock); return -EINVAL; } @@ -726,13 +726,13 @@ static long evtchn_unmask(evtchn_unmask_ */ if ( test_and_clear_bit(port, __shared_info_addr(d, s, evtchn_mask)) && test_bit (port, __shared_info_addr(d, s, evtchn_pending)) && - !test_and_set_bit (port / BITS_PER_GUEST_LONG(d), + !test_and_set_bit (port / BITS_PER_EVTCHN_WORD(d), vcpu_info_addr(v, evtchn_pending_sel)) ) { vcpu_mark_events_pending(v); } - spin_unlock(&d->evtchn_lock); + spin_unlock(&d->event_lock); return 0; } @@ -883,7 +883,7 @@ int alloc_unbound_xen_event_channel( struct domain *d = local_vcpu->domain; int port; - spin_lock(&d->evtchn_lock); + spin_lock(&d->event_lock); if ( (port = get_free_port(d)) < 0 ) goto out; @@ -895,7 +895,7 @@ int alloc_unbound_xen_event_channel( chn->u.unbound.remote_domid = remote_domid; out: - spin_unlock(&d->evtchn_lock); + spin_unlock(&d->event_lock); return port; } @@ -907,11 +907,11 @@ void free_xen_event_channel( struct evtchn *chn; struct domain *d = local_vcpu->domain; - spin_lock(&d->evtchn_lock); + spin_lock(&d->event_lock); chn = evtchn_from_port(d, port); BUG_ON(!chn->consumer_is_xen); chn->consumer_is_xen = 0; - spin_unlock(&d->evtchn_lock); + spin_unlock(&d->event_lock); (void)__evtchn_close(d, port); } @@ -923,7 +923,7 @@ void notify_via_xen_event_channel(int lp struct domain *ld = current->domain, *rd; int rport; - spin_lock(&ld->evtchn_lock); + spin_lock(&ld->event_lock); ASSERT(port_is_valid(ld, lport)); lchn = evtchn_from_port(ld, lport); @@ -937,13 +937,13 @@ void notify_via_xen_event_channel(int lp evtchn_set_pending(rd->vcpu[rchn->notify_vcpu_id], rport); } - spin_unlock(&ld->evtchn_lock); + spin_unlock(&ld->event_lock); } int evtchn_init(struct domain *d) { - spin_lock_init(&d->evtchn_lock); + spin_lock_init(&d->event_lock); if ( get_free_port(d) != 0 ) return -EINVAL; evtchn_from_port(d, 0)->state = ECS_RESERVED; @@ -957,7 +957,7 @@ void 
evtchn_destroy(struct domain *d) /* After this barrier no new event-channel allocations can occur. */ BUG_ON(!d->is_dying); - spin_barrier(&d->evtchn_lock); + spin_barrier(&d->event_lock); /* Close all existing event channels. */ for ( i = 0; port_is_valid(d, i); i++ ) @@ -967,10 +967,10 @@ void evtchn_destroy(struct domain *d) } /* Free all event-channel buckets. */ - spin_lock(&d->evtchn_lock); + spin_lock(&d->event_lock); for ( i = 0; i < NR_EVTCHN_BUCKETS; i++ ) xfree(d->evtchn[i]); - spin_unlock(&d->evtchn_lock); + spin_unlock(&d->event_lock); } /* diff -Naurp xen/common/gdbstub.c xen-redhat/common/gdbstub.c --- xen/common/gdbstub.c +++ xen-redhat/common/gdbstub.c @@ -478,13 +478,13 @@ process_command(struct cpu_user_regs *re return resume; } -static struct gdb_context +struct gdb_context __gdb_ctx = { .serhnd = -1, .running = ATOMIC_INIT(1), .signum = 1 }; -static struct gdb_context *gdb_ctx = &__gdb_ctx; +struct gdb_context *gdb_ctx = &__gdb_ctx; static void gdbstub_console_puts(const char *str) diff -Naurp xen/common/grant_table.c xen-redhat/common/grant_table.c --- xen/common/grant_table.c +++ xen-redhat/common/grant_table.c @@ -809,6 +809,7 @@ gnttab_transfer( grant_entry_t *sha; struct gnttab_transfer gop; unsigned long mfn; + unsigned int max_bitsize; for ( i = 0; i < count; i++ ) { @@ -857,6 +858,34 @@ gnttab_transfer( goto copyback; } + max_bitsize = domain_clamp_alloc_bitsize( + e, BITS_PER_LONG+PAGE_SHIFT-1); + if ( (1UL << (max_bitsize - PAGE_SHIFT)) <= mfn ) + { + struct page_info *new_page; + void *sp, *dp; + + new_page = alloc_domheap_pages(NULL, 0, MEMF_bits(max_bitsize)); + if ( new_page == NULL ) + { + rcu_unlock_domain(e); + page->count_info &= ~(PGC_count_mask|PGC_allocated); + free_domheap_page(page); + gop.status = GNTST_address_too_big; + goto copyback; + } + + sp = map_domain_page(mfn); + dp = map_domain_page(page_to_mfn(new_page)); + memcpy(dp, sp, PAGE_SIZE); + unmap_domain_page(dp); + unmap_domain_page(sp); + + page->count_info &= ~(PGC_count_mask|PGC_allocated); + free_domheap_page(page); + page = new_page; + } + spin_lock(&e->page_alloc_lock); /* @@ -896,7 +925,7 @@ gnttab_transfer( spin_lock(&e->grant_table->lock); sha = &shared_entry(e->grant_table, gop.ref); - guest_physmap_add_page(e, sha->frame, mfn); + guest_physmap_add_page(e, sha->frame, mfn, 0); sha->frame = mfn; wmb(); sha->flags |= GTF_transfer_completed; diff -Naurp xen/common/kernel.c xen-redhat/common/kernel.c --- xen/common/kernel.c +++ xen-redhat/common/kernel.c @@ -80,7 +80,10 @@ void cmdline_parse(char *cmdline) break; case OPT_BOOL: case OPT_INVBOOL: - if ( !strcmp("no", optval) || !strcmp("off", optval) ) + if ( !strcmp("no", optval) || + !strcmp("off", optval) || + !strcmp("false", optval) || + !strcmp("0", optval) ) bool_assert = !bool_assert; if ( param->type == OPT_INVBOOL ) bool_assert = !bool_assert; @@ -217,6 +220,10 @@ DO(xen_version)(int cmd, XEN_GUEST_HANDL (1U << XENFEAT_auto_translated_physmap); if ( supervisor_mode_kernel ) fi.submap |= 1U << XENFEAT_supervisor_mode_kernel; +#ifdef CONFIG_X86 + if ( !is_hvm_vcpu(current) ) + fi.submap |= 1U << XENFEAT_mmu_pt_update_preserve_ad; +#endif break; default: return -EINVAL; diff -Naurp xen/common/kexec.c xen-redhat/common/kexec.c --- xen/common/kexec.c +++ xen-redhat/common/kexec.c @@ -42,6 +42,9 @@ static unsigned long kexec_flags = 0; /* static spinlock_t kexec_lock = SPIN_LOCK_UNLOCKED; +static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; +static size_t vmcoreinfo_size = 0; + xen_kexec_reserve_t kexec_crash_area; static 
void __init parse_crashkernel(const char *str) @@ -222,6 +225,13 @@ static int kexec_get(cpu)(xen_kexec_rang return 0; } +static int kexec_get(vmcoreinfo)(xen_kexec_range_t *range) +{ + range->start = __pa((unsigned long)vmcoreinfo_data); + range->size = VMCOREINFO_BYTES; + return 0; +} + static int kexec_get(range)(XEN_GUEST_HANDLE(void) uarg) { xen_kexec_range_t range; @@ -241,6 +251,9 @@ static int kexec_get(range)(XEN_GUEST_HA case KEXEC_RANGE_MA_CPU: ret = kexec_get(cpu)(&range); break; + case KEXEC_RANGE_MA_VMCOREINFO: + ret = kexec_get(vmcoreinfo)(&range); + break; } if ( ret == 0 && unlikely(copy_to_guest(uarg, &range, 1)) ) @@ -269,6 +282,56 @@ static int kexec_load_get_bits(int type, return 0; } +void vmcoreinfo_append_str(const char *fmt, ...) +{ + va_list args; + char buf[0x50]; + int r; + size_t note_size = sizeof(Elf_Note) + ELFNOTE_ALIGN(strlen(VMCOREINFO_NOTE_NAME) + 1); + + if (vmcoreinfo_size + note_size + sizeof(buf) > VMCOREINFO_BYTES) + return; + + va_start(args, fmt); + r = vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + memcpy(&vmcoreinfo_data[note_size + vmcoreinfo_size], buf, r); + + vmcoreinfo_size += r; +} + +static void crash_save_vmcoreinfo(void) +{ + size_t data_size; + + if (vmcoreinfo_size > 0) /* already saved */ + return; + + data_size = VMCOREINFO_BYTES - (sizeof(Elf_Note) + ELFNOTE_ALIGN(strlen(VMCOREINFO_NOTE_NAME) + 1)); + setup_note((Elf_Note *)vmcoreinfo_data, VMCOREINFO_NOTE_NAME, 0, data_size); + + VMCOREINFO_PAGESIZE(PAGE_SIZE); + + VMCOREINFO_SYMBOL(domain_list); + VMCOREINFO_SYMBOL(frame_table); + VMCOREINFO_SYMBOL(alloc_bitmap); + VMCOREINFO_SYMBOL(max_page); + VMCOREINFO_SYMBOL(xenheap_phys_end); + + VMCOREINFO_STRUCT_SIZE(page_info); + VMCOREINFO_STRUCT_SIZE(domain); + + VMCOREINFO_OFFSET(page_info, count_info); + VMCOREINFO_OFFSET_ALIAS(page_info, u, _domain); + VMCOREINFO_OFFSET(domain, domain_id); + VMCOREINFO_OFFSET(domain, next_in_list); + +#ifdef ARCH_CRASH_SAVE_VMCOREINFO + arch_crash_save_vmcoreinfo(); +#endif +} + #endif static int kexec_load_unload(unsigned long op, XEN_GUEST_HANDLE(void) uarg) @@ -307,6 +370,9 @@ static int kexec_load_unload(unsigned lo /* Make new image the active one */ change_bit(bit, &kexec_flags); } +#ifndef COMPAT + crash_save_vmcoreinfo(); +#endif } /* Unload the old image if present and load successful */ diff -Naurp xen/common/keyhandler.c xen-redhat/common/keyhandler.c --- xen/common/keyhandler.c +++ xen-redhat/common/keyhandler.c @@ -36,10 +36,10 @@ static void keypress_softirq(void) { keyhandler_t *h; unsigned char key = keypress_key; - console_start_log_everything(); + console_start_sync(); if ( (h = key_table[key].u.handler) != NULL ) (*h)(key); - console_end_log_everything(); + console_end_sync(); } void handle_keypress(unsigned char key, struct cpu_user_regs *regs) @@ -48,10 +48,10 @@ void handle_keypress(unsigned char key, if ( !in_irq() || (key_table[key].flags & KEYHANDLER_IRQ_CALLBACK) ) { - console_start_log_everything(); + console_start_sync(); if ( (h = key_table[key].u.irq_handler) != NULL ) (*h)(key, regs); - console_end_log_everything(); + console_end_sync(); } else { @@ -205,7 +205,7 @@ static void dump_domains(unsigned char k test_bit(v->virq_to_evtchn[VIRQ_DEBUG], shared_info_addr(d, evtchn_mask)), test_bit(v->virq_to_evtchn[VIRQ_DEBUG] / - BITS_PER_GUEST_LONG(d), + BITS_PER_EVTCHN_WORD(d), vcpu_info_addr(v, evtchn_pending_sel))); send_guest_vcpu_virq(v, VIRQ_DEBUG); } diff -Naurp xen/common/memory.c xen-redhat/common/memory.c --- xen/common/memory.c +++ 
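The VMCOREINFO additions above carve a static buffer out of the hypervisor image, write an ELF note header at its start, and then append printf-formatted "KEY=value" records that crash/kdump tools can read. A minimal stand-alone sketch of that bounded-append pattern follows; the buffer size, header size and record names are illustrative stand-ins, not the actual Xen definitions.

    #include <stdarg.h>
    #include <stdio.h>
    #include <string.h>

    #define INFO_BYTES  4096   /* stand-in for VMCOREINFO_BYTES */
    #define NOTE_HEADER 64     /* stand-in for the Elf_Note + name area */

    static char   info_data[INFO_BYTES];
    static size_t info_size;   /* payload bytes appended so far */

    /* Append one formatted record after the note header; drop the record
     * (rather than truncate) if a worst-case entry would no longer fit. */
    static void info_append(const char *fmt, ...)
    {
        char buf[0x50];
        va_list args;
        int r;

        if (info_size + NOTE_HEADER + sizeof(buf) > INFO_BYTES)
            return;

        va_start(args, fmt);
        r = vsnprintf(buf, sizeof(buf), fmt, args);
        va_end(args);
        if (r < 0)
            return;
        if ((size_t)r >= sizeof(buf))
            r = sizeof(buf) - 1;          /* keep the copy within buf */

        memcpy(&info_data[NOTE_HEADER + info_size], buf, r);
        info_size += r;
    }

    int main(void)
    {
        /* The real VMCOREINFO_* macros emit records of this shape. */
        info_append("PAGESIZE=%ld\n", 4096L);
        info_append("SYMBOL(%s)=%lx\n", "domain_list", 0xdeadbeefUL);
        fwrite(info_data + NOTE_HEADER, 1, info_size, stdout);
        return 0;
    }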
xen-redhat/common/memory.c @@ -129,8 +129,8 @@ static void populate_physmap(struct memo if ( unlikely(paging_mode_translate(d)) ) { - for ( j = 0; j < (1 << a->extent_order); j++ ) - guest_physmap_add_page(d, gpfn + j, mfn + j); + if ( guest_physmap_add_page(d, gpfn, mfn, a->extent_order) ) + goto out; } else { @@ -173,7 +173,7 @@ int guest_remove_page(struct domain *d, if ( test_and_clear_bit(_PGC_allocated, &page->count_info) ) put_page(page); - guest_physmap_remove_page(d, gmfn, mfn); + guest_physmap_remove_page(d, gmfn, mfn, 0); put_page(page); @@ -309,18 +309,6 @@ static long memory_exchange(XEN_GUEST_HA goto fail_early; } - if ( (exch.out.address_bits != 0) && - (exch.out.address_bits < - (get_order_from_pages(max_page) + PAGE_SHIFT)) ) - { - if ( exch.out.address_bits <= PAGE_SHIFT ) - { - rc = -ENOMEM; - goto fail_early; - } - memflags = MEMF_bits(exch.out.address_bits); - } - if ( exch.in.extent_order <= exch.out.extent_order ) { in_chunk_order = exch.out.extent_order - exch.in.extent_order; @@ -343,6 +331,9 @@ static long memory_exchange(XEN_GUEST_HA } d = current->domain; + memflags |= MEMF_bits(domain_clamp_alloc_bitsize( + d, exch.out.address_bits ? : (BITS_PER_LONG+PAGE_SHIFT))); + cpu = select_local_cpu(d); for ( i = (exch.nr_exchanged >> in_chunk_order); @@ -415,7 +406,7 @@ static long memory_exchange(XEN_GUEST_HA if ( !test_and_clear_bit(_PGC_allocated, &page->count_info) ) BUG(); mfn = page_to_mfn(page); - guest_physmap_remove_page(d, mfn_to_gmfn(d, mfn), mfn); + guest_physmap_remove_page(d, mfn_to_gmfn(d, mfn), mfn, 0); put_page(page); } @@ -436,8 +427,9 @@ static long memory_exchange(XEN_GUEST_HA mfn = page_to_mfn(page); if ( unlikely(paging_mode_translate(d)) ) { - for ( k = 0; k < (1UL << exch.out.extent_order); k++ ) - guest_physmap_add_page(d, gpfn + k, mfn + k); + /* Ignore failure here. There's nothing we can do. */ + (void)guest_physmap_add_page(d, gpfn, mfn, + exch.out.extent_order); } else { diff -Naurp xen/common/page_alloc.c xen-redhat/common/page_alloc.c --- xen/common/page_alloc.c +++ xen-redhat/common/page_alloc.c @@ -54,7 +54,7 @@ boolean_param("bootscrub", opt_bootscrub /* * Bit width of the DMA heap. */ -static unsigned int dma_bitsize = CONFIG_DMA_BITSIZE; +static unsigned int dma_bitsize = 0; static void __init parse_dma_bits(char *s) { unsigned int v = simple_strtol(s, NULL, 0); @@ -84,16 +84,12 @@ custom_param("dma_emergency_pool", parse #define round_pgdown(_p) ((_p)&PAGE_MASK) #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK) -static DEFINE_SPINLOCK(page_scrub_lock); -LIST_HEAD(page_scrub_list); -static unsigned long scrub_pages; - /********************* * ALLOCATION BITMAP * One bit per page of memory. Bit set => page is allocated. */ -static unsigned long *alloc_bitmap; +unsigned long *alloc_bitmap; #define PAGES_PER_MAPWORD (sizeof(unsigned long) * 8) #define allocated_in_map(_pn) \ @@ -366,7 +362,6 @@ static struct page_info *alloc_heap_page struct page_info *pg; ASSERT(node >= 0); - ASSERT(node < num_nodes); ASSERT(zone_lo <= zone_hi); ASSERT(zone_hi < NR_ZONES); @@ -395,8 +390,9 @@ static struct page_info *alloc_heap_page } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */ /* Pick next node, wrapping around if needed. */ - if ( ++node == num_nodes ) - node = 0; + node = next_node(node, node_online_map); + if (node == MAX_NUMNODES) + node = first_node(node_online_map); } /* No suitable memory blocks. Fail the request. 
*/ @@ -458,7 +454,6 @@ static void free_heap_pages( ASSERT(zone < NR_ZONES); ASSERT(order <= MAX_ORDER); ASSERT(node >= 0); - ASSERT(node < num_online_nodes()); for ( i = 0; i < (1 << order); i++ ) { @@ -571,13 +566,13 @@ void init_heap_pages( static unsigned long avail_heap_pages( unsigned int zone_lo, unsigned int zone_hi, unsigned int node) { - unsigned int i, zone, num_nodes = num_online_nodes(); + unsigned int i, zone; unsigned long free_pages = 0; if ( zone_hi >= NR_ZONES ) zone_hi = NR_ZONES - 1; - for ( i = 0; i < num_nodes; i++ ) + for_each_online_node(i) { if ( !avail[i] ) continue; @@ -609,6 +604,20 @@ void __init end_boot_allocator(void) init_heap_pages(pfn_dom_zone_type(i), mfn_to_page(i), 1); } + if (dma_bitsize == 0) + { +#ifdef CONFIG_X86 + if (num_online_nodes() > 1) + dma_bitsize = min_t(unsigned int, + fls(NODE_DATA(0)->node_spanned_pages) - 1 + PAGE_SHIFT - 2, + 32); + else + dma_bitsize = CONFIG_DMA_BITSIZE; +#else + dma_bitsize = CONFIG_DMA_BITSIZE; +#endif + } + printk("Domain heap initialised: DMA width %u bits\n", dma_bitsize); } #undef avail_for_domheap @@ -620,7 +629,6 @@ void __init end_boot_allocator(void) */ void __init scrub_heap_pages(void) { - void *p; unsigned long mfn; if ( !opt_bootscrub ) @@ -644,21 +652,7 @@ void __init scrub_heap_pages(void) /* Re-check page status with lock held. */ if ( !allocated_in_map(mfn) ) - { - if ( is_xen_heap_frame(mfn_to_page(mfn)) ) - { - p = page_to_virt(mfn_to_page(mfn)); - memguard_unguard_range(p, PAGE_SIZE); - clear_page(p); - memguard_guard_range(p, PAGE_SIZE); - } - else - { - p = map_domain_page(mfn); - clear_page(p); - unmap_domain_page(p); - } - } + scrub_one_page(mfn_to_page(mfn)); spin_unlock(&heap_lock); } @@ -817,15 +811,13 @@ struct page_info *__alloc_domheap_pages( ASSERT(!in_irq()); - if ( bits ) - { - bits = domain_clamp_alloc_bitsize(d, bits); - if ( bits <= (PAGE_SHIFT + 1) ) - return NULL; - bits -= PAGE_SHIFT + 1; - if ( bits < zone_hi ) - zone_hi = bits; - } + bits = domain_clamp_alloc_bitsize(d, bits ? : (BITS_PER_LONG+PAGE_SHIFT)); + if ( bits <= (PAGE_SHIFT + 1) ) + return NULL; + + bits -= PAGE_SHIFT + 1; + if ( bits < zone_hi ) + zone_hi = bits; if ( (zone_hi + PAGE_SHIFT) >= dma_bitsize ) { @@ -897,26 +889,16 @@ void free_domheap_pages(struct page_info spin_unlock_recursive(&d->page_alloc_lock); - if ( likely(!d->is_dying) ) - { - free_heap_pages(pfn_dom_zone_type(page_to_mfn(pg)), pg, order); - } - else - { - /* - * Normally we expect a domain to clear pages before freeing them, - * if it cares about the secrecy of their contents. However, after - * a domain has died we assume responsibility for erasure. - */ + /* + * Normally we expect a domain to clear pages before freeing them, + * if it cares about the secrecy of their contents. However, after + * a domain has died we assume responsibility for erasure. + */ + if ( unlikely(d->is_dying) ) for ( i = 0; i < (1 << order); i++ ) - { - page_set_owner(&pg[i], NULL); - spin_lock(&page_scrub_lock); - list_add(&pg[i].list, &page_scrub_list); - scrub_pages++; - spin_unlock(&page_scrub_lock); - } - } + scrub_one_page(&pg[i]); + + free_heap_pages(pfn_dom_zone_type(page_to_mfn(pg)), pg, order); } else { @@ -929,6 +911,23 @@ void free_domheap_pages(struct page_info put_domain(d); } +unsigned long avail_domheap_pages_region( + unsigned int node, unsigned int min_width, unsigned int max_width) +{ + int zone_lo, zone_hi; + + zone_lo = min_width ? 
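The end_boot_allocator() change above stops hard-wiring the DMA heap width on multi-node x86 and instead sizes it from node 0: roughly a quarter of node 0's span, capped at 32 bits. A small worked example of that formula (PAGE_SHIFT and the node sizes below are illustrative inputs, fls_ul is a stand-in for the kernel's fls):

    #include <stdio.h>

    static unsigned int fls_ul(unsigned long x)
    {
        unsigned int r = 0;
        while (x) { r++; x >>= 1; }
        return r;
    }

    int main(void)
    {
        const unsigned int PAGE_SHIFT = 12;
        unsigned long node0_pages[] = { 1UL << 18, 1UL << 22 };  /* 1 GiB, 16 GiB */

        for (int i = 0; i < 2; i++) {
            unsigned int bits = fls_ul(node0_pages[i]) - 1 + PAGE_SHIFT - 2;
            if (bits > 32)
                bits = 32;
            /* 1 GiB node -> 28 bits (256 MiB); 16 GiB node -> capped at 32 bits */
            printf("node0 = %lu pages -> DMA width %u bits\n",
                   node0_pages[i], bits);
        }
        return 0;
    }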
(min_width - (PAGE_SHIFT + 1)) : (MEMZONE_XEN + 1); + zone_lo = max_t(int, MEMZONE_XEN + 1, zone_lo); + zone_lo = min_t(int, NR_ZONES - 1, zone_lo); + + zone_hi = max_width ? (max_width - (PAGE_SHIFT + 1)) : (NR_ZONES - 1); + zone_hi = max_t(int, MEMZONE_XEN + 1, zone_hi); + zone_hi = min_t(int, NR_ZONES - 1, zone_hi); + + return avail_heap_pages(zone_lo, zone_hi, node); +} + + unsigned long avail_domheap_pages(void) { @@ -950,11 +949,6 @@ unsigned long avail_domheap_pages(void) return avail_nrm + avail_dma; } -unsigned long avail_nodeheap_pages(int node) -{ - return avail_heap_pages(0, NR_ZONES - 1, node); -} - static void pagealloc_keyhandler(unsigned char key) { unsigned int zone = MEMZONE_XEN; @@ -992,70 +986,19 @@ static __init int pagealloc_keyhandler_i } __initcall(pagealloc_keyhandler_init); - - -/************************* - * PAGE SCRUBBING - */ - -static DEFINE_PER_CPU(struct timer, page_scrub_timer); - -static void page_scrub_softirq(void) +void scrub_one_page(struct page_info *pg) { - struct list_head *ent; - struct page_info *pg; - void *p; - int i; - s_time_t start = NOW(); - - /* Aim to do 1ms of work every 10ms. */ - do { - spin_lock(&page_scrub_lock); - - if ( unlikely((ent = page_scrub_list.next) == &page_scrub_list) ) - { - spin_unlock(&page_scrub_lock); - return; - } - - /* Peel up to 16 pages from the list. */ - for ( i = 0; i < 16; i++ ) - { - if ( ent->next == &page_scrub_list ) - break; - ent = ent->next; - } - - /* Remove peeled pages from the list. */ - ent->next->prev = &page_scrub_list; - page_scrub_list.next = ent->next; - scrub_pages -= (i+1); - - spin_unlock(&page_scrub_lock); + void *p = map_domain_page(page_to_mfn(pg)); - /* Working backwards, scrub each page in turn. */ - while ( ent != &page_scrub_list ) - { - pg = list_entry(ent, struct page_info, list); - ent = ent->prev; - p = map_domain_page(page_to_mfn(pg)); - clear_page(p); - unmap_domain_page(p); - free_heap_pages(pfn_dom_zone_type(page_to_mfn(pg)), pg, 0); - } - } while ( (NOW() - start) < MILLISECS(1) ); - - set_timer(&this_cpu(page_scrub_timer), NOW() + MILLISECS(10)); -} - -static void page_scrub_timer_fn(void *unused) -{ - page_scrub_schedule_work(); -} +#ifndef NDEBUG + /* Avoid callers relying on allocations returning zeroed pages. */ + memset(p, 0xc2, PAGE_SIZE); +#else + /* For a production build, clear_page() is the fastest way to scrub. */ + clear_page(p); +#endif -unsigned long avail_scrub_pages(void) -{ - return scrub_pages; + unmap_domain_page(p); } static void dump_heap(unsigned char key) @@ -1083,18 +1026,6 @@ static __init int register_heap_trigger( } __initcall(register_heap_trigger); - -static __init int page_scrub_init(void) -{ - int cpu; - for_each_cpu ( cpu ) - init_timer(&per_cpu(page_scrub_timer, cpu), - page_scrub_timer_fn, NULL, cpu); - open_softirq(PAGE_SCRUB_SOFTIRQ, page_scrub_softirq); - return 0; -} -__initcall(page_scrub_init); - /* * Local variables: * mode: C diff -Naurp xen/common/sched_credit.c xen-redhat/common/sched_credit.c --- xen/common/sched_credit.c +++ xen-redhat/common/sched_credit.c @@ -6,7 +6,8 @@ * Author: Emmanuel Ackaouy * * Description: Credit-based SMP CPU scheduler - */ + * +*/ #include <xen/config.h> #include <xen/init.h> @@ -48,14 +49,24 @@ #define CSCHED_CREDITS_PER_ACCT \ (CSCHED_CREDITS_PER_TICK * CSCHED_TICKS_PER_ACCT) +/* opt_hardvirt: This enables the both the dom0 bypass and + * hard virt dom0. By default these are disabled so as to + * keep behavior as expected for workloads running on an + * existing dom0. 
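The opt_hardvirt switch documented above (declared just below as a boolean_param) is parsed by the generic command-line code, so with the cmdline_parse() extension earlier in this patch it can be disabled with "no", "off", "false" or "0". A tiny sketch of that accepted-value test, mirroring the OPT_BOOL branch (the helper name is illustrative):

    #include <stdio.h>
    #include <string.h>

    /* Mirrors the OPT_BOOL handling in cmdline_parse(): the option starts
     * asserted and any of these spellings negates it. */
    static int parse_bool_opt(const char *optval)
    {
        int assert_val = 1;

        if (!strcmp("no", optval) || !strcmp("off", optval) ||
            !strcmp("false", optval) || !strcmp("0", optval))
            assert_val = !assert_val;

        return assert_val;
    }

    int main(void)
    {
        const char *vals[] = { "on", "true", "off", "0" };
        for (int i = 0; i < 4; i++)
            printf("hardvirt=%s -> %d\n", vals[i], parse_bool_opt(vals[i]));
        return 0;
    }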
+ */ +static int opt_hardvirt = 0; +boolean_param("hardvirt", opt_hardvirt); + /* * Priorities */ -#define CSCHED_PRI_TS_BOOST 0 /* time-share waking up */ #define CSCHED_PRI_TS_UNDER -1 /* time-share w/ credits */ #define CSCHED_PRI_TS_OVER -2 /* time-share w/o credits */ #define CSCHED_PRI_IDLE -64 /* idle */ +#define CSCHED_PRI_RR 10 /* Dom-0 and Hard-Virts - HV*/ + +#define NUMBER_DOM0_VCPUS_PRESENT(_cpu) (CSCHED_PCPU(_cpu)->number_of_dom0_vcpus_present) /* @@ -123,7 +134,12 @@ _MACRO(dom_init) \ _MACRO(dom_destroy) \ _MACRO(vcpu_init) \ - _MACRO(vcpu_destroy) + _MACRO(vcpu_destroy) \ + _MACRO(tickle_hard_virt_none) \ + _MACRO(rt_imbalance) \ + _MACRO(rt_vcpu_migrate) \ + _MACRO(rt_steal_trylock_failed) + #ifndef NDEBUG #define CSCHED_STATS_EXPAND_CHECKS(_MACRO) \ @@ -188,6 +204,8 @@ struct csched_pcpu { uint32_t runq_sort_last; struct timer ticker; unsigned int tick; + uint16_t number_of_dom0_vcpus_present; + uint16_t unused; /* HV */ }; /* @@ -201,6 +219,11 @@ struct csched_vcpu { atomic_t credit; uint16_t flags; int16_t pri; + int credit_real_incr; + atomic_t hard_virt_pcpu; /* HV */ + uint16_t hard_virt_pcpu_state_change; /* HV */ + uint16_t unused; + #ifdef CSCHED_STATS struct { int credit_last; @@ -239,6 +262,9 @@ struct csched_private { int credit_balance; uint32_t runq_sort; CSCHED_STATS_DEFINE() + spinlock_t hard_virt_lock; /* HV */ + cpumask_t hard_virt_none; /* 1 by default - meaning it has no RT vcpu */ + cpumask_t hard_virt_multiple; /* 0 by default - meaning it has no more than 1 RT vcpu */ }; @@ -249,6 +275,9 @@ static struct csched_private csched_priv static void csched_tick(void *_cpu); +/* HV - Protected by hard_virt_lock */ +static unsigned int total_hard_virts=0; + static inline int __cycle_cpu(int cpu, const cpumask_t *mask) { @@ -275,14 +304,92 @@ __runq_insert(unsigned int cpu, struct c { const struct list_head * const runq = RUNQ(cpu); struct list_head *iter; + int credit, new_credit; + + + BUG_ON( __vcpu_on_runq(svc) ); + BUG_ON( cpu != svc->vcpu->processor ); + + /* HV - No race condition for hard_virt_pcpu_state_change here */ + if (svc->hard_virt_pcpu_state_change) + { + svc->hard_virt_pcpu_state_change = 0; + if (atomic_read(&svc->hard_virt_pcpu)) + svc->pri = CSCHED_PRI_RR; + else if (svc->pri == CSCHED_PRI_RR && svc->vcpu->domain->domain_id != 0) + { + if (atomic_read(&svc->credit) > 0) + svc->pri = CSCHED_PRI_TS_UNDER; + else + svc->pri = CSCHED_PRI_TS_OVER; + } + } + if (svc->vcpu->domain->domain_id == 0) + NUMBER_DOM0_VCPUS_PRESENT(cpu)++; + + new_credit = atomic_read(&svc->credit); + + if (new_credit >= CSCHED_CREDITS_PER_TSLICE/2) + { + list_for_each( iter, runq ) + { + const struct csched_vcpu * const iter_svc = __runq_elem(iter); + if (svc->pri > iter_svc->pri ) + break; + credit = atomic_read(&iter_svc->credit); + if ( svc->pri == iter_svc->pri && credit < (CSCHED_CREDITS_PER_TSLICE/2) ) + break; + } + } + else + { + list_for_each( iter, runq ) + { + const struct csched_vcpu * const iter_svc = __runq_elem(iter); + if ( svc->pri > iter_svc->pri ) + break; + } + } + + list_add_tail(&svc->runq_elem, iter); +} + +static inline void +__runq_insert_special(unsigned int cpu, struct csched_vcpu *svc) +{ + const struct list_head * const runq = RUNQ(cpu); + struct list_head *iter; + int new_credit, credit; BUG_ON( __vcpu_on_runq(svc) ); BUG_ON( cpu != svc->vcpu->processor ); + /* HV */ + if (svc->hard_virt_pcpu_state_change) + { + svc->hard_virt_pcpu_state_change = 0; + if (atomic_read(&svc->hard_virt_pcpu)) + svc->pri = CSCHED_PRI_RR; + else if (svc->pri 
== CSCHED_PRI_RR && svc->vcpu->domain->domain_id != 0) + { + if (atomic_read(&svc->credit) > 0) + svc->pri = CSCHED_PRI_TS_UNDER; + else + svc->pri = CSCHED_PRI_TS_OVER; + } + } + if (svc->vcpu->domain->domain_id == 0) + NUMBER_DOM0_VCPUS_PRESENT(cpu)++; + + new_credit = atomic_read(&svc->credit); + list_for_each( iter, runq ) { const struct csched_vcpu * const iter_svc = __runq_elem(iter); if ( svc->pri > iter_svc->pri ) + break; + credit = atomic_read(&iter_svc->credit); + if ( (svc->pri == iter_svc->pri && new_credit >= credit)) break; } @@ -294,6 +401,24 @@ __runq_remove(struct csched_vcpu *svc) { BUG_ON( !__vcpu_on_runq(svc) ); list_del_init(&svc->runq_elem); + + /* HV */ + if (svc->vcpu->domain->domain_id == 0) + NUMBER_DOM0_VCPUS_PRESENT(svc->vcpu->processor)--; + + if (svc->hard_virt_pcpu_state_change) + { + svc->hard_virt_pcpu_state_change = 0; + if (atomic_read(&svc->hard_virt_pcpu)) + svc->pri = CSCHED_PRI_RR; + else if (svc->pri == CSCHED_PRI_RR && svc->vcpu->domain->domain_id != 0 ) + { + if (atomic_read(&svc->credit) > 0) + svc->pri = CSCHED_PRI_TS_UNDER; + else + svc->pri = CSCHED_PRI_TS_OVER; + } + } } static inline void @@ -302,12 +427,18 @@ __runq_tickle(unsigned int cpu, struct c struct csched_vcpu * const cur = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr); cpumask_t mask; + int newcredit, curcredit; ASSERT(cur); cpus_clear(mask); /* If strictly higher priority than current VCPU, signal the CPU */ - if ( new->pri > cur->pri ) + newcredit = atomic_read(&new->credit); + curcredit = atomic_read(&cur->credit); + /* HV */ + if ((opt_hardvirt && new->vcpu->domain->domain_id == 0) || + (new->pri > cur->pri) || + (new->pri == cur->pri && newcredit > curcredit && newcredit > -(CSCHED_CREDITS_PER_TSLICE>>3)) ) { if ( cur->pri == CSCHED_PRI_IDLE ) CSCHED_STAT_CRANK(tickle_local_idler); @@ -339,6 +470,18 @@ __runq_tickle(unsigned int cpu, struct c } } + /* HV - Small chance of false positive in hard_virt_none map here */ + if ( cur->pri == CSCHED_PRI_RR && new->pri == CSCHED_PRI_RR ) + { + cpu_set(cpu, csched_priv.hard_virt_multiple); + if ( ! cpus_empty(csched_priv.hard_virt_none) ) + { + CSCHED_STAT_CRANK(tickle_hard_virt_none); + cpus_or(mask, mask, csched_priv.hard_virt_none); + cpus_and(mask, mask, new->vcpu->cpu_affinity); + } + } + /* Send scheduler interrupts to designated CPUs */ if ( !cpus_empty(mask) ) cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ); @@ -367,11 +510,14 @@ csched_pcpu_init(int cpu) init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu); INIT_LIST_HEAD(&spc->runq); spc->runq_sort_last = csched_priv.runq_sort; + spc->number_of_dom0_vcpus_present = 0; + spc->unused = 0; /* HV */ per_cpu(schedule_data, cpu).sched_priv = spc; /* Start off idling... */ BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr)); cpu_set(cpu, csched_priv.idlers); + cpu_set(cpu, csched_priv.hard_virt_none); /* HV */ spin_unlock_irqrestore(&csched_priv.lock, flags); @@ -464,6 +610,20 @@ csched_cpu_pick(struct vcpu *vc) } else { + /* Hmm.. This is of questionable value.. + * There are many cases where Vcpus are better off + * being on the same socket due to effective L2 sharing + * and low impact of cache bouncing. + * In the absence of any other workload, moving the Vcpus + * to different cores will be useful transiently but when + * the system gets busy since there is no mechanism to assert + * socket level affinities, it will be a hit on the performance. + * NUMA smartness has also gone for a toss here. 
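The reworked __runq_insert() above no longer orders the runqueue by priority alone: a waking VCPU that still holds at least half a timeslice of credit is allowed to jump ahead of equal-priority entries that have already drained below half a slice. A compact model of that insertion-point test, with an illustrative timeslice value:

    #include <stdio.h>

    /* Returns nonzero if the new VCPU should be queued in front of the
     * current runqueue entry, per the modified __runq_insert() rules. */
    static int insert_before(int new_pri, int new_credit,
                             int cur_pri, int cur_credit, int tslice)
    {
        if (new_pri > cur_pri)
            return 1;
        /* Credit tiebreak applies only when the newcomer still has
         * at least half a timeslice banked. */
        if (new_credit >= tslice / 2 &&
            new_pri == cur_pri && cur_credit < tslice / 2)
            return 1;
        return 0;
    }

    int main(void)
    {
        const int tslice = 300;   /* illustrative, not the build constant */
        /* equal priority: a well-funded VCPU jumps ahead of a drained one */
        printf("%d\n", insert_before(-1, 200, -1, 20, tslice));   /* 1 */
        /* equal priority but the newcomer is drained too: keep FIFO order */
        printf("%d\n", insert_before(-1, 40, -1, 20, tslice));    /* 0 */
        return 0;
    }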
+ * + * Eventually we would want to allocate memory for Virts from + * local NUMA nodes in which case NUMA affinities need to + * implemented by the scheduler and this section + * needs to be thrown out */ ASSERT( !cpu_isset(nxt, cpu_core_map[cpu]) ); cpus_and(cpu_idlers, idlers, cpu_core_map[cpu]); cpus_and(nxt_idlers, idlers, cpu_core_map[nxt]); @@ -533,22 +693,23 @@ csched_vcpu_acct(unsigned int cpu) { struct csched_vcpu * const svc = CSCHED_VCPU(current); + int credit; + /* Update credits */ + credit = atomic_read(&svc->credit); + ASSERT( current->processor == cpu ); ASSERT( svc->sdom != NULL ); /* - * If this VCPU's priority was boosted when it last awoke, reset it. - * If the VCPU is found here, then it's consuming a non-negligeable - * amount of CPU resources and should no longer be boosted. - */ - if ( svc->pri == CSCHED_PRI_TS_BOOST ) - svc->pri = CSCHED_PRI_TS_UNDER; - - /* * Update credits */ atomic_sub(CSCHED_CREDITS_PER_TICK, &svc->credit); + if ( credit < CSCHED_CREDITS_PER_TICK && svc->pri ==CSCHED_PRI_TS_UNDER ) + { + svc->pri = CSCHED_PRI_TS_OVER; + } + /* * Put this VCPU and domain back on the active list if it was * idling. @@ -594,6 +755,14 @@ csched_vcpu_init(struct vcpu *vc) CSCHED_VCPU_STATS_RESET(svc); vc->sched_priv = svc; + /* HV */ + if (opt_hardvirt && vc->domain->domain_id == 0 && !is_idle_vcpu(vc)) + svc->pri = CSCHED_PRI_RR; + svc->credit_real_incr = 0; + atomic_set(&svc->hard_virt_pcpu, 0); /* HV */ + svc->hard_virt_pcpu_state_change = 0; + svc->unused = 0; + /* Allocate per-PCPU info */ if ( unlikely(!CSCHED_PCPU(vc->processor)) ) { @@ -617,6 +786,16 @@ csched_vcpu_destroy(struct vcpu *vc) BUG_ON( sdom == NULL ); BUG_ON( !list_empty(&svc->runq_elem) ); + /* HV */ + spin_lock(&csched_priv.hard_virt_lock); + if (atomic_read(&svc->hard_virt_pcpu)) + { + atomic_set(&svc->hard_virt_pcpu, 0); + svc->hard_virt_pcpu_state_change=1; + total_hard_virts--; + } + spin_unlock(&csched_priv.hard_virt_lock); + spin_lock_irqsave(&csched_priv.lock, flags); if ( !list_empty(&svc->active_vcpu_elem) ) @@ -666,37 +845,32 @@ csched_vcpu_wake(struct vcpu *vc) else CSCHED_STAT_CRANK(vcpu_wake_not_runnable); - /* - * We temporarly boost the priority of awaking VCPUs! - * - * If this VCPU consumes a non negligeable amount of CPU, it - * will eventually find itself in the credit accounting code - * path where its priority will be reset to normal. - * - * If on the other hand the VCPU consumes little CPU and is - * blocking and awoken a lot (doing I/O for example), its - * priority will remain boosted, optimizing it's wake-to-run - * latencies. - * - * This allows wake-to-run latency sensitive VCPUs to preempt - * more CPU resource intensive VCPUs without impacting overall - * system fairness. - * - * The one exception is for VCPUs of capped domains unpausing - * after earning credits they had overspent. We don't boost - * those. 
- */ - if ( svc->pri == CSCHED_PRI_TS_UNDER && - !(svc->flags & CSCHED_FLAG_VCPU_PARKED) ) - { - svc->pri = CSCHED_PRI_TS_BOOST; - } - /* Put the VCPU on the runq and tickle CPUs */ - __runq_insert(cpu, svc); + __runq_insert_special(cpu, svc); __runq_tickle(cpu, svc); } +/* HV - Count up all vcpus including offline ones */ +static unsigned int find_vcpu_count(struct domain *d) +{ + struct vcpu *v; + unsigned int vcpu_count=0; + for_each_vcpu(d, v) + vcpu_count++; + return vcpu_count; +} + +/* HV - Only online pcpus are considered as valid HV target */ +static unsigned int find_available_online_cpus(unsigned int max_cpus) +{ + int cpu; + unsigned int pcpu_count=0; + + for_each_online_cpu ( cpu ) + pcpu_count++; + return pcpu_count - total_hard_virts; +} + static int csched_dom_cntl( struct domain *d, @@ -705,15 +879,96 @@ csched_dom_cntl( struct csched_dom * const sdom = CSCHED_DOM(d); unsigned long flags; + /* HV */ + unsigned short hard_virt, vcpu; + unsigned int vcpus_in_domain, hard_cpus_available; + if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo ) { - op->u.credit.weight = sdom->weight; + /* HV */ + op->u.credit.weight = sdom->weight + (atomic_read(&d->hard_virt) << 15) ; op->u.credit.cap = sdom->cap; } else { ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo); + /* HV */ + hard_virt = (op->u.credit.weight >> 15) & 0x1; + op->u.credit.weight &= 0x7fff; + + if (hard_virt != atomic_read(&d->hard_virt)) + { + if (!hard_virt) + { + /* This will convert a hard-virt to virt - This really shouldn't fail */ + printk("Taking down hard-virt %u\n", d->domain_id); + spin_lock(&csched_priv.hard_virt_lock); + for (vcpu=0; vcpu < MAX_VIRT_CPUS; vcpu++) + { + if (d->vcpu[vcpu] == NULL) + break; + if ( atomic_read( &(CSCHED_VCPU(d->vcpu[vcpu])->hard_virt_pcpu) ) ) + { + atomic_set(&(CSCHED_VCPU(d->vcpu[vcpu])->hard_virt_pcpu), 0); + CSCHED_VCPU(d->vcpu[vcpu])->hard_virt_pcpu_state_change = 1; + } + total_hard_virts--; + } + atomic_set(&d->hard_virt, 0); + spin_unlock(&csched_priv.hard_virt_lock); + if (total_hard_virts < 0){ + printk("total_hard_virts less than 0!!\n"); + total_hard_virts = 0; + } + } + else + { + /* This will convert the virt into a hard-virt - If this fails, + * the entire operation fails + */ + /* Hard Virt conversion is made atomic with respect to hardvirt + * destruction code path using a spinlock + */ + printk("Creating Hard-Virt %u\n", d->domain_id); + if (sdom->cap != 0U) + { + return -0xDEAD; + } + if (d->domain_id == 0) + { + return -0xDEAD; + } + + spin_lock(&csched_priv.hard_virt_lock); + vcpus_in_domain = find_vcpu_count(d); + hard_cpus_available = find_available_online_cpus(vcpus_in_domain); + printk("to convert %d - available %d \n", vcpus_in_domain, hard_cpus_available); + if (vcpus_in_domain > hard_cpus_available) + { + spin_unlock(&csched_priv.hard_virt_lock); + return -0xDEAD; + } + atomic_set(&d->hard_virt, 1); + for (vcpu=0; vcpu < MAX_VIRT_CPUS; vcpu++) + { + if (d->vcpu[vcpu] == NULL) + break; + if ( atomic_read(&(CSCHED_VCPU(d->vcpu[vcpu])->hard_virt_pcpu)) ) + { + spin_unlock(&csched_priv.hard_virt_lock); + printk("Vcpu %d already has a pcpu assigned - Aborting half way through.. 
\n", vcpu); + atomic_set(&d->hard_virt, 0); + return -0xDEAD; + } + atomic_set(&(CSCHED_VCPU(d->vcpu[vcpu])->hard_virt_pcpu), 1); + CSCHED_VCPU(d->vcpu[vcpu])->hard_virt_pcpu_state_change = 1; + total_hard_virts++; + } + spin_unlock(&csched_priv.hard_virt_lock); + } + } + spin_lock_irqsave(&csched_priv.lock, flags); if ( op->u.credit.weight != 0 ) @@ -726,7 +981,7 @@ csched_dom_cntl( sdom->weight = op->u.credit.weight; } - if ( op->u.credit.cap != (uint16_t)~0U ) + if ( op->u.credit.cap != (uint16_t)~0U && !atomic_read(&d->hard_virt) ) sdom->cap = op->u.credit.cap; spin_unlock_irqrestore(&csched_priv.lock, flags); @@ -783,6 +1038,7 @@ csched_runq_sort(unsigned int cpu) struct csched_vcpu *svc_elem; unsigned long flags; int sort_epoch; + int credit; sort_epoch = csched_priv.runq_sort; if ( sort_epoch == spc->runq_sort_last ) @@ -801,7 +1057,32 @@ csched_runq_sort(unsigned int cpu) next = elem->next; svc_elem = __runq_elem(elem); - if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER ) + if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER || svc_elem->pri == CSCHED_PRI_RR ) + { + /* does elem need to move up the runq? */ + if ( elem->prev != last_under ) + { + list_del(elem); + list_add(elem, last_under); + } + last_under = elem; + } + + elem = next; + } + + elem = runq->next; + last_under = runq; + + while ( elem != runq ) + { + next = elem->next; + svc_elem = __runq_elem(elem); + if (svc_elem->pri != CSCHED_PRI_TS_UNDER && svc_elem->pri != CSCHED_PRI_RR) + break; + credit = atomic_read (&svc_elem->credit); + + if ( credit >= CSCHED_CREDITS_PER_TSLICE/2 ) { /* does elem need to move up the runq? */ if ( elem->prev != last_under ) @@ -814,6 +1095,31 @@ csched_runq_sort(unsigned int cpu) elem = next; } + /* HV - TODO - This sucks - 3 scans !! - Old-fashioned bubble sort is + likely to be no worse in most cases - Consider a rewrite */ + elem = runq->next; + last_under = runq; + + while ( elem != runq ) + { + next = elem->next; + svc_elem = __runq_elem(elem); + if (svc_elem->pri != CSCHED_PRI_TS_UNDER && svc_elem->pri != CSCHED_PRI_RR) + break; + if ( svc_elem->pri == CSCHED_PRI_RR ) + { + /* does elem need to move up the runq? 
*/ + if ( elem->prev != last_under ) + { + list_del(elem); + list_add(elem, last_under); + } + last_under = elem; + } + + elem = next; + } + spin_unlock_irqrestore(&per_cpu(schedule_data, cpu).schedule_lock, flags); } @@ -835,6 +1141,8 @@ csched_acct(void) int credit_balance; int credit_xtra; int credit; + uint32_t max_credit; + int credit_prev, credit_real_incr; spin_lock_irqsave(&csched_priv.lock, flags); @@ -945,8 +1253,34 @@ csched_acct(void) BUG_ON( sdom != svc->sdom ); /* Increment credit */ - atomic_add(credit_fair, &svc->credit); credit = atomic_read(&svc->credit); + credit_prev = credit; + credit_real_incr = svc->credit_real_incr; + + if (credit <= 0) + credit += credit_fair; + else + { + if ( sdom->cap != 0U ) + { + if (!vcpu_runnable(svc->vcpu)) + { + credit = credit/2; + if (credit > credit_fair/2) + credit = credit_fair/2; + } + } + /* If this earned fair share of credits last time + then allow rollover credits */ + if ( credit_real_incr > credit_fair ) + { + credit -= credit_real_incr - credit_fair; + if (credit < 0) + credit = 0; + } + credit += credit_fair; + } + atomic_set(&svc->credit, credit); /* * Recompute priority or, if VCPU is idling, remove it from @@ -954,29 +1288,33 @@ csched_acct(void) */ if ( credit < 0 ) { - svc->pri = CSCHED_PRI_TS_OVER; - - /* Park running VCPUs of capped-out domains */ - if ( sdom->cap != 0U && - credit < -credit_cap && - !(svc->flags & CSCHED_FLAG_VCPU_PARKED) ) + if (svc->pri != CSCHED_PRI_RR) { - CSCHED_STAT_CRANK(vcpu_park); - vcpu_pause_nosync(svc->vcpu); - svc->flags |= CSCHED_FLAG_VCPU_PARKED; - } + svc->pri = CSCHED_PRI_TS_OVER; + + /* Park running VCPUs of capped-out domains */ + if ( sdom->cap != 0U && + credit < -credit_cap && + !(svc->flags & CSCHED_FLAG_VCPU_PARKED) ) + { + CSCHED_STAT_CRANK(vcpu_park); + vcpu_pause_nosync(svc->vcpu); + svc->flags |= CSCHED_FLAG_VCPU_PARKED; + } + } /* Lower bound on credits */ - if ( credit < -CSCHED_CREDITS_PER_TSLICE ) + if ( credit < -(CSCHED_CREDITS_PER_TSLICE<<1) ) { CSCHED_STAT_CRANK(acct_min_credit); - credit = -CSCHED_CREDITS_PER_TSLICE; + credit = -(CSCHED_CREDITS_PER_TSLICE<<1); atomic_set(&svc->credit, credit); } - } + } else { - svc->pri = CSCHED_PRI_TS_UNDER; + if (svc->pri != CSCHED_PRI_RR) + svc->pri = CSCHED_PRI_TS_UNDER; /* Unpark any capped domains whose credits go positive */ if ( svc->flags & CSCHED_FLAG_VCPU_PARKED) @@ -992,17 +1330,25 @@ csched_acct(void) } /* Upper bound on credits means VCPU stops earning */ - if ( credit > CSCHED_CREDITS_PER_TSLICE ) - { + max_credit = (credit_fair << 1) + credit_fair; + if (max_credit > 3*CSCHED_CREDITS_PER_TSLICE/2) + max_credit = 3*CSCHED_CREDITS_PER_TSLICE/2; + else if (max_credit < CSCHED_CREDITS_PER_TSLICE/2) + max_credit = CSCHED_CREDITS_PER_TSLICE/2; + if ( credit > max_credit ){ + credit = max_credit; __csched_vcpu_acct_stop_locked(svc); - credit = 0; atomic_set(&svc->credit, credit); } + } CSCHED_VCPU_STAT_SET(svc, credit_last, credit); CSCHED_VCPU_STAT_SET(svc, credit_incr, credit_fair); + svc->credit_real_incr = credit - credit_prev; credit_balance += credit; + if (credit_fair > svc->credit_real_incr) + credit_total += credit_fair - svc->credit_real_incr; } } @@ -1048,18 +1394,21 @@ csched_tick(void *_cpu) * once per accounting period (currently 30 milliseconds). 
*/ csched_runq_sort(cpu); + cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK)); } static struct csched_vcpu * -csched_runq_steal(int peer_cpu, int cpu, int pri) +csched_runq_steal(int peer_cpu, int cpu, int pri, int credit) { const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu); const struct vcpu * const peer_vcpu = per_cpu(schedule_data, peer_cpu).curr; struct csched_vcpu *speer; struct list_head *iter; struct vcpu *vc; + int speer_credit; + /* * Don't steal from an idle CPU's runq because it's about to @@ -1075,8 +1424,10 @@ csched_runq_steal(int peer_cpu, int cpu, * If next available VCPU here is not of strictly higher * priority than ours, this PCPU is useless to us. */ - if ( speer->pri <= pri ) - break; + speer_credit = atomic_read(&speer->credit); + if ( speer->pri <= CSCHED_PRI_IDLE || speer->pri < pri + || (speer->pri == pri && speer_credit <= (credit+(CSCHED_CREDITS_PER_TSLICE>>3)) ) ) + break; /* Is this VCPU is runnable on our PCPU? */ vc = speer->vcpu; @@ -1099,11 +1450,12 @@ csched_runq_steal(int peer_cpu, int cpu, } static struct csched_vcpu * -csched_load_balance(int cpu, struct csched_vcpu *snext) +csched_load_balance(int cpu, struct csched_vcpu *snext, int credit) { struct csched_vcpu *speer; cpumask_t workers; int peer_cpu; + int repeat_count = 15, lock_failure_flag = 0; BUG_ON( cpu != snext->vcpu->processor ); @@ -1114,6 +1466,7 @@ csched_load_balance(int cpu, struct csch else CSCHED_STAT_CRANK(load_balance_other); + spinLockRetry: /* * Peek at non-idling CPUs in the system, starting with our * immediate neighbour. @@ -1137,23 +1490,154 @@ csched_load_balance(int cpu, struct csch if ( !spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) ) { CSCHED_STAT_CRANK(steal_trylock_failed); + lock_failure_flag = 1; continue; } /* * Any work over there to steal? */ - speer = csched_runq_steal(peer_cpu, cpu, snext->pri); + speer = csched_runq_steal(peer_cpu, cpu, snext->pri, credit); spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock); if ( speer != NULL ) return speer; } + if ( opt_hardvirt && lock_failure_flag && snext->pri == CSCHED_PRI_IDLE && repeat_count > 1 ) + { + lock_failure_flag = 0; + repeat_count--; + goto spinLockRetry; + } + /* Failed to find more important work elsewhere... */ __runq_remove(snext); return snext; } +static struct csched_vcpu * +csched_runq_rr_steal(int peer_cpu, int cpu) +{ + const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu); + const struct vcpu * const peer_vcpu = per_cpu(schedule_data, peer_cpu).curr; + struct csched_vcpu *speer; + struct list_head *iter; + struct vcpu *vc; + + + /* + * Don't steal from an idle CPU's runq because it's about to + * pick up work from it itself. + */ + if ( peer_pcpu != NULL && !is_idle_vcpu(peer_vcpu) ) + { + list_for_each( iter, &peer_pcpu->runq ) + { + speer = __runq_elem(iter); + + /** If next available VCPU here is not of strictly higher + * priority than ours, this PCPU is useless to us. + */ + if ( speer->pri < CSCHED_PRI_RR ) + break; + + /* Is this VCPU is runnable on our PCPU? */ + vc = speer->vcpu; + BUG_ON( is_idle_vcpu(vc) ); + + if (__csched_vcpu_is_migrateable(vc, cpu)) + { + /* We got a candidate. Grab it! 
*/ + CSCHED_VCPU_STAT_CRANK(speer, migrate_q); + CSCHED_STAT_CRANK(migrate_queued); + __runq_remove(speer); + vc->processor = cpu; + return speer; + } + } + } + + return NULL; +} + +static struct csched_vcpu * +csched_rr_load_balance(int cpu, struct csched_vcpu *snext) +{ + struct csched_vcpu *speer; + cpumask_t workers; + int peer_cpu; + int repeat_count = 15, lock_failure_flag = 0; + + BUG_ON( cpu != snext->vcpu->processor ); + + spinLockRetry: + + cpus_and(workers, cpu_online_map, cpu_online_map); + cpu_clear(cpu, workers); + peer_cpu = cpu; + + while ( !cpus_empty(workers) ) + { + peer_cpu = __cycle_cpu(peer_cpu, &workers); + cpu_clear(peer_cpu, workers); + + /* + * Get ahold of the scheduler lock for this peer CPU. + * + * Note: We don't spin on this lock but simply try it. Spinning could + * cause a deadlock if the peer CPU is also load balancing and trying + * to lock this CPU. + */ + if ( !cpu_isset(peer_cpu, csched_priv.hard_virt_multiple)) + continue; + + if ( !spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) ) + { + CSCHED_STAT_CRANK(rt_steal_trylock_failed); + lock_failure_flag = 1; + continue; + } + + /* + * Any work over there to steal? + */ + speer = csched_runq_rr_steal(peer_cpu, cpu); + spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock); + if ( speer != NULL ) + { + CSCHED_STAT_CRANK(rt_vcpu_migrate); + return speer; + } + } + + if ( lock_failure_flag && snext->pri < CSCHED_PRI_RR && repeat_count > 1 ) + { + lock_failure_flag = 0; + repeat_count--; + goto spinLockRetry; + } + + /* Failed to find more important work elsewhere... */ + __runq_remove(snext); + return snext; +} + +static struct csched_vcpu * __runq_find_dom0_vcpu(int cpu) +{ + const struct list_head * const runq = RUNQ(cpu); + struct list_head *iter; + + list_for_each( iter, runq ) + { + struct csched_vcpu * iter_svc = __runq_elem(iter); + if (iter_svc->pri <= CSCHED_PRI_IDLE) + break; + if (iter_svc->vcpu->domain->domain_id == 0) + return iter_svc; + } + return NULL; +} + /* * This function is in the critical path. It is designed to be simple and * fast for the common case. @@ -1166,6 +1650,8 @@ csched_schedule(s_time_t now) struct csched_vcpu * const scurr = CSCHED_VCPU(current); struct csched_vcpu *snext; struct task_slice ret; + int credit; + struct csched_vcpu *temp_snext; CSCHED_STAT_CRANK(schedule); CSCHED_VCPU_CHECK(current); @@ -1173,11 +1659,26 @@ csched_schedule(s_time_t now) /* * Select next runnable local VCPU (ie top of local runq) */ + if (opt_hardvirt && current->domain->domain_id == 0 && vcpu_runnable(current)) + { + snext = scurr; + goto dom0_bypass; + } + if ( vcpu_runnable(current) ) __runq_insert(cpu, scurr); else BUG_ON( is_idle_vcpu(current) || list_empty(runq) ); + if (opt_hardvirt && NUMBER_DOM0_VCPUS_PRESENT(cpu) > 0) + { + snext = __runq_find_dom0_vcpu(cpu); + if (snext){ + __runq_remove(snext); + goto dom0_bypass; + } + } + snext = __runq_elem(runq->next); /* @@ -1188,10 +1689,32 @@ csched_schedule(s_time_t now) * urgent work... If not, csched_load_balance() will return snext, but * already removed from the runq. */ - if ( snext->pri > CSCHED_PRI_TS_OVER ) + /* HV - hard_virt_multiple might report false positive if a RR vcpu was + * put to sleep when it was in the runq or migrated off- Acceptable + * tradeoff for overhead of updating maps at sleep/wakeup points. 
+ * Since hard_virt_multiple for self isn't updated at this point, there is + * a very small chance of false positive from self + */ + if ( snext->pri < CSCHED_PRI_RR && !cpus_empty(csched_priv.hard_virt_multiple) ) + { + CSCHED_STAT_CRANK(rt_imbalance); + temp_snext = csched_rr_load_balance(cpu, snext); + if (temp_snext){ + snext = temp_snext; + goto dom0_bypass; + } + } + + credit = atomic_read(&snext->credit); + if ( snext->pri > CSCHED_PRI_TS_OVER && credit > (CSCHED_CREDITS_PER_TSLICE >> 2)) __runq_remove(snext); - else - snext = csched_load_balance(cpu, snext); + else{ + if (snext->pri <= CSCHED_PRI_IDLE) + credit = -(CSCHED_CREDITS_PER_TSLICE<<1); + snext = csched_load_balance(cpu, snext, credit); + } + + dom0_bypass: /* * Update idlers mask if necessary. When we're idling, other CPUs @@ -1206,6 +1729,22 @@ csched_schedule(s_time_t now) { cpu_clear(cpu, csched_priv.idlers); } + if ( snext->pri == CSCHED_PRI_RR ) + { + if ( cpu_isset(cpu, csched_priv.hard_virt_none) ) + cpu_clear(cpu, csched_priv.hard_virt_none); + if (!list_empty(runq) && __runq_elem(runq->next)->pri == CSCHED_PRI_RR) + cpu_set(cpu, csched_priv.hard_virt_multiple); + else + cpu_clear(cpu, csched_priv.hard_virt_multiple); + } + else + { + if (!cpu_isset(cpu, csched_priv.hard_virt_none)) + cpu_set(cpu, csched_priv.hard_virt_none); + if (cpu_isset(cpu, csched_priv.hard_virt_multiple)) + cpu_clear(cpu, csched_priv.hard_virt_multiple); + } /* * Return task to run next... @@ -1231,7 +1770,7 @@ csched_dump_vcpu(struct csched_vcpu *svc if ( sdom ) { - printk(" credit=%i [w=%u]", atomic_read(&svc->credit), sdom->weight); + printk(" credit=%i of %d [w=%u]", atomic_read(&svc->credit), svc->credit_real_incr, sdom->weight); #ifdef CSCHED_STATS printk(" (%d+%u) {a/i=%u/%u m=%u+%u}", svc->stats.credit_last, @@ -1257,10 +1796,11 @@ csched_dump_pcpu(int cpu) spc = CSCHED_PCPU(cpu); runq = &spc->runq; - printk(" sort=%d, sibling=0x%lx, core=0x%lx\n", + printk(" sort=%d, sibling=0x%lx, core=0x%lx dom0=%u\n", spc->runq_sort_last, cpu_sibling_map[cpu].bits[0], - cpu_core_map[cpu].bits[0]); + cpu_core_map[cpu].bits[0], + NUMBER_DOM0_VCPUS_PRESENT(cpu)); /* current VCPU */ svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr); @@ -1313,6 +1853,8 @@ csched_dump(void) CSCHED_TICKS_PER_ACCT); printk("idlers: 0x%lx\n", csched_priv.idlers.bits[0]); + printk("hard_virt_none: 0x%lx\n", csched_priv.hard_virt_none.bits[0]); + printk("hard_virt_multiple: 0x%lx\n", csched_priv.hard_virt_multiple.bits[0]); CSCHED_STATS_PRINTK(); @@ -1346,6 +1888,9 @@ csched_init(void) csched_priv.credit = 0U; csched_priv.credit_balance = 0; csched_priv.runq_sort = 0U; + spin_lock_init(&csched_priv.hard_virt_lock); /* HV */ + cpus_clear(csched_priv.hard_virt_none); + cpus_clear(csched_priv.hard_virt_multiple); CSCHED_STATS_RESET(); } diff -Naurp xen/common/schedule.c xen-redhat/common/schedule.c --- xen/common/schedule.c +++ xen-redhat/common/schedule.c @@ -37,10 +37,6 @@ static char opt_sched[10] = "credit"; string_param("sched", opt_sched); -/* opt_dom0_vcpus_pin: If true, dom0 VCPUs are pinned. */ -static unsigned int opt_dom0_vcpus_pin; -boolean_param("dom0_vcpus_pin", opt_dom0_vcpus_pin); - #define TIME_SLOP (s32)MICROSECS(50) /* allow time to slip a bit */ /* Various timer handlers. */ @@ -105,7 +101,7 @@ int sched_init_vcpu(struct vcpu *v, unsi * domain-0 VCPUs, are pinned onto their respective physical CPUs. 
*/ v->processor = processor; - if ( is_idle_domain(d) || ((d->domain_id == 0) && opt_dom0_vcpus_pin) ) + if ( is_idle_domain(d) || d->is_pinned ) v->cpu_affinity = cpumask_of_cpu(processor); else cpus_setall(v->cpu_affinity); @@ -250,12 +246,11 @@ void vcpu_force_reschedule(struct vcpu * } } -int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity) +static int __vcpu_set_affinity( + struct vcpu *v, cpumask_t *affinity, + bool_t old_lock_status, bool_t new_lock_status) { - cpumask_t online_affinity; - - if ( (v->domain->domain_id == 0) && opt_dom0_vcpus_pin ) - return -EINVAL; + cpumask_t online_affinity, old_affinity; cpus_and(online_affinity, *affinity, cpu_online_map); if ( cpus_empty(online_affinity) ) @@ -263,7 +258,18 @@ int vcpu_set_affinity(struct vcpu *v, cp vcpu_schedule_lock_irq(v); + if ( v->affinity_locked != old_lock_status ) + { + BUG_ON(!v->affinity_locked); + vcpu_schedule_unlock_irq(v); + return -EBUSY; + } + + v->affinity_locked = new_lock_status; + + old_affinity = v->cpu_affinity; v->cpu_affinity = *affinity; + *affinity = old_affinity; if ( !cpu_isset(v->processor, v->cpu_affinity) ) set_bit(_VPF_migrating, &v->pause_flags); @@ -278,6 +284,31 @@ int vcpu_set_affinity(struct vcpu *v, cp return 0; } +int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity) +{ + if ( v->domain->is_pinned ) + return -EINVAL; + return __vcpu_set_affinity(v, affinity, 0, 0); +} + +int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity) +{ + return __vcpu_set_affinity(v, affinity, 0, 1); +} + +void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity) +{ + cpumask_t online_affinity; + + /* Do not fail if no CPU in old affinity mask is online. */ + cpus_and(online_affinity, *affinity, cpu_online_map); + if ( cpus_empty(online_affinity) ) + *affinity = cpu_online_map; + + if ( __vcpu_set_affinity(v, affinity, 1, 0) != 0 ) + BUG(); +} + /* Block the currently-executing domain until a pertinent event occurs. */ static long do_block(void) { diff -Naurp xen/common/sysctl.c xen-redhat/common/sysctl.c --- xen/common/sysctl.c +++ xen-redhat/common/sysctl.c @@ -21,6 +21,9 @@ #include <xen/keyhandler.h> #include <asm/current.h> #include <public/sysctl.h> +#include <asm/numa.h> +#include <xen/nodemask.h> + extern long arch_do_sysctl( struct xen_sysctl *op, XEN_GUEST_HANDLE(xen_sysctl_t) u_sysctl); @@ -38,7 +41,17 @@ long do_sysctl(XEN_GUEST_HANDLE(xen_sysc return -EFAULT; if ( op->interface_version != XEN_SYSCTL_INTERFACE_VERSION ) - return -EACCES; + { + /* + * RHEL5 ABI compat: Allow through physinfo calls with + * newer versions for NUMA extensions + */ + if (op->cmd == XEN_SYSCTL_physinfo && + op->interface_version == (XEN_SYSCTL_INTERFACE_VERSION+1)) + dprintk(XENLOG_DEBUG, "Allowing physinfo call with newer ABI version\n"); + else + return -EACCES; + } spin_lock(&sysctl_lock); @@ -112,6 +125,18 @@ long do_sysctl(XEN_GUEST_HANDLE(xen_sysc } break; + case XEN_SYSCTL_availheap: + { + op->u.availheap.avail_bytes = avail_domheap_pages_region( + op->u.availheap.node, + op->u.availheap.min_bitwidth, + op->u.availheap.max_bitwidth); + op->u.availheap.avail_bytes <<= PAGE_SHIFT; + + ret = copy_to_guest(u_sysctl, op, 1) ? 
-EFAULT : 0; + } + break; + #ifdef PERF_COUNTERS case XEN_SYSCTL_perfc_op: { diff -Naurp xen/common/trace.c xen-redhat/common/trace.c --- xen/common/trace.c +++ xen-redhat/common/trace.c @@ -37,7 +37,7 @@ #define xen_t_buf t_buf CHECK_t_buf; #undef xen_t_buf -#define TB_COMPAT IS_COMPAT(dom0) +#define TB_COMPAT is_pv_32on64_domain(dom0) #else #define compat_t_rec t_rec #define TB_COMPAT 0 diff -Naurp xen/common/xenoprof.c xen-redhat/common/xenoprof.c --- xen/common/xenoprof.c +++ xen-redhat/common/xenoprof.c @@ -171,7 +171,7 @@ static int alloc_xenoprof_struct( bufsize = sizeof(struct xenoprof_buf); i = sizeof(struct event_log); #ifdef CONFIG_COMPAT - d->xenoprof->is_compat = IS_COMPAT(is_passive ? dom0 : d); + d->xenoprof->is_compat = is_pv_32on64_domain(is_passive ? dom0 : d); if ( XENOPROF_COMPAT(d->xenoprof) ) { bufsize = sizeof(struct compat_oprof_buf); diff -Naurp xen/drivers/acpi/tables.c xen-redhat/drivers/acpi/tables.c --- xen/drivers/acpi/tables.c +++ xen-redhat/drivers/acpi/tables.c @@ -59,6 +59,8 @@ static char *acpi_table_signatures[ACPI_ [ACPI_SPMI] = "SPMI", [ACPI_HPET] = "HPET", [ACPI_MCFG] = "MCFG", + [ACPI_DMAR] = "DMAR", + [ACPI_IVRS] = "IVRS", }; static char *mps_inti_flags_polarity[] = { "dfl", "high", "res", "low" }; diff -Naurp xen/drivers/char/console.c xen-redhat/drivers/char/console.c --- xen/drivers/char/console.c +++ xen-redhat/drivers/char/console.c @@ -313,18 +313,16 @@ static long guest_console_write(XEN_GUES while ( count > 0 ) { - while ( serial_tx_space(sercon_handle) < (SERIAL_TXBUFSZ / 2) ) - { - if ( hypercall_preempt_check() ) - break; - cpu_relax(); - } - if ( hypercall_preempt_check() ) return hypercall_create_continuation( __HYPERVISOR_console_io, "iih", CONSOLEIO_write, count, buffer); + if ( serial_tx_space(sercon_handle) < (SERIAL_TXBUFSZ / 2) ) + { + return 0; + } + kcount = min_t(int, count, sizeof(kbuf)-1); if ( copy_from_guest(kbuf, buffer, kcount) ) return -EFAULT; @@ -587,16 +585,6 @@ void __init console_endboot(void) switch_serial_input(); } -void console_start_log_everything(void) -{ - atomic_inc(&print_everything); -} - -void console_end_log_everything(void) -{ - atomic_dec(&print_everything); -} - void console_force_unlock(void) { spin_lock_init(&console_lock); @@ -611,14 +599,14 @@ void console_force_lock(void) void console_start_sync(void) { - console_start_log_everything(); + atomic_inc(&print_everything); serial_start_sync(sercon_handle); } void console_end_sync(void) { serial_end_sync(sercon_handle); - console_end_log_everything(); + atomic_dec(&print_everything); } void console_putc(char c) diff -Naurp xen/drivers/char/serial.c xen-redhat/drivers/char/serial.c --- xen/drivers/char/serial.c +++ xen-redhat/drivers/char/serial.c @@ -3,7 +3,7 @@ * * Framework for serial device drivers. * - * Copyright (c) 2003-2005, K A Fraser + * Copyright (c) 2003-2008, K A Fraser */ #include <xen/config.h> @@ -81,13 +81,21 @@ void serial_tx_interrupt(struct serial_p static void __serial_putc(struct serial_port *port, char c) { - int i; - if ( (port->txbuf != NULL) && !port->sync ) { /* Interrupt-driven (asynchronous) transmitter. */ + if ( port->tx_quench ) + { + /* Buffer filled and we are dropping characters. */ + if ( (port->txbufp - port->txbufc) > (SERIAL_TXBUFSZ / 2) ) + return; + port->tx_quench = 0; + } + if ( (port->txbufp - port->txbufc) == SERIAL_TXBUFSZ ) { +#ifdef SERIAL_NEVER_DROP_CHARS + int i; /* Buffer is full: we spin, but could alternatively drop chars. 
*/ while ( !port->driver->tx_empty(port) ) cpu_relax(); @@ -95,6 +103,10 @@ static void __serial_putc(struct serial_ port->driver->putc( port, port->txbuf[MASK_SERIAL_TXBUF_IDX(port->txbufc++)]); port->txbuf[MASK_SERIAL_TXBUF_IDX(port->txbufp++)] = c; +#else + /* Buffer is full: drop characters until buffer is half empty. */ + port->tx_quench = 1; +#endif } else if ( ((port->txbufp - port->txbufc) == 0) && port->driver->tx_empty(port) ) diff -Naurp xen/drivers/Makefile xen-redhat/drivers/Makefile --- xen/drivers/Makefile +++ xen-redhat/drivers/Makefile @@ -1,3 +1,6 @@ subdir-y += char +subdir-y += pci +subdir-$(x86_32) += passthrough +subdir-$(x86_64) += passthrough subdir-$(HAS_ACPI) += acpi subdir-$(HAS_VGA) += video diff -Naurp xen/drivers/passthrough/amd/iommu_acpi.c xen-redhat/drivers/passthrough/amd/iommu_acpi.c --- xen/drivers/passthrough/amd/iommu_acpi.c +++ xen-redhat/drivers/passthrough/amd/iommu_acpi.c @@ -0,0 +1,1041 @@ +/* + * Copyright (C) 2007 Advanced Micro Devices, Inc. + * Author: Leo Duran <leo.duran@amd.com> + * Author: Wei Wang <wei.wang2@amd.com> - adapted to xen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <xen/config.h> +#include <xen/errno.h> +#include <asm/amd-iommu.h> +#include <asm/hvm/svm/amd-iommu-proto.h> +#include <asm/hvm/svm/amd-iommu-acpi.h> + +extern unsigned long amd_iommu_page_entries; +extern unsigned short ivrs_bdf_entries; +extern struct ivrs_mappings *ivrs_mappings; +extern unsigned short last_bdf; +extern int ioapic_bdf[MAX_IO_APICS]; +unsigned int parse_ivrs_table_error; +extern void *shared_intremap_table; + +static void add_ivrs_mapping_entry( + u16 bdf, u16 alias_id, u8 flags, struct amd_iommu *iommu) +{ + u8 sys_mgt, lint1_pass, lint0_pass, nmi_pass, ext_int_pass, init_pass; + ASSERT( ivrs_mappings != NULL ); + + /* setup requestor id */ + ivrs_mappings[bdf].dte_requestor_id = alias_id; + + /* override flags for range of devices */ + sys_mgt = get_field_from_byte(flags, + AMD_IOMMU_ACPI_SYS_MGT_MASK, + AMD_IOMMU_ACPI_SYS_MGT_SHIFT); + lint1_pass = get_field_from_byte(flags, + AMD_IOMMU_ACPI_LINT1_PASS_MASK, + AMD_IOMMU_ACPI_LINT1_PASS_SHIFT); + lint0_pass = get_field_from_byte(flags, + AMD_IOMMU_ACPI_LINT0_PASS_MASK, + AMD_IOMMU_ACPI_LINT0_PASS_SHIFT); + nmi_pass = get_field_from_byte(flags, + AMD_IOMMU_ACPI_NMI_PASS_MASK, + AMD_IOMMU_ACPI_NMI_PASS_SHIFT); + ext_int_pass = get_field_from_byte(flags, + AMD_IOMMU_ACPI_EINT_PASS_MASK, + AMD_IOMMU_ACPI_EINT_PASS_SHIFT); + init_pass = get_field_from_byte(flags, + AMD_IOMMU_ACPI_INIT_PASS_MASK, + AMD_IOMMU_ACPI_INIT_PASS_SHIFT); + + ivrs_mappings[bdf].dte_sys_mgt_enable = sys_mgt; + ivrs_mappings[bdf].dte_lint1_pass = lint1_pass; + ivrs_mappings[bdf].dte_lint0_pass = lint0_pass; + ivrs_mappings[bdf].dte_nmi_pass = nmi_pass; + ivrs_mappings[bdf].dte_ext_int_pass = ext_int_pass; + ivrs_mappings[bdf].dte_init_pass = 
init_pass; + + if (ivrs_mappings[alias_id].intremap_table == NULL ) + { + /* allocate per-device interrupt remapping table */ + if ( amd_iommu_perdev_intremap ) + ivrs_mappings[alias_id].intremap_table = + amd_iommu_alloc_intremap_table(); + else + { + if ( shared_intremap_table == NULL ) + shared_intremap_table = amd_iommu_alloc_intremap_table(); + ivrs_mappings[alias_id].intremap_table = shared_intremap_table; + } + } + /* assgin iommu hardware */ + ivrs_mappings[bdf].iommu = iommu; +} + + +static struct amd_iommu * __init find_iommu_from_bdf_cap( + u16 bdf, u8 cap_offset) +{ + struct amd_iommu *iommu; + + for_each_amd_iommu ( iommu ) + if ( (iommu->bdf == bdf) && (iommu->cap_offset == cap_offset) ) + return iommu; + + return NULL; +} + +static void __init reserve_iommu_exclusion_range( + struct amd_iommu *iommu, uint64_t base, uint64_t limit) +{ + /* need to extend exclusion range? */ + if ( iommu->exclusion_enable ) + { + if ( iommu->exclusion_base < base ) + base = iommu->exclusion_base; + if ( iommu->exclusion_limit > limit ) + limit = iommu->exclusion_limit; + } + + iommu->exclusion_enable = IOMMU_CONTROL_ENABLED; + iommu->exclusion_base = base; + iommu->exclusion_limit = limit; +} + +static void __init reserve_iommu_exclusion_range_all( + struct amd_iommu *iommu, + unsigned long base, unsigned long limit) +{ + reserve_iommu_exclusion_range(iommu, base, limit); + iommu->exclusion_allow_all = IOMMU_CONTROL_ENABLED; +} + +static void __init reserve_unity_map_for_device( + u16 bdf, unsigned long base, + unsigned long length, u8 iw, u8 ir) +{ + unsigned long old_top, new_top; + + /* need to extend unity-mapped range? */ + if ( ivrs_mappings[bdf].unity_map_enable ) + { + old_top = ivrs_mappings[bdf].addr_range_start + + ivrs_mappings[bdf].addr_range_length; + new_top = base + length; + if ( old_top > new_top ) + new_top = old_top; + if ( ivrs_mappings[bdf].addr_range_start < base ) + base = ivrs_mappings[bdf].addr_range_start; + length = new_top - base; + } + + /* extend r/w permissioms and keep aggregate */ + ivrs_mappings[bdf].write_permission = iw; + ivrs_mappings[bdf].read_permission = ir; + ivrs_mappings[bdf].unity_map_enable = IOMMU_CONTROL_ENABLED; + ivrs_mappings[bdf].addr_range_start = base; + ivrs_mappings[bdf].addr_range_length = length; +} + +static int __init register_exclusion_range_for_all_devices( + unsigned long base, unsigned long limit, u8 iw, u8 ir) +{ + unsigned long range_top, iommu_top, length; + struct amd_iommu *iommu; + u16 bdf; + + /* is part of exclusion range inside of IOMMU virtual address space? 
*/ + /* note: 'limit' parameter is assumed to be page-aligned */ + range_top = limit + PAGE_SIZE; + iommu_top = max_page * PAGE_SIZE; + if ( base < iommu_top ) + { + if ( range_top > iommu_top ) + range_top = iommu_top; + length = range_top - base; + /* reserve r/w unity-mapped page entries for devices */ + /* note: these entries are part of the exclusion range */ + for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ ) + reserve_unity_map_for_device(bdf, base, length, iw, ir); + /* push 'base' just outside of virtual address space */ + base = iommu_top; + } + /* register IOMMU exclusion range settings */ + if ( limit >= iommu_top ) + { + for_each_amd_iommu( iommu ) + reserve_iommu_exclusion_range_all(iommu, base, limit); + } + + return 0; +} + +static int __init register_exclusion_range_for_device( + u16 bdf, unsigned long base, unsigned long limit, u8 iw, u8 ir) +{ + unsigned long range_top, iommu_top, length; + struct amd_iommu *iommu; + u16 bus, devfn, req; + + bus = bdf >> 8; + devfn = bdf & 0xFF; + iommu = find_iommu_for_device(bus, devfn); + if ( !iommu ) + { + AMD_IOMMU_DEBUG("IVMD Error: No IOMMU for Dev_Id 0x%x!\n", bdf); + return -ENODEV; + } + req = ivrs_mappings[bdf].dte_requestor_id; + + /* note: 'limit' parameter is assumed to be page-aligned */ + range_top = limit + PAGE_SIZE; + iommu_top = max_page * PAGE_SIZE; + if ( base < iommu_top ) + { + if ( range_top > iommu_top ) + range_top = iommu_top; + length = range_top - base; + /* reserve unity-mapped page entries for device */ + /* note: these entries are part of the exclusion range */ + reserve_unity_map_for_device(bdf, base, length, iw, ir); + reserve_unity_map_for_device(req, base, length, iw, ir); + + /* push 'base' just outside of virtual address space */ + base = iommu_top; + } + + /* register IOMMU exclusion range settings for device */ + if ( limit >= iommu_top ) + { + reserve_iommu_exclusion_range(iommu, base, limit); + ivrs_mappings[bdf].dte_allow_exclusion = IOMMU_CONTROL_ENABLED; + ivrs_mappings[req].dte_allow_exclusion = IOMMU_CONTROL_ENABLED; + } + + return 0; +} + +static int __init register_exclusion_range_for_iommu_devices( + struct amd_iommu *iommu, + unsigned long base, unsigned long limit, u8 iw, u8 ir) +{ + unsigned long range_top, iommu_top, length; + u16 bus, devfn, bdf, req; + + /* is part of exclusion range inside of IOMMU virtual address space? 
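The exclusion-range helpers above split each IVMD region at iommu_top (max_page * PAGE_SIZE): the slice that falls inside the IOMMU's virtual address space becomes read/write unity-mapped page-table entries, and only the remainder at or above iommu_top is programmed into the hardware exclusion-range registers. A short sketch of that split, assuming a 64-bit build; the addresses are made up for illustration.

    #include <stdio.h>

    #define PAGE_SIZE 0x1000UL

    int main(void)
    {
        unsigned long base      = 0xff000000UL;    /* page-aligned region start */
        unsigned long limit     = 0x10ffff000UL;   /* last page of the region   */
        unsigned long iommu_top = 0x100000000UL;   /* max_page * PAGE_SIZE      */
        unsigned long range_top = limit + PAGE_SIZE;

        if (base < iommu_top) {
            unsigned long top = range_top > iommu_top ? iommu_top : range_top;
            printf("unity map: [%#lx, %#lx) length %#lx\n",
                   base, top, top - base);
            base = iommu_top;        /* push base just above the mapped space */
        }
        if (limit >= iommu_top)
            printf("exclusion: [%#lx, %#lx]\n", base, limit);
        return 0;
    }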
*/ + /* note: 'limit' parameter is assumed to be page-aligned */ + range_top = limit + PAGE_SIZE; + iommu_top = max_page * PAGE_SIZE; + if ( base < iommu_top ) + { + if ( range_top > iommu_top ) + range_top = iommu_top; + length = range_top - base; + /* reserve r/w unity-mapped page entries for devices */ + /* note: these entries are part of the exclusion range */ + for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ ) + { + bus = bdf >> 8; + devfn = bdf & 0xFF; + if ( iommu == find_iommu_for_device(bus, devfn) ) + { + reserve_unity_map_for_device(bdf, base, length, iw, ir); + req = ivrs_mappings[bdf].dte_requestor_id; + reserve_unity_map_for_device(req, base, length, iw, ir); + } + } + + /* push 'base' just outside of virtual address space */ + base = iommu_top; + } + + /* register IOMMU exclusion range settings */ + if ( limit >= iommu_top ) + reserve_iommu_exclusion_range_all(iommu, base, limit); + return 0; +} + +static int __init parse_ivmd_device_select( + struct acpi_ivmd_block_header *ivmd_block, + unsigned long base, unsigned long limit, u8 iw, u8 ir) +{ + u16 bdf; + + bdf = ivmd_block->header.dev_id; + if ( bdf >= ivrs_bdf_entries ) + { + AMD_IOMMU_DEBUG("IVMD Error: Invalid Dev_Id 0x%x\n", bdf); + return -ENODEV; + } + + return register_exclusion_range_for_device(bdf, base, limit, iw, ir); +} + +static int __init parse_ivmd_device_range( + struct acpi_ivmd_block_header *ivmd_block, + unsigned long base, unsigned long limit, u8 iw, u8 ir) +{ + u16 first_bdf, last_bdf, bdf; + int error; + + first_bdf = ivmd_block->header.dev_id; + if ( first_bdf >= ivrs_bdf_entries ) + { + AMD_IOMMU_DEBUG( + "IVMD Error: Invalid Range_First Dev_Id 0x%x\n", first_bdf); + return -ENODEV; + } + + last_bdf = ivmd_block->last_dev_id; + if ( (last_bdf >= ivrs_bdf_entries) || (last_bdf <= first_bdf) ) + { + AMD_IOMMU_DEBUG( + "IVMD Error: Invalid Range_Last Dev_Id 0x%x\n", last_bdf); + return -ENODEV; + } + + for ( bdf = first_bdf, error = 0; (bdf <= last_bdf) && !error; bdf++ ) + error = register_exclusion_range_for_device( + bdf, base, limit, iw, ir); + + return error; +} + +static int __init parse_ivmd_device_iommu( + struct acpi_ivmd_block_header *ivmd_block, + unsigned long base, unsigned long limit, u8 iw, u8 ir) +{ + struct amd_iommu *iommu; + + /* find target IOMMU */ + iommu = find_iommu_from_bdf_cap(ivmd_block->header.dev_id, + ivmd_block->cap_offset); + if ( !iommu ) + { + AMD_IOMMU_DEBUG("IVMD Error: No IOMMU for Dev_Id 0x%x Cap 0x%x\n", + ivmd_block->header.dev_id, ivmd_block->cap_offset); + return -ENODEV; + } + + return register_exclusion_range_for_iommu_devices( + iommu, base, limit, iw, ir); +} + +static int __init parse_ivmd_block(struct acpi_ivmd_block_header *ivmd_block) +{ + unsigned long start_addr, mem_length, base, limit; + u8 iw, ir; + + if ( ivmd_block->header.length < + sizeof(struct acpi_ivmd_block_header) ) + { + AMD_IOMMU_DEBUG("IVMD Error: Invalid Block Length!\n"); + return -ENODEV; + } + + start_addr = (unsigned long)ivmd_block->start_addr; + mem_length = (unsigned long)ivmd_block->mem_length; + base = start_addr & PAGE_MASK; + limit = (start_addr + mem_length - 1) & PAGE_MASK; + + AMD_IOMMU_DEBUG("IVMD Block: Type 0x%x\n",ivmd_block->header.type); + AMD_IOMMU_DEBUG(" Start_Addr_Phys 0x%lx\n", start_addr); + AMD_IOMMU_DEBUG(" Mem_Length 0x%lx\n", mem_length); + + if ( get_field_from_byte(ivmd_block->header.flags, + AMD_IOMMU_ACPI_EXCLUSION_RANGE_MASK, + AMD_IOMMU_ACPI_EXCLUSION_RANGE_SHIFT) ) + iw = ir = IOMMU_CONTROL_ENABLED; + else if ( 
get_field_from_byte(ivmd_block->header.flags, + AMD_IOMMU_ACPI_UNITY_MAPPING_MASK, + AMD_IOMMU_ACPI_UNITY_MAPPING_SHIFT) ) + { + iw = get_field_from_byte(ivmd_block->header.flags, + AMD_IOMMU_ACPI_IW_PERMISSION_MASK, + AMD_IOMMU_ACPI_IW_PERMISSION_SHIFT); + ir = get_field_from_byte(ivmd_block->header.flags, + AMD_IOMMU_ACPI_IR_PERMISSION_MASK, + AMD_IOMMU_ACPI_IR_PERMISSION_SHIFT); + } + else + { + AMD_IOMMU_DEBUG("IVMD Error: Invalid Flag Field!\n"); + return -ENODEV; + } + + switch( ivmd_block->header.type ) + { + case AMD_IOMMU_ACPI_IVMD_ALL_TYPE: + return register_exclusion_range_for_all_devices( + base, limit, iw, ir); + + case AMD_IOMMU_ACPI_IVMD_ONE_TYPE: + return parse_ivmd_device_select(ivmd_block, + base, limit, iw, ir); + + case AMD_IOMMU_ACPI_IVMD_RANGE_TYPE: + return parse_ivmd_device_range(ivmd_block, + base, limit, iw, ir); + + case AMD_IOMMU_ACPI_IVMD_IOMMU_TYPE: + return parse_ivmd_device_iommu(ivmd_block, + base, limit, iw, ir); + + default: + AMD_IOMMU_DEBUG("IVMD Error: Invalid Block Type!\n"); + return -ENODEV; + } +} + +static u16 __init parse_ivhd_device_padding( + u16 pad_length, u16 header_length, u16 block_length) +{ + if ( header_length < (block_length + pad_length) ) + { + AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Length!\n"); + return 0; + } + + return pad_length; +} + +static u16 __init parse_ivhd_device_select( + union acpi_ivhd_device *ivhd_device, struct amd_iommu *iommu) +{ + u16 bdf; + + bdf = ivhd_device->header.dev_id; + if ( bdf >= ivrs_bdf_entries ) + { + AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Dev_Id 0x%x\n", bdf); + return 0; + } + + /* override flags for device */ + add_ivrs_mapping_entry(bdf, bdf, ivhd_device->header.flags, iommu); + + return sizeof(struct acpi_ivhd_device_header); +} + +static u16 __init parse_ivhd_device_range( + union acpi_ivhd_device *ivhd_device, + u16 header_length, u16 block_length, struct amd_iommu *iommu) +{ + u16 dev_length, first_bdf, last_bdf, bdf; + + dev_length = sizeof(struct acpi_ivhd_device_range); + if ( header_length < (block_length + dev_length) ) + { + AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Length!\n"); + return 0; + } + + if ( ivhd_device->range.trailer.type != + AMD_IOMMU_ACPI_IVHD_DEV_RANGE_END ) + { + AMD_IOMMU_DEBUG("IVHD Error: " + "Invalid Range: End_Type 0x%x\n", + ivhd_device->range.trailer.type); + return 0; + } + + first_bdf = ivhd_device->header.dev_id; + if ( first_bdf >= ivrs_bdf_entries ) + { + AMD_IOMMU_DEBUG( + "IVHD Error: Invalid Range: First Dev_Id 0x%x\n", first_bdf); + return 0; + } + + last_bdf = ivhd_device->range.trailer.dev_id; + if ( (last_bdf >= ivrs_bdf_entries) || (last_bdf <= first_bdf) ) + { + AMD_IOMMU_DEBUG( + "IVHD Error: Invalid Range: Last Dev_Id 0x%x\n", last_bdf); + return 0; + } + + AMD_IOMMU_DEBUG(" Dev_Id Range: 0x%x -> 0x%x\n", first_bdf, last_bdf); + + for ( bdf = first_bdf; bdf <= last_bdf; bdf++ ) + add_ivrs_mapping_entry(bdf, bdf, ivhd_device->header.flags, iommu); + + return dev_length; +} + +static u16 __init parse_ivhd_device_alias( + union acpi_ivhd_device *ivhd_device, + u16 header_length, u16 block_length, struct amd_iommu *iommu) +{ + u16 dev_length, alias_id, bdf; + + dev_length = sizeof(struct acpi_ivhd_device_alias); + if ( header_length < (block_length + dev_length) ) + { + AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Length!\n"); + return 0; + } + + bdf = ivhd_device->header.dev_id; + if ( bdf >= ivrs_bdf_entries ) + { + AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Dev_Id 0x%x\n", bdf); + return 0; + } + + 
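(Illustrative aside, not part of the patch.) The IVHD/IVMD parsers here treat each 16-bit IVRS device ID as a packed PCI bus/device/function value, splitting it with the same arithmetic used elsewhere in this patch (bus = bdf >> 8, devfn = bdf & 0xFF, then PCI_SLOT()/PCI_FUNC() on the devfn byte). A minimal standalone sketch of that decomposition, with hypothetical helper names:

/* Sketch only -- mirrors the bdf >> 8, bdf & 0xFF, PCI_SLOT() and
 * PCI_FUNC() arithmetic used by the IVRS parsers; helper names are
 * hypothetical and do not appear in the patch. */
#include <stdint.h>

static inline uint8_t bdf_bus(uint16_t bdf)  { return bdf >> 8; }
static inline uint8_t bdf_slot(uint16_t bdf) { return (bdf >> 3) & 0x1f; }
static inline uint8_t bdf_func(uint16_t bdf) { return bdf & 0x07; }

/* Example: dev_id 0x0a10 -> bus 0x0a, slot 0x02, func 0x0. */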
alias_id = ivhd_device->alias.dev_id; + if ( alias_id >= ivrs_bdf_entries ) + { + AMD_IOMMU_DEBUG("IVHD Error: Invalid Alias Dev_Id 0x%x\n", alias_id); + return 0; + } + + AMD_IOMMU_DEBUG(" Dev_Id Alias: 0x%x\n", alias_id); + + /* override requestor_id and flags for device */ + add_ivrs_mapping_entry(bdf, alias_id, ivhd_device->header.flags, iommu); + + return dev_length; +} + +static u16 __init parse_ivhd_device_alias_range( + union acpi_ivhd_device *ivhd_device, + u16 header_length, u16 block_length, struct amd_iommu *iommu) +{ + + u16 dev_length, first_bdf, last_bdf, alias_id, bdf; + + dev_length = sizeof(struct acpi_ivhd_device_alias_range); + if ( header_length < (block_length + dev_length) ) + { + AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Length!\n"); + return 0; + } + + if ( ivhd_device->alias_range.trailer.type != + AMD_IOMMU_ACPI_IVHD_DEV_RANGE_END ) + { + AMD_IOMMU_DEBUG("IVHD Error: " + "Invalid Range: End_Type 0x%x\n", + ivhd_device->alias_range.trailer.type); + return 0; + } + + first_bdf = ivhd_device->header.dev_id; + if ( first_bdf >= ivrs_bdf_entries ) + { + AMD_IOMMU_DEBUG( + "IVHD Error: Invalid Range: First Dev_Id 0x%x\n", first_bdf); + return 0; + } + + last_bdf = ivhd_device->alias_range.trailer.dev_id; + if ( last_bdf >= ivrs_bdf_entries || last_bdf <= first_bdf ) + { + AMD_IOMMU_DEBUG( + "IVHD Error: Invalid Range: Last Dev_Id 0x%x\n", last_bdf); + return 0; + } + + alias_id = ivhd_device->alias_range.alias.dev_id; + if ( alias_id >= ivrs_bdf_entries ) + { + AMD_IOMMU_DEBUG("IVHD Error: Invalid Alias Dev_Id 0x%x\n", alias_id); + return 0; + } + + AMD_IOMMU_DEBUG(" Dev_Id Range: 0x%x -> 0x%x\n", first_bdf, last_bdf); + AMD_IOMMU_DEBUG(" Dev_Id Alias: 0x%x\n", alias_id); + + /* override requestor_id and flags for range of devices */ + for ( bdf = first_bdf; bdf <= last_bdf; bdf++ ) + add_ivrs_mapping_entry(bdf, alias_id, ivhd_device->header.flags, + iommu); + + return dev_length; +} + +static u16 __init parse_ivhd_device_extended( + union acpi_ivhd_device *ivhd_device, + u16 header_length, u16 block_length, struct amd_iommu *iommu) +{ + u16 dev_length, bdf; + + dev_length = sizeof(struct acpi_ivhd_device_extended); + if ( header_length < (block_length + dev_length) ) + { + AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Length!\n"); + return 0; + } + + bdf = ivhd_device->header.dev_id; + if ( bdf >= ivrs_bdf_entries ) + { + AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Dev_Id 0x%x\n", bdf); + return 0; + } + + /* override flags for device */ + add_ivrs_mapping_entry(bdf, bdf, ivhd_device->header.flags, iommu); + + return dev_length; +} + +static u16 __init parse_ivhd_device_extended_range( + union acpi_ivhd_device *ivhd_device, + u16 header_length, u16 block_length, struct amd_iommu *iommu) +{ + u16 dev_length, first_bdf, last_bdf, bdf; + + dev_length = sizeof(struct acpi_ivhd_device_extended_range); + if ( header_length < (block_length + dev_length) ) + { + AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Length!\n"); + return 0; + } + + if ( ivhd_device->extended_range.trailer.type != + AMD_IOMMU_ACPI_IVHD_DEV_RANGE_END ) + { + AMD_IOMMU_DEBUG("IVHD Error: " + "Invalid Range: End_Type 0x%x\n", + ivhd_device->extended_range.trailer.type); + return 0; + } + + first_bdf = ivhd_device->header.dev_id; + if ( first_bdf >= ivrs_bdf_entries ) + { + AMD_IOMMU_DEBUG( + "IVHD Error: Invalid Range: First Dev_Id 0x%x\n", first_bdf); + return 0; + } + + last_bdf = ivhd_device->extended_range.trailer.dev_id; + if ( (last_bdf >= ivrs_bdf_entries) || (last_bdf <= 
first_bdf) ) + { + AMD_IOMMU_DEBUG( + "IVHD Error: Invalid Range: Last Dev_Id 0x%x\n", last_bdf); + return 0; + } + + AMD_IOMMU_DEBUG(" Dev_Id Range: 0x%x -> 0x%x\n", + first_bdf, last_bdf); + + /* override flags for range of devices */ + for ( bdf = first_bdf; bdf <= last_bdf; bdf++ ) + add_ivrs_mapping_entry(bdf, bdf, ivhd_device->header.flags, iommu); + + return dev_length; +} + +static u16 __init parse_ivhd_device_special( + union acpi_ivhd_device *ivhd_device, + u16 header_length, u16 block_length, struct amd_iommu *iommu) +{ + u16 dev_length, bdf; + + dev_length = sizeof(struct acpi_ivhd_device_special); + if ( header_length < (block_length + dev_length) ) + { + AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Length!\n"); + return 0; + } + + bdf = ivhd_device->special.dev_id; + if ( bdf >= ivrs_bdf_entries ) + { + AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Dev_Id 0x%x\n", bdf); + return 0; + } + + add_ivrs_mapping_entry(bdf, bdf, ivhd_device->header.flags, iommu); + /* set device id of ioapic */ + ioapic_bdf[ivhd_device->special.handle] = bdf; + return dev_length; + } + + +static int __init parse_ivhd_block(struct acpi_ivhd_block_header *ivhd_block) +{ + union acpi_ivhd_device *ivhd_device; + u16 block_length, dev_length; + struct amd_iommu *iommu; + + if ( ivhd_block->header.length < + sizeof(struct acpi_ivhd_block_header) ) + { + AMD_IOMMU_DEBUG("IVHD Error: Invalid Block Length!\n"); + return -ENODEV; + } + + iommu = find_iommu_from_bdf_cap(ivhd_block->header.dev_id, + ivhd_block->cap_offset); + if ( !iommu ) + { + AMD_IOMMU_DEBUG("IVHD Error: No IOMMU for Dev_Id 0x%x Cap 0x%x\n", + ivhd_block->header.dev_id, ivhd_block->cap_offset); + return -ENODEV; + } + + /* parse Device Entries */ + block_length = sizeof(struct acpi_ivhd_block_header); + while ( ivhd_block->header.length >= + (block_length + sizeof(struct acpi_ivhd_device_header)) ) + { + ivhd_device = (union acpi_ivhd_device *) + ((u8 *)ivhd_block + block_length); + + AMD_IOMMU_DEBUG( "IVHD Device Entry:\n"); + AMD_IOMMU_DEBUG( " Type 0x%x\n", ivhd_device->header.type); + AMD_IOMMU_DEBUG( " Dev_Id 0x%x\n", ivhd_device->header.dev_id); + AMD_IOMMU_DEBUG( " Flags 0x%x\n", ivhd_device->header.flags); + + switch ( ivhd_device->header.type ) + { + case AMD_IOMMU_ACPI_IVHD_DEV_U32_PAD: + dev_length = parse_ivhd_device_padding( + sizeof(u32), + ivhd_block->header.length, block_length); + break; + case AMD_IOMMU_ACPI_IVHD_DEV_U64_PAD: + dev_length = parse_ivhd_device_padding( + sizeof(u64), + ivhd_block->header.length, block_length); + break; + case AMD_IOMMU_ACPI_IVHD_DEV_SELECT: + dev_length = parse_ivhd_device_select(ivhd_device, iommu); + break; + case AMD_IOMMU_ACPI_IVHD_DEV_RANGE_START: + dev_length = parse_ivhd_device_range( + ivhd_device, + ivhd_block->header.length, block_length, iommu); + break; + case AMD_IOMMU_ACPI_IVHD_DEV_ALIAS_SELECT: + dev_length = parse_ivhd_device_alias( + ivhd_device, + ivhd_block->header.length, block_length, iommu); + break; + case AMD_IOMMU_ACPI_IVHD_DEV_ALIAS_RANGE: + dev_length = parse_ivhd_device_alias_range( + ivhd_device, + ivhd_block->header.length, block_length, iommu); + break; + case AMD_IOMMU_ACPI_IVHD_DEV_EXT_SELECT: + dev_length = parse_ivhd_device_extended( + ivhd_device, + ivhd_block->header.length, block_length, iommu); + break; + case AMD_IOMMU_ACPI_IVHD_DEV_EXT_RANGE: + dev_length = parse_ivhd_device_extended_range( + ivhd_device, + ivhd_block->header.length, block_length, iommu); + break; + case AMD_IOMMU_ACPI_IVHD_DEV_SPECIAL: + dev_length = 
parse_ivhd_device_special( + ivhd_device, + ivhd_block->header.length, block_length, iommu); + break; + default: + AMD_IOMMU_DEBUG("IVHD Error: Invalid Device Type!\n"); + dev_length = 0; + break; + } + + block_length += dev_length; + if ( !dev_length ) + return -ENODEV; + } + + return 0; +} + +static int __init parse_ivrs_block(struct acpi_ivrs_block_header *ivrs_block) +{ + struct acpi_ivhd_block_header *ivhd_block; + struct acpi_ivmd_block_header *ivmd_block; + + switch ( ivrs_block->type ) + { + case AMD_IOMMU_ACPI_IVHD_TYPE: + ivhd_block = (struct acpi_ivhd_block_header *)ivrs_block; + return parse_ivhd_block(ivhd_block); + + case AMD_IOMMU_ACPI_IVMD_ALL_TYPE: + case AMD_IOMMU_ACPI_IVMD_ONE_TYPE: + case AMD_IOMMU_ACPI_IVMD_RANGE_TYPE: + case AMD_IOMMU_ACPI_IVMD_IOMMU_TYPE: + ivmd_block = (struct acpi_ivmd_block_header *)ivrs_block; + return parse_ivmd_block(ivmd_block); + + default: + AMD_IOMMU_DEBUG("IVRS Error: Invalid Block Type!\n"); + return -ENODEV; + } + + return 0; +} + +static void __init dump_acpi_table_header(struct acpi_table_header *table) +{ + int i; + + AMD_IOMMU_DEBUG("ACPI Table:\n"); + AMD_IOMMU_DEBUG(" Signature "); + for ( i = 0; i < ACPI_NAME_SIZE; i++ ) + printk("%c", table->signature[i]); + printk("\n"); + + AMD_IOMMU_DEBUG(" Length 0x%x\n", table->length); + AMD_IOMMU_DEBUG(" Revision 0x%x\n", table->revision); + AMD_IOMMU_DEBUG(" CheckSum 0x%x\n", table->checksum); + + AMD_IOMMU_DEBUG(" OEM_Id "); + for ( i = 0; i < ACPI_OEM_ID_SIZE; i++ ) + printk("%c", table->oem_id[i]); + printk("\n"); + + AMD_IOMMU_DEBUG(" OEM_Table_Id "); + for ( i = 0; i < ACPI_OEM_TABLE_ID_SIZE; i++ ) + printk("%c", table->oem_table_id[i]); + printk("\n"); + + AMD_IOMMU_DEBUG(" OEM_Revision 0x%x\n", table->oem_revision); + + AMD_IOMMU_DEBUG(" Creator_Id "); + for ( i = 0; i < ACPI_NAME_SIZE; i++ ) + printk("%c", table->asl_compiler_id[i]); + printk("\n"); + + AMD_IOMMU_DEBUG(" Creator_Revision 0x%x\n", + table->asl_compiler_revision); +} + +static int __init parse_ivrs_table(unsigned long phys_addr, + unsigned long size) +{ + struct acpi_ivrs_block_header *ivrs_block; + unsigned long length; + int error = 0; + struct acpi_table_header *table; + + table = (struct acpi_table_header *)__acpi_map_table(phys_addr, size); + if ( !table ) + { + AMD_IOMMU_DEBUG("IVRS Error: Unable to map IVRS\n"); + return -ENODEV; + } + + if ( amd_iommu_debug ) + dump_acpi_table_header(table); + + /* parse IVRS blocks */ + length = sizeof(struct acpi_ivrs_table_header); + while ( (error == 0) && (table->length > (length + sizeof(*ivrs_block))) ) + { + ivrs_block = (struct acpi_ivrs_block_header *) + ((u8 *)table + length); + + AMD_IOMMU_DEBUG("IVRS Block:\n"); + AMD_IOMMU_DEBUG(" Type 0x%x\n", ivrs_block->type); + AMD_IOMMU_DEBUG(" Flags 0x%x\n", ivrs_block->flags); + AMD_IOMMU_DEBUG(" Length 0x%x\n", ivrs_block->length); + AMD_IOMMU_DEBUG(" Dev_Id 0x%x\n", ivrs_block->dev_id); + + if ( table->length < (length + ivrs_block->length) ) + { + AMD_IOMMU_DEBUG("IVRS Error: " + "Table Length Exceeded: 0x%x -> 0x%lx\n", + table->length, + (length + ivrs_block->length)); + return -ENODEV; + } + + error = parse_ivrs_block(ivrs_block); + length += ivrs_block->length; + } + + /* this will be used in amd_iommu_update_ivrs_mapping_acpi() */ + parse_ivrs_table_error = error; + return error; +} + +static int __init detect_iommu_acpi(unsigned long phys_addr, + unsigned long size) +{ + struct acpi_ivrs_block_header *ivrs_block; + struct acpi_table_header *table; + unsigned long i; + unsigned long length = sizeof(struct 
acpi_ivrs_table_header); + u8 checksum, *raw_table; + + table = (struct acpi_table_header *)__acpi_map_table(phys_addr, size); + if ( !table ) + { + AMD_IOMMU_DEBUG("IVRS Error: Unable to map IVRS\n"); + return -ENODEV; + } + + /* validate checksum: sum of entire table == 0 */ + checksum = 0; + raw_table = (u8 *)table; + for ( i = 0; i < table->length; i++ ) + checksum += raw_table[i]; + if ( checksum ) + { + AMD_IOMMU_DEBUG("IVRS Error: " + "Invalid Checksum 0x%x\n", checksum); + return -ENODEV; + } + + while ( table->length > (length + sizeof(*ivrs_block)) ) + { + ivrs_block = (struct acpi_ivrs_block_header *) ((u8 *)table + length); + if ( table->length < (length + ivrs_block->length) ) + return -ENODEV; + if ( ivrs_block->type == AMD_IOMMU_ACPI_IVHD_TYPE ) + if ( amd_iommu_detect_one_acpi((void*)ivrs_block) != 0 ) + return -ENODEV; + length += ivrs_block->length; + } + return 0; +} + +#define UPDATE_LAST_BDF(x) do {\ + if ((x) > last_bdf) \ + last_bdf = (x); \ + } while(0); + +static int __init get_last_bdf_ivhd(void *ivhd) +{ + union acpi_ivhd_device *ivhd_device; + u16 block_length, dev_length; + struct acpi_ivhd_block_header *ivhd_block; + + ivhd_block = (struct acpi_ivhd_block_header *)ivhd; + + if ( ivhd_block->header.length < + sizeof(struct acpi_ivhd_block_header) ) + { + AMD_IOMMU_DEBUG("IVHD Error: Invalid Block Length!\n"); + return -ENODEV; + } + + block_length = sizeof(struct acpi_ivhd_block_header); + while ( ivhd_block->header.length >= + (block_length + sizeof(struct acpi_ivhd_device_header)) ) + { + ivhd_device = (union acpi_ivhd_device *) + ((u8 *)ivhd_block + block_length); + + switch ( ivhd_device->header.type ) + { + case AMD_IOMMU_ACPI_IVHD_DEV_U32_PAD: + dev_length = sizeof(u32); + break; + case AMD_IOMMU_ACPI_IVHD_DEV_U64_PAD: + dev_length = sizeof(u64); + break; + case AMD_IOMMU_ACPI_IVHD_DEV_SELECT: + UPDATE_LAST_BDF(ivhd_device->header.dev_id); + dev_length = sizeof(struct acpi_ivhd_device_header); + break; + case AMD_IOMMU_ACPI_IVHD_DEV_ALIAS_SELECT: + UPDATE_LAST_BDF(ivhd_device->header.dev_id); + dev_length = sizeof(struct acpi_ivhd_device_alias); + break; + case AMD_IOMMU_ACPI_IVHD_DEV_EXT_SELECT: + UPDATE_LAST_BDF(ivhd_device->header.dev_id); + dev_length = sizeof(struct acpi_ivhd_device_extended); + break; + case AMD_IOMMU_ACPI_IVHD_DEV_RANGE_START: + UPDATE_LAST_BDF(ivhd_device->range.trailer.dev_id); + dev_length = sizeof(struct acpi_ivhd_device_range); + break; + case AMD_IOMMU_ACPI_IVHD_DEV_ALIAS_RANGE: + UPDATE_LAST_BDF(ivhd_device->alias_range.trailer.dev_id) + dev_length = sizeof(struct acpi_ivhd_device_alias_range); + break; + case AMD_IOMMU_ACPI_IVHD_DEV_EXT_RANGE: + UPDATE_LAST_BDF(ivhd_device->extended_range.trailer.dev_id) + dev_length = sizeof(struct acpi_ivhd_device_extended_range); + break; + case AMD_IOMMU_ACPI_IVHD_DEV_SPECIAL: + UPDATE_LAST_BDF(ivhd_device->special.dev_id) + dev_length = sizeof(struct acpi_ivhd_device_special); + break; + default: + AMD_IOMMU_DEBUG("IVHD Error: Invalid Device Type!\n"); + dev_length = 0; + break; + } + + block_length += dev_length; + if ( !dev_length ) + return -ENODEV; + } + + return 0; +} + + +static int __init get_last_bdf_acpi(unsigned long phys_addr, unsigned long size) +{ + struct acpi_ivrs_block_header *ivrs_block; + struct acpi_table_header *table; + unsigned long length = sizeof(struct acpi_ivrs_table_header); + + table = (struct acpi_table_header *)__acpi_map_table(phys_addr, size); + if ( !table ) + { + AMD_IOMMU_DEBUG("IVRS Error: Unable to map IVRS\n"); + return -ENODEV; + } + + while ( 
table->length > (length + sizeof(*ivrs_block)) ) + { + ivrs_block = (struct acpi_ivrs_block_header *) ((u8 *)table + length); + if ( table->length < (length + ivrs_block->length) ) + return -ENODEV; + if ( ivrs_block->type == AMD_IOMMU_ACPI_IVHD_TYPE ) + if ( get_last_bdf_ivhd((void*)ivrs_block) != 0 ) + return -ENODEV; + length += ivrs_block->length; + } + return 0; +} + +int __init amd_iommu_detect_acpi(void) +{ + return acpi_table_parse(ACPI_IVRS, detect_iommu_acpi); +} + +int __init amd_iommu_get_ivrs_dev_entries(void) +{ + acpi_table_parse(ACPI_IVRS, get_last_bdf_acpi); + return last_bdf + 1; +} + +int __init amd_iommu_update_ivrs_mapping_acpi(void) +{ + /* note that acpi_table_parse() function doesn't return value from + * parse_ivrs_table(). So we have to get the value from a global variable + * parse_ivrs_table_error. + */ + acpi_table_parse(ACPI_IVRS, parse_ivrs_table); + + return parse_ivrs_table_error; +} diff -Naurp xen/drivers/passthrough/amd/iommu_detect.c xen-redhat/drivers/passthrough/amd/iommu_detect.c --- xen/drivers/passthrough/amd/iommu_detect.c +++ xen-redhat/drivers/passthrough/amd/iommu_detect.c @@ -0,0 +1,159 @@ +/* + * Copyright (C) 2007 Advanced Micro Devices, Inc. + * Author: Leo Duran <leo.duran@amd.com> + * Author: Wei Wang <wei.wang2@amd.com> - adapted to xen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <xen/config.h> +#include <xen/errno.h> +#include <xen/iommu.h> +#include <xen/pci.h> +#include <xen/pci_regs.h> +#include <asm/amd-iommu.h> +#include <asm/hvm/svm/amd-iommu-proto.h> +#include <asm/hvm/svm/amd-iommu-acpi.h> + +extern struct list_head amd_iommu_head; +unsigned short last_bdf = 0; + +static int __init get_iommu_msi_capabilities(u8 bus, u8 dev, u8 func, + struct amd_iommu *iommu) +{ + int cap_ptr, cap_id; + u32 cap_header; + u16 control; + int count = 0; + + cap_ptr = pci_conf_read8(bus, dev, func, + PCI_CAPABILITY_LIST); + + while ( cap_ptr >= PCI_MIN_CAP_OFFSET && + count < PCI_MAX_CAP_BLOCKS ) + { + cap_ptr &= PCI_CAP_PTR_MASK; + cap_header = pci_conf_read32(bus, dev, func, cap_ptr); + cap_id = get_field_from_reg_u32(cap_header, + PCI_CAP_ID_MASK, PCI_CAP_ID_SHIFT); + + if ( cap_id == PCI_CAP_ID_MSI ) + { + iommu->msi_cap = cap_ptr; + break; + } + cap_ptr = get_field_from_reg_u32(cap_header, + PCI_CAP_NEXT_PTR_MASK, PCI_CAP_NEXT_PTR_SHIFT); + count++; + } + + if ( !iommu->msi_cap ) + return -ENODEV; + + AMD_IOMMU_DEBUG("Found MSI capability block \n"); + control = pci_conf_read16(bus, dev, func, + iommu->msi_cap + PCI_MSI_FLAGS); + iommu->maskbit = control & PCI_MSI_FLAGS_MASKBIT; + return 0; +} + +int __init get_iommu_capabilities(u8 bus, u8 dev, u8 func, u8 cap_ptr, + struct amd_iommu *iommu) +{ + u32 cap_header, cap_range, misc_info; + + cap_header = pci_conf_read32(bus, dev, func, cap_ptr); + iommu->revision = get_field_from_reg_u32( + cap_header, PCI_CAP_REV_MASK, PCI_CAP_REV_SHIFT); + iommu->pte_not_present_cached = get_field_from_reg_u32( + cap_header, PCI_CAP_NP_CACHE_MASK, PCI_CAP_NP_CACHE_SHIFT); + + cap_range = pci_conf_read32(bus, dev, func, + cap_ptr + PCI_CAP_RANGE_OFFSET); + iommu->unit_id = get_field_from_reg_u32( + cap_range, PCI_CAP_UNIT_ID_MASK, PCI_CAP_UNIT_ID_SHIFT); + + misc_info = pci_conf_read32(bus, dev, func, + cap_ptr + PCI_MISC_INFO_OFFSET); + iommu->msi_number = get_field_from_reg_u32( + misc_info, PCI_CAP_MSI_NUMBER_MASK, PCI_CAP_MSI_NUMBER_SHIFT); + + return 0; +} + +int __init amd_iommu_detect_one_acpi(void *ivhd) +{ + struct amd_iommu *iommu; + u8 bus, dev, func; + struct acpi_ivhd_block_header *ivhd_block; + + ivhd_block = (struct acpi_ivhd_block_header *)ivhd; + + if ( ivhd_block->header.length < sizeof(struct acpi_ivhd_block_header) ) + { + AMD_IOMMU_DEBUG("Invalid IVHD Block Length!\n"); + return -ENODEV; + } + + if ( !ivhd_block->header.dev_id || + !ivhd_block->cap_offset || !ivhd_block->mmio_base) + { + AMD_IOMMU_DEBUG("Invalid IVHD Block!\n"); + return -ENODEV; + } + + iommu = (struct amd_iommu *) xmalloc(struct amd_iommu); + if ( !iommu ) + { + AMD_IOMMU_DEBUG("Error allocating amd_iommu\n"); + return -ENOMEM; + } + memset(iommu, 0, sizeof(struct amd_iommu)); + + spin_lock_init(&iommu->lock); + + iommu->bdf = ivhd_block->header.dev_id; + iommu->cap_offset = ivhd_block->cap_offset; + iommu->mmio_base_phys = ivhd_block->mmio_base; + + /* override IOMMU support flags */ + iommu->coherent = get_field_from_byte(ivhd_block->header.flags, + AMD_IOMMU_ACPI_COHERENT_MASK, + AMD_IOMMU_ACPI_COHERENT_SHIFT); + iommu->iotlb_support = get_field_from_byte(ivhd_block->header.flags, + AMD_IOMMU_ACPI_IOTLB_SUP_MASK, + AMD_IOMMU_ACPI_IOTLB_SUP_SHIFT); + iommu->isochronous = get_field_from_byte(ivhd_block->header.flags, + 
AMD_IOMMU_ACPI_ISOC_MASK, + AMD_IOMMU_ACPI_ISOC_SHIFT); + iommu->res_pass_pw = get_field_from_byte(ivhd_block->header.flags, + AMD_IOMMU_ACPI_RES_PASS_PW_MASK, + AMD_IOMMU_ACPI_RES_PASS_PW_SHIFT); + iommu->pass_pw = get_field_from_byte(ivhd_block->header.flags, + AMD_IOMMU_ACPI_PASS_PW_MASK, + AMD_IOMMU_ACPI_PASS_PW_SHIFT); + iommu->ht_tunnel_enable = get_field_from_byte(ivhd_block->header.flags, + AMD_IOMMU_ACPI_HT_TUN_ENB_MASK, + AMD_IOMMU_ACPI_HT_TUN_ENB_SHIFT); + bus = iommu->bdf >> 8; + dev = PCI_SLOT(iommu->bdf & 0xFF); + func = PCI_FUNC(iommu->bdf & 0xFF); + get_iommu_capabilities(bus, dev, func, iommu->cap_offset, iommu); + get_iommu_msi_capabilities(bus, dev, func, iommu); + + list_add_tail(&iommu->list, &amd_iommu_head); + + return 0; +} diff -Naurp xen/drivers/passthrough/amd/iommu_init.c xen-redhat/drivers/passthrough/amd/iommu_init.c --- xen/drivers/passthrough/amd/iommu_init.c +++ xen-redhat/drivers/passthrough/amd/iommu_init.c @@ -0,0 +1,839 @@ +/* + * Copyright (C) 2007 Advanced Micro Devices, Inc. + * Author: Leo Duran <leo.duran@amd.com> + * Author: Wei Wang <wei.wang2@amd.com> - adapted to xen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <xen/config.h> +#include <xen/errno.h> +#include <xen/pci.h> +#include <xen/pci_regs.h> +#include <asm/amd-iommu.h> +#include <asm/hvm/svm/amd-iommu-proto.h> +#include <asm-x86/fixmap.h> + +static struct amd_iommu *vector_to_iommu[NR_VECTORS]; +static int nr_amd_iommus; +static long amd_iommu_cmd_buffer_entries = IOMMU_CMD_BUFFER_DEFAULT_ENTRIES; +static long amd_iommu_event_log_entries = IOMMU_EVENT_LOG_DEFAULT_ENTRIES; + +unsigned short ivrs_bdf_entries; +struct ivrs_mappings *ivrs_mappings; +struct list_head amd_iommu_head; +struct table_struct device_table; + +/* + * Shifts for MSI data + */ + +#define MSI_DATA_VECTOR_SHIFT 0 +#define MSI_DATA_VECTOR_MASK 0x000000ff +#define MSI_DATA_VECTOR(v) (((v) << MSI_DATA_VECTOR_SHIFT) & MSI_DATA_VECTOR_MASK) + +#define MSI_DATA_DELIVERY_MODE_SHIFT 8 +#define MSI_DATA_DELIVERY_FIXED (0 << MSI_DATA_DELIVERY_MODE_SHIFT) +#define MSI_DATA_DELIVERY_LOWPRI (1 << MSI_DATA_DELIVERY_MODE_SHIFT) + +#define MSI_DATA_LEVEL_SHIFT 14 +#define MSI_DATA_LEVEL_DEASSERT (0 << MSI_DATA_LEVEL_SHIFT) +#define MSI_DATA_LEVEL_ASSERT (1 << MSI_DATA_LEVEL_SHIFT) + +#define MSI_DATA_TRIGGER_SHIFT 15 +#define MSI_DATA_TRIGGER_EDGE (0 << MSI_DATA_TRIGGER_SHIFT) +#define MSI_DATA_TRIGGER_LEVEL (1 << MSI_DATA_TRIGGER_SHIFT) + +/* + * Shift/mask fields for msi address + */ + +#define MSI_ADDR_BASE_HI 0 +#define MSI_ADDR_BASE_LO 0xfee00000 +#define MSI_ADDR_HEADER MSI_ADDR_BASE_LO + +#define MSI_ADDR_DESTMODE_SHIFT 2 +#define MSI_ADDR_DESTMODE_PHYS (0 << MSI_ADDR_DESTMODE_SHIFT) +#define MSI_ADDR_DESTMODE_LOGIC (1 << MSI_ADDR_DESTMODE_SHIFT) + +#define MSI_ADDR_REDIRECTION_SHIFT 3 +#define MSI_ADDR_REDIRECTION_CPU (0 << 
MSI_ADDR_REDIRECTION_SHIFT) +#define MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT) + +#define MSI_ADDR_DEST_ID_SHIFT 12 +#define MSI_ADDR_DEST_ID_MASK 0x00ffff0 +#define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & MSI_ADDR_DEST_ID_MASK) + +static int __init map_iommu_mmio_region(struct amd_iommu *iommu) +{ + unsigned long mfn; + + if ( nr_amd_iommus > MAX_AMD_IOMMUS ) + { + AMD_IOMMU_DEBUG("nr_amd_iommus %d > MAX_IOMMUS\n", nr_amd_iommus); + return -ENOMEM; + } + + iommu->mmio_base = (void *)fix_to_virt( + FIX_IOMMU_MMIO_BASE_0 + nr_amd_iommus * MMIO_PAGES_PER_IOMMU); + mfn = (unsigned long)(iommu->mmio_base_phys >> PAGE_SHIFT); + map_pages_to_xen((unsigned long)iommu->mmio_base, mfn, + MMIO_PAGES_PER_IOMMU, PAGE_HYPERVISOR_NOCACHE); + + memset(iommu->mmio_base, 0, IOMMU_MMIO_REGION_LENGTH); + + return 0; +} + +static void __init unmap_iommu_mmio_region(struct amd_iommu *iommu) +{ + if ( iommu->mmio_base ) + { + iounmap(iommu->mmio_base); + iommu->mmio_base = NULL; + } +} + +static void __init register_iommu_dev_table_in_mmio_space(struct amd_iommu *iommu) +{ + u64 addr_64, addr_lo, addr_hi; + u32 entry; + + addr_64 = (u64)virt_to_maddr(iommu->dev_table.buffer); + addr_lo = addr_64 & DMA_32BIT_MASK; + addr_hi = addr_64 >> 32; + + set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0, + IOMMU_DEV_TABLE_BASE_LOW_MASK, + IOMMU_DEV_TABLE_BASE_LOW_SHIFT, &entry); + set_field_in_reg_u32((iommu->dev_table.alloc_size / PAGE_SIZE) - 1, + entry, IOMMU_DEV_TABLE_SIZE_MASK, + IOMMU_DEV_TABLE_SIZE_SHIFT, &entry); + writel(entry, iommu->mmio_base + IOMMU_DEV_TABLE_BASE_LOW_OFFSET); + + set_field_in_reg_u32((u32)addr_hi, 0, + IOMMU_DEV_TABLE_BASE_HIGH_MASK, + IOMMU_DEV_TABLE_BASE_HIGH_SHIFT, &entry); + writel(entry, iommu->mmio_base + IOMMU_DEV_TABLE_BASE_HIGH_OFFSET); +} + +static void __init register_iommu_cmd_buffer_in_mmio_space(struct amd_iommu *iommu) +{ + u64 addr_64, addr_lo, addr_hi; + u32 power_of2_entries; + u32 entry; + + addr_64 = (u64)virt_to_maddr(iommu->cmd_buffer.buffer); + addr_lo = addr_64 & DMA_32BIT_MASK; + addr_hi = addr_64 >> 32; + + set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0, + IOMMU_CMD_BUFFER_BASE_LOW_MASK, + IOMMU_CMD_BUFFER_BASE_LOW_SHIFT, &entry); + writel(entry, iommu->mmio_base + IOMMU_CMD_BUFFER_BASE_LOW_OFFSET); + + power_of2_entries = get_order_from_bytes(iommu->cmd_buffer.alloc_size) + + IOMMU_CMD_BUFFER_POWER_OF2_ENTRIES_PER_PAGE; + + set_field_in_reg_u32((u32)addr_hi, 0, + IOMMU_CMD_BUFFER_BASE_HIGH_MASK, + IOMMU_CMD_BUFFER_BASE_HIGH_SHIFT, &entry); + set_field_in_reg_u32(power_of2_entries, entry, + IOMMU_CMD_BUFFER_LENGTH_MASK, + IOMMU_CMD_BUFFER_LENGTH_SHIFT, &entry); + writel(entry, iommu->mmio_base+IOMMU_CMD_BUFFER_BASE_HIGH_OFFSET); +} + +static void __init register_iommu_event_log_in_mmio_space(struct amd_iommu *iommu) +{ + u64 addr_64, addr_lo, addr_hi; + u32 power_of2_entries; + u32 entry; + + addr_64 = (u64)virt_to_maddr(iommu->event_log.buffer); + addr_lo = addr_64 & DMA_32BIT_MASK; + addr_hi = addr_64 >> 32; + + set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0, + IOMMU_EVENT_LOG_BASE_LOW_MASK, + IOMMU_EVENT_LOG_BASE_LOW_SHIFT, &entry); + writel(entry, iommu->mmio_base + IOMMU_EVENT_LOG_BASE_LOW_OFFSET); + + power_of2_entries = get_order_from_bytes(iommu->event_log.alloc_size) + + IOMMU_EVENT_LOG_POWER_OF2_ENTRIES_PER_PAGE; + + set_field_in_reg_u32((u32)addr_hi, 0, + IOMMU_EVENT_LOG_BASE_HIGH_MASK, + IOMMU_EVENT_LOG_BASE_HIGH_SHIFT, &entry); + set_field_in_reg_u32(power_of2_entries, entry, + 
IOMMU_EVENT_LOG_LENGTH_MASK, + IOMMU_EVENT_LOG_LENGTH_SHIFT, &entry); + writel(entry, iommu->mmio_base+IOMMU_EVENT_LOG_BASE_HIGH_OFFSET); +} + +static void __init set_iommu_translation_control(struct amd_iommu *iommu, + int enable) +{ + u32 entry; + + entry = readl(iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET); + + if ( enable ) + { + set_field_in_reg_u32(iommu->ht_tunnel_support ? IOMMU_CONTROL_ENABLED : + IOMMU_CONTROL_DISABLED, entry, + IOMMU_CONTROL_HT_TUNNEL_TRANSLATION_MASK, + IOMMU_CONTROL_HT_TUNNEL_TRANSLATION_SHIFT, &entry); + set_field_in_reg_u32(iommu->isochronous ? IOMMU_CONTROL_ENABLED : + IOMMU_CONTROL_DISABLED, entry, + IOMMU_CONTROL_ISOCHRONOUS_MASK, + IOMMU_CONTROL_ISOCHRONOUS_SHIFT, &entry); + set_field_in_reg_u32(iommu->coherent ? IOMMU_CONTROL_ENABLED : + IOMMU_CONTROL_DISABLED, entry, + IOMMU_CONTROL_COHERENT_MASK, + IOMMU_CONTROL_COHERENT_SHIFT, &entry); + set_field_in_reg_u32(iommu->res_pass_pw ? IOMMU_CONTROL_ENABLED : + IOMMU_CONTROL_DISABLED, entry, + IOMMU_CONTROL_RESP_PASS_POSTED_WRITE_MASK, + IOMMU_CONTROL_RESP_PASS_POSTED_WRITE_SHIFT, &entry); + /* do not set PassPW bit */ + set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, entry, + IOMMU_CONTROL_PASS_POSTED_WRITE_MASK, + IOMMU_CONTROL_PASS_POSTED_WRITE_SHIFT, &entry); + } + set_field_in_reg_u32(enable ? IOMMU_CONTROL_ENABLED : + IOMMU_CONTROL_DISABLED, entry, + IOMMU_CONTROL_TRANSLATION_ENABLE_MASK, + IOMMU_CONTROL_TRANSLATION_ENABLE_SHIFT, &entry); + writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET); +} + +static void __init set_iommu_command_buffer_control(struct amd_iommu *iommu, + int enable) +{ + u32 entry; + + entry = readl(iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET); + set_field_in_reg_u32(enable ? IOMMU_CONTROL_ENABLED : + IOMMU_CONTROL_DISABLED, entry, + IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_MASK, + IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_SHIFT, &entry); + writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET); + + /*reset head and tail pointer */ + writel(0x0, iommu->mmio_base + IOMMU_CMD_BUFFER_HEAD_OFFSET); + writel(0x0, iommu->mmio_base + IOMMU_CMD_BUFFER_TAIL_OFFSET); +} + +static void __init register_iommu_exclusion_range(struct amd_iommu *iommu) +{ + u64 addr_lo, addr_hi; + u32 entry; + + addr_lo = iommu->exclusion_limit & DMA_32BIT_MASK; + addr_hi = iommu->exclusion_limit >> 32; + + set_field_in_reg_u32((u32)addr_hi, 0, + IOMMU_EXCLUSION_LIMIT_HIGH_MASK, + IOMMU_EXCLUSION_LIMIT_HIGH_SHIFT, &entry); + writel(entry, iommu->mmio_base+IOMMU_EXCLUSION_LIMIT_HIGH_OFFSET); + + set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0, + IOMMU_EXCLUSION_LIMIT_LOW_MASK, + IOMMU_EXCLUSION_LIMIT_LOW_SHIFT, &entry); + writel(entry, iommu->mmio_base+IOMMU_EXCLUSION_LIMIT_LOW_OFFSET); + + addr_lo = iommu->exclusion_base & DMA_32BIT_MASK; + addr_hi = iommu->exclusion_base >> 32; + + set_field_in_reg_u32((u32)addr_hi, 0, + IOMMU_EXCLUSION_BASE_HIGH_MASK, + IOMMU_EXCLUSION_BASE_HIGH_SHIFT, &entry); + writel(entry, iommu->mmio_base+IOMMU_EXCLUSION_BASE_HIGH_OFFSET); + + set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0, + IOMMU_EXCLUSION_BASE_LOW_MASK, + IOMMU_EXCLUSION_BASE_LOW_SHIFT, &entry); + + set_field_in_reg_u32(iommu->exclusion_allow_all, entry, + IOMMU_EXCLUSION_ALLOW_ALL_MASK, + IOMMU_EXCLUSION_ALLOW_ALL_SHIFT, &entry); + + set_field_in_reg_u32(iommu->exclusion_enable, entry, + IOMMU_EXCLUSION_RANGE_ENABLE_MASK, + IOMMU_EXCLUSION_RANGE_ENABLE_SHIFT, &entry); + writel(entry, iommu->mmio_base+IOMMU_EXCLUSION_BASE_LOW_OFFSET); +} + +static void __init set_iommu_event_log_control(struct amd_iommu *iommu, + int 
enable) +{ + u32 entry; + + entry = readl(iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET); + set_field_in_reg_u32(enable ? IOMMU_CONTROL_ENABLED : + IOMMU_CONTROL_DISABLED, entry, + IOMMU_CONTROL_EVENT_LOG_ENABLE_MASK, + IOMMU_CONTROL_EVENT_LOG_ENABLE_SHIFT, &entry); + writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET); + + set_field_in_reg_u32(enable ? IOMMU_CONTROL_ENABLED : + IOMMU_CONTROL_DISABLED, entry, + IOMMU_CONTROL_EVENT_LOG_INT_MASK, + IOMMU_CONTROL_EVENT_LOG_INT_SHIFT, &entry); + writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET); + + set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, entry, + IOMMU_CONTROL_COMP_WAIT_INT_MASK, + IOMMU_CONTROL_COMP_WAIT_INT_SHIFT, &entry); + writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET); + + /*reset head and tail pointer */ + writel(0x0, iommu->mmio_base + IOMMU_EVENT_LOG_HEAD_OFFSET); + writel(0x0, iommu->mmio_base + IOMMU_EVENT_LOG_TAIL_OFFSET); +} + +static int amd_iommu_read_event_log(struct amd_iommu *iommu, u32 event[]) +{ + u32 tail, head, *event_log; + int i; + + BUG_ON( !iommu || !event ); + + /* make sure there's an entry in the log */ + tail = get_field_from_reg_u32( + readl(iommu->mmio_base + IOMMU_EVENT_LOG_TAIL_OFFSET), + IOMMU_EVENT_LOG_TAIL_MASK, + IOMMU_EVENT_LOG_TAIL_SHIFT); + if ( tail != iommu->event_log_head ) + { + /* read event log entry */ + event_log = (u32 *)(iommu->event_log.buffer + + (iommu->event_log_head * + IOMMU_EVENT_LOG_ENTRY_SIZE)); + for ( i = 0; i < IOMMU_EVENT_LOG_U32_PER_ENTRY; i++ ) + event[i] = event_log[i]; + if ( ++iommu->event_log_head == iommu->event_log.entries ) + iommu->event_log_head = 0; + + /* update head pointer */ + set_field_in_reg_u32(iommu->event_log_head, 0, + IOMMU_EVENT_LOG_HEAD_MASK, + IOMMU_EVENT_LOG_HEAD_SHIFT, &head); + writel(head, iommu->mmio_base + IOMMU_EVENT_LOG_HEAD_OFFSET); + return 0; + } + + return -EFAULT; +} + +static void amd_iommu_msi_data_init(struct amd_iommu *iommu) +{ + u32 msi_data; + u8 bus = (iommu->bdf >> 8) & 0xff; + u8 dev = PCI_SLOT(iommu->bdf & 0xff); + u8 func = PCI_FUNC(iommu->bdf & 0xff); + int vector = iommu->vector; + + msi_data = MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + MSI_DATA_DELIVERY_FIXED | + MSI_DATA_VECTOR(vector); + + pci_conf_write32(bus, dev, func, + iommu->msi_cap + PCI_MSI_DATA_64, msi_data); +} + +static void amd_iommu_msi_addr_init(struct amd_iommu *iommu, int phy_cpu) +{ + + int bus = (iommu->bdf >> 8) & 0xff; + int dev = PCI_SLOT(iommu->bdf & 0xff); + int func = PCI_FUNC(iommu->bdf & 0xff); + + u32 address_hi = 0; + u32 address_lo = MSI_ADDR_HEADER | + MSI_ADDR_DESTMODE_PHYS | + MSI_ADDR_REDIRECTION_CPU | + MSI_ADDR_DEST_ID(phy_cpu); + + pci_conf_write32(bus, dev, func, + iommu->msi_cap + PCI_MSI_ADDRESS_LO, address_lo); + pci_conf_write32(bus, dev, func, + iommu->msi_cap + PCI_MSI_ADDRESS_HI, address_hi); +} + +static void amd_iommu_msi_enable(struct amd_iommu *iommu, int flag) +{ + u16 control; + int bus = (iommu->bdf >> 8) & 0xff; + int dev = PCI_SLOT(iommu->bdf & 0xff); + int func = PCI_FUNC(iommu->bdf & 0xff); + + control = pci_conf_read16(bus, dev, func, + iommu->msi_cap + PCI_MSI_FLAGS); + control &= ~(1); + if ( flag ) + control |= flag; + pci_conf_write16(bus, dev, func, + iommu->msi_cap + PCI_MSI_FLAGS, control); +} + +static void iommu_msi_unmask(unsigned int vector) +{ + unsigned long flags; + struct amd_iommu *iommu = vector_to_iommu[vector]; + + /* FIXME: do not support mask bits at the moment */ + if ( iommu->maskbit ) + return; + + spin_lock_irqsave(&iommu->lock, flags); + 
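(Illustrative aside, not part of the patch.) amd_iommu_read_event_log() above drains the hardware event log as a ring buffer: the IOMMU advances the tail register as it logs events, software advances its cached head as it consumes them, and both wrap at the number of entries. A self-contained sketch of that consume step, assuming a simplified in-memory ring with hypothetical names (the patch reads the tail from IOMMU_EVENT_LOG_TAIL_OFFSET and writes the new head to IOMMU_EVENT_LOG_HEAD_OFFSET instead):

/* Sketch of the ring-buffer consume step; not part of the patch. */
struct sketch_log {
    unsigned int head, tail, entries;  /* entries = ring size           */
    unsigned int (*buf)[4];            /* four u32 words per log entry  */
};

static int sketch_log_consume(struct sketch_log *log, unsigned int out[4])
{
    int i;

    if ( log->tail == log->head )
        return -1;                     /* nothing logged yet            */

    for ( i = 0; i < 4; i++ )
        out[i] = log->buf[log->head][i];

    if ( ++log->head == log->entries ) /* wrap, as event_log_head does  */
        log->head = 0;

    return 0;
}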
amd_iommu_msi_enable(iommu, IOMMU_CONTROL_ENABLED); + spin_unlock_irqrestore(&iommu->lock, flags); +} + +static void iommu_msi_mask(unsigned int vector) +{ + unsigned long flags; + struct amd_iommu *iommu = vector_to_iommu[vector]; + + /* FIXME: do not support mask bits at the moment */ + if ( iommu->maskbit ) + return; + + spin_lock_irqsave(&iommu->lock, flags); + amd_iommu_msi_enable(iommu, IOMMU_CONTROL_DISABLED); + spin_unlock_irqrestore(&iommu->lock, flags); +} + +static unsigned int iommu_msi_startup(unsigned int vector) +{ + iommu_msi_unmask(vector); + return 0; +} + +static void iommu_msi_end(unsigned int vector) +{ + iommu_msi_unmask(vector); + ack_APIC_irq(); +} + +static void iommu_msi_set_affinity(unsigned int vector, cpumask_t dest) +{ + struct amd_iommu *iommu = vector_to_iommu[vector]; + amd_iommu_msi_addr_init(iommu, cpu_physical_id(first_cpu(dest))); +} + +static struct hw_interrupt_type iommu_msi_type = { + .typename = "AMD_IOV_MSI", + .startup = iommu_msi_startup, + .shutdown = iommu_msi_mask, + .enable = iommu_msi_unmask, + .disable = iommu_msi_mask, + .ack = iommu_msi_mask, + .end = iommu_msi_end, + .set_affinity = iommu_msi_set_affinity, +}; + +static void parse_event_log_entry(u32 entry[]) +{ + u16 domain_id, device_id, bdf, cword; + u32 code; + u64 *addr; + char * event_str[] = {"ILLEGAL_DEV_TABLE_ENTRY", + "IO_PAGE_FALT", + "DEV_TABLE_HW_ERROR", + "PAGE_TABLE_HW_ERROR", + "ILLEGAL_COMMAND_ERROR", + "COMMAND_HW_ERROR", + "IOTLB_INV_TIMEOUT", + "INVALID_DEV_REQUEST"}; + + code = get_field_from_reg_u32(entry[1], IOMMU_EVENT_CODE_MASK, + IOMMU_EVENT_CODE_SHIFT); + + if ( (code > IOMMU_EVENT_INVALID_DEV_REQUEST) || + (code < IOMMU_EVENT_ILLEGAL_DEV_TABLE_ENTRY) ) + { + AMD_IOMMU_DEBUG("Invalid event log entry!\n"); + return; + } + + if ( code == IOMMU_EVENT_IO_PAGE_FALT ) + { + device_id = get_field_from_reg_u32(entry[0], + IOMMU_EVENT_DEVICE_ID_MASK, + IOMMU_EVENT_DEVICE_ID_SHIFT); + domain_id = get_field_from_reg_u32(entry[1], + IOMMU_EVENT_DOMAIN_ID_MASK, + IOMMU_EVENT_DOMAIN_ID_SHIFT); + addr= (u64*) (entry + 2); + printk(XENLOG_ERR "AMD-Vi: " + "%s: domain:%d, device id:0x%x, fault address:0x%"PRIx64"\n", + event_str[code-1], domain_id, device_id, *addr); + + /* Tell the device to stop DMAing; we can't rely on the guest to + * control it for us. 
*/ + for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ ) + if ( get_dma_requestor_id(bdf) == device_id ) + { + cword = pci_conf_read16(PCI_BUS(bdf), PCI_SLOT(bdf), + PCI_FUNC(bdf), PCI_COMMAND); + pci_conf_write16(PCI_BUS(bdf), PCI_SLOT(bdf), + PCI_FUNC(bdf), PCI_COMMAND, + cword & ~PCI_COMMAND_MASTER); + } + } +} + +static void amd_iommu_page_fault(int vector, void *dev_id, + struct cpu_user_regs *regs) +{ + u32 event[4]; + u32 entry; + unsigned long flags; + int ret = 0; + struct amd_iommu *iommu = dev_id; + + spin_lock_irqsave(&iommu->lock, flags); + ret = amd_iommu_read_event_log(iommu, event); + /* reset interrupt status bit */ + entry = readl(iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET); + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, + IOMMU_STATUS_EVENT_LOG_INT_MASK, + IOMMU_STATUS_EVENT_LOG_INT_SHIFT, &entry); + writel(entry, iommu->mmio_base+IOMMU_STATUS_MMIO_OFFSET); + spin_unlock_irqrestore(&iommu->lock, flags); + + if ( ret != 0 ) + return; + parse_event_log_entry(event); +} + +static int set_iommu_interrupt_handler(struct amd_iommu *iommu) +{ + int vector, ret; + + vector = assign_irq_vector(AUTO_ASSIGN); + + if ( !vector ) + { + AMD_IOMMU_DEBUG("no vectors\n"); + return 0; + } + + vector_to_iommu[vector] = iommu; + + /* make irq == vector */ + irq_vector[vector] = vector; + vector_irq[vector] = vector; + + irq_desc[vector].handler = &iommu_msi_type; + ret = request_irq(vector, amd_iommu_page_fault, 0, "amd_iommu", iommu); + if ( ret ) + { + AMD_IOMMU_DEBUG("can't request irq\n"); + return 0; + } + iommu->vector = vector; + return vector; +} + +void __init enable_iommu(struct amd_iommu *iommu) +{ + unsigned long flags; + + spin_lock_irqsave(&iommu->lock, flags); + + if ( iommu->enabled ) + { + spin_unlock_irqrestore(&iommu->lock, flags); + return; + } + + iommu->dev_table.alloc_size = device_table.alloc_size; + iommu->dev_table.entries = device_table.entries; + iommu->dev_table.buffer = device_table.buffer; + + register_iommu_dev_table_in_mmio_space(iommu); + register_iommu_cmd_buffer_in_mmio_space(iommu); + register_iommu_event_log_in_mmio_space(iommu); + register_iommu_exclusion_range(iommu); + + amd_iommu_msi_data_init (iommu); + amd_iommu_msi_addr_init(iommu, cpu_physical_id(first_cpu(cpu_online_map))); + amd_iommu_msi_enable(iommu, IOMMU_CONTROL_ENABLED); + + set_iommu_command_buffer_control(iommu, IOMMU_CONTROL_ENABLED); + set_iommu_event_log_control(iommu, IOMMU_CONTROL_ENABLED); + set_iommu_translation_control(iommu, IOMMU_CONTROL_ENABLED); + + printk("AMD-Vi: IOMMU %d Enabled.\n", nr_amd_iommus ); + nr_amd_iommus++; + + iommu->enabled = 1; + spin_unlock_irqrestore(&iommu->lock, flags); + +} + +static void __init deallocate_iommu_table_struct( + struct table_struct *table) +{ + int order = 0; + if ( table->buffer ) + { + order = get_order_from_bytes(table->alloc_size); + __free_amd_iommu_tables(table->buffer, order); + table->buffer = NULL; + } +} + +static int __init allocate_iommu_table_struct(struct table_struct *table, + const char *name) +{ + int order = 0; + if ( table->buffer == NULL ) + { + order = get_order_from_bytes(table->alloc_size); + table->buffer = __alloc_amd_iommu_tables(order); + + if ( table->buffer == NULL ) + { + AMD_IOMMU_DEBUG("Error allocating %s\n", name); + return -ENOMEM; + } + memset(table->buffer, 0, PAGE_SIZE * (1UL << order)); + } + return 0; +} + +static int __init allocate_cmd_buffer(struct amd_iommu *iommu) +{ + /* allocate 'command buffer' in power of 2 increments of 4K */ + iommu->cmd_buffer_tail = 0; + iommu->cmd_buffer.alloc_size 
= PAGE_SIZE << + get_order_from_bytes( + PAGE_ALIGN(amd_iommu_cmd_buffer_entries * + IOMMU_CMD_BUFFER_ENTRY_SIZE)); + iommu->cmd_buffer.entries = iommu->cmd_buffer.alloc_size / + IOMMU_CMD_BUFFER_ENTRY_SIZE; + + return (allocate_iommu_table_struct(&iommu->cmd_buffer, "Command Buffer")); +} + +static int __init allocate_event_log(struct amd_iommu *iommu) +{ + /* allocate 'event log' in power of 2 increments of 4K */ + iommu->event_log_head = 0; + iommu->event_log.alloc_size = PAGE_SIZE << + get_order_from_bytes( + PAGE_ALIGN(amd_iommu_event_log_entries * + IOMMU_EVENT_LOG_ENTRY_SIZE)); + iommu->event_log.entries = iommu->event_log.alloc_size / + IOMMU_EVENT_LOG_ENTRY_SIZE; + + return (allocate_iommu_table_struct(&iommu->event_log, "Event Log")); +} + + +int __init amd_iommu_init_one(struct amd_iommu *iommu) +{ + + if ( allocate_cmd_buffer(iommu) != 0 ) + goto error_out; + + if ( allocate_event_log(iommu) != 0 ) + goto error_out; + + if ( map_iommu_mmio_region(iommu) != 0 ) + goto error_out; + + if ( set_iommu_interrupt_handler(iommu) == 0 ) + goto error_out; + + enable_iommu(iommu); + return 0; + +error_out: + return -ENODEV; +} + +void __init amd_iommu_init_cleanup(void) +{ + struct amd_iommu *iommu, *next; + int bdf; + + list_for_each_entry_safe ( iommu, next, &amd_iommu_head, list ) + { + list_del(&iommu->list); + if ( iommu->enabled ) + { + deallocate_iommu_table_struct(&iommu->cmd_buffer); + deallocate_iommu_table_struct(&iommu->event_log); + unmap_iommu_mmio_region(iommu); + } + xfree(iommu); + } + + /* free interrupt remapping table */ + for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ ) + { + if ( ivrs_mappings[bdf].intremap_table ) + amd_iommu_free_intremap_table(bdf); + } + + /* free device table */ + deallocate_iommu_table_struct(&device_table); + + /* free IVRS_mappings */ + if ( ivrs_mappings ) + { + xfree(ivrs_mappings); + ivrs_mappings = NULL; + } + + iommu_enabled = 0; + iommu_passthrough = 0; + iommu_intremap = 0; +} + +static int __init init_ivrs_mapping(void) +{ + int bdf; + + BUG_ON( !ivrs_bdf_entries ); + + ivrs_mappings = xmalloc_array( struct ivrs_mappings, ivrs_bdf_entries); + if ( ivrs_mappings == NULL ) + { + AMD_IOMMU_DEBUG("Error allocating IVRS Mappings table\n"); + return -ENOMEM; + } + memset(ivrs_mappings, 0, ivrs_bdf_entries * sizeof(struct ivrs_mappings)); + + /* assign default values for device entries */ + for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ ) + { + ivrs_mappings[bdf].dte_requestor_id = bdf; + ivrs_mappings[bdf].dte_sys_mgt_enable = + IOMMU_DEV_TABLE_SYS_MGT_MSG_FORWARDED; + ivrs_mappings[bdf].dte_allow_exclusion = IOMMU_CONTROL_DISABLED; + ivrs_mappings[bdf].unity_map_enable = IOMMU_CONTROL_DISABLED; + ivrs_mappings[bdf].iommu = NULL; + + ivrs_mappings[bdf].intremap_table = NULL; + ivrs_mappings[bdf].dte_lint1_pass = IOMMU_CONTROL_DISABLED; + ivrs_mappings[bdf].dte_lint0_pass = IOMMU_CONTROL_DISABLED; + ivrs_mappings[bdf].dte_nmi_pass = IOMMU_CONTROL_DISABLED; + ivrs_mappings[bdf].dte_ext_int_pass = IOMMU_CONTROL_DISABLED; + ivrs_mappings[bdf].dte_init_pass = IOMMU_CONTROL_DISABLED; + + if ( amd_iommu_perdev_intremap ) + spin_lock_init(&ivrs_mappings[bdf].intremap_lock); + } + return 0; +} + +static int __init amd_iommu_setup_device_table(void) +{ + int bdf; + void *intr_tb, *dte; + int sys_mgt, dev_ex, lint1_pass, lint0_pass, nmi_pass, ext_int_pass, + init_pass; + + BUG_ON(ivrs_bdf_entries == 0); + + /* allocate 'device table' on a 4K boundary */ + device_table.alloc_size = PAGE_SIZE << get_order_from_bytes( + PAGE_ALIGN(ivrs_bdf_entries * 
IOMMU_DEV_TABLE_ENTRY_SIZE)); + device_table.entries = device_table.alloc_size / IOMMU_DEV_TABLE_ENTRY_SIZE; + + if ( allocate_iommu_table_struct(&device_table, "Device Table") != 0 ) + return -ENOMEM; + + /* add device table entries */ + for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ ) + { + intr_tb = ivrs_mappings[bdf].intremap_table; + + if ( intr_tb ) + { + sys_mgt = ivrs_mappings[bdf].dte_sys_mgt_enable; + dev_ex = ivrs_mappings[bdf].dte_allow_exclusion; + + /* get interrupt remapping settings */ + lint1_pass = ivrs_mappings[bdf].dte_lint1_pass; + lint0_pass = ivrs_mappings[bdf].dte_lint0_pass; + nmi_pass = ivrs_mappings[bdf].dte_nmi_pass; + ext_int_pass = ivrs_mappings[bdf].dte_ext_int_pass; + init_pass = ivrs_mappings[bdf].dte_init_pass; + + /* add device table entry */ + dte = device_table.buffer + (bdf * IOMMU_DEV_TABLE_ENTRY_SIZE); + amd_iommu_add_dev_table_entry( + dte, sys_mgt, dev_ex, lint1_pass, lint0_pass, + nmi_pass, ext_int_pass, init_pass); + + amd_iommu_set_intremap_table( + dte, (u64)virt_to_maddr(intr_tb), iommu_intremap); + + AMD_IOMMU_DEBUG("Add device table entry at DTE:0x%x, " + "intremap_table:%"PRIx64"\n", bdf, + (u64)virt_to_maddr(intr_tb)); + } + } + + return 0; +} + +int __init amd_iommu_init(void) +{ + struct amd_iommu *iommu; + + BUG_ON( !iommu_found() ); + + /* find the max BDF in IVRS table. It will be used in init_ivrs_mapping */ + ivrs_bdf_entries = amd_iommu_get_ivrs_dev_entries(); + + if ( !ivrs_bdf_entries ) + goto error_out; + + if ( init_ivrs_mapping() != 0 ) + goto error_out; + + /* start to read and store IVRS info into ivrs_mapping structure */ + if ( amd_iommu_update_ivrs_mapping_acpi() != 0 ) + goto error_out; + + /* initialize io-apic interrupt remapping entries */ + if ( amd_iommu_setup_ioapic_remapping() != 0 ) + goto error_out; + + /* allocate and initialize a global device table shared by all iommus */ + if ( amd_iommu_setup_device_table() != 0 ) + goto error_out; + + for_each_amd_iommu ( iommu ) + if ( amd_iommu_init_one(iommu) != 0 ) + goto error_out; + return 0; + +error_out: + amd_iommu_init_cleanup(); + return -ENODEV; +} + + diff -Naurp xen/drivers/passthrough/amd/iommu_intr.c xen-redhat/drivers/passthrough/amd/iommu_intr.c --- xen/drivers/passthrough/amd/iommu_intr.c +++ xen-redhat/drivers/passthrough/amd/iommu_intr.c @@ -0,0 +1,399 @@ +/* + * Copyright (C) 2007 Advanced Micro Devices, Inc. + * Author: Wei Wang <wei.wang2@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <xen/sched.h> +#include <xen/hvm/iommu.h> +#include <asm/amd-iommu.h> +#include <asm/hvm/svm/amd-iommu-proto.h> + +int ioapic_bdf[MAX_IO_APICS]; +#define INTREMAP_TABLE_ORDER 1 +#define INTREMAP_LENGTH 0xB +#define INTREMAP_ENTRIES (1 << INTREMAP_LENGTH) + +extern struct ivrs_mappings *ivrs_mappings; +extern unsigned short ivrs_bdf_entries; +void *shared_intremap_table; +static DEFINE_SPINLOCK(shared_intremap_lock); + +static spinlock_t* get_intremap_lock(int req_id) +{ + return (amd_iommu_perdev_intremap ? + &ivrs_mappings[req_id].intremap_lock: + &shared_intremap_lock); +} + +static int get_intremap_requestor_id(int bdf) +{ + ASSERT( bdf < ivrs_bdf_entries ); + return ivrs_mappings[bdf].dte_requestor_id; +} + +static int get_intremap_offset(u8 vector, u8 dm) +{ + int offset = 0; + offset = (dm << INT_REMAP_INDEX_DM_SHIFT) & INT_REMAP_INDEX_DM_MASK; + offset |= (vector << INT_REMAP_INDEX_VECTOR_SHIFT ) & + INT_REMAP_INDEX_VECTOR_MASK; + return offset; +} + +static u8 *get_intremap_entry(int bdf, int offset) +{ + u8 *table; + + table = (u8*)ivrs_mappings[bdf].intremap_table; + ASSERT( (table != NULL) && (offset < INTREMAP_ENTRIES) ); + + return (u8*) (table + offset); +} + +static void free_intremap_entry(int bdf, int offset) +{ + u32* entry; + entry = (u32*)get_intremap_entry(bdf, offset); + memset(entry, 0, sizeof(u32)); +} + +static void update_intremap_entry(u32* entry, u8 vector, u8 int_type, + u8 dest_mode, u8 dest) +{ + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, 0, + INT_REMAP_ENTRY_REMAPEN_MASK, + INT_REMAP_ENTRY_REMAPEN_SHIFT, entry); + set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, *entry, + INT_REMAP_ENTRY_SUPIOPF_MASK, + INT_REMAP_ENTRY_SUPIOPF_SHIFT, entry); + set_field_in_reg_u32(int_type, *entry, + INT_REMAP_ENTRY_INTTYPE_MASK, + INT_REMAP_ENTRY_INTTYPE_SHIFT, entry); + set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, *entry, + INT_REMAP_ENTRY_REQEOI_MASK, + INT_REMAP_ENTRY_REQEOI_SHIFT, entry); + set_field_in_reg_u32((u32)dest_mode, *entry, + INT_REMAP_ENTRY_DM_MASK, + INT_REMAP_ENTRY_DM_SHIFT, entry); + set_field_in_reg_u32((u32)dest, *entry, + INT_REMAP_ENTRY_DEST_MAST, + INT_REMAP_ENTRY_DEST_SHIFT, entry); + set_field_in_reg_u32((u32)vector, *entry, + INT_REMAP_ENTRY_VECTOR_MASK, + INT_REMAP_ENTRY_VECTOR_SHIFT, entry); +} + +void invalidate_interrupt_table(struct amd_iommu *iommu, u16 device_id) +{ + u32 cmd[4], entry; + + cmd[3] = cmd[2] = 0; + set_field_in_reg_u32(device_id, 0, + IOMMU_INV_INT_TABLE_DEVICE_ID_MASK, + IOMMU_INV_INT_TABLE_DEVICE_ID_SHIFT, &entry); + cmd[0] = entry; + set_field_in_reg_u32(IOMMU_CMD_INVALIDATE_INT_TABLE, 0, + IOMMU_CMD_OPCODE_MASK, IOMMU_CMD_OPCODE_SHIFT, + &entry); + cmd[1] = entry; + send_iommu_command(iommu, cmd); +} + +static void update_intremap_entry_from_ioapic( + int bdf, + struct amd_iommu *iommu, + struct IO_APIC_route_entry *ioapic_rte, + unsigned int rte_upper, unsigned int value) +{ + unsigned long flags; + u32* entry; + u8 delivery_mode, dest, vector, dest_mode; + struct IO_APIC_route_entry *rte = ioapic_rte; + int req_id; + spinlock_t *lock; + int offset; + + req_id = get_intremap_requestor_id(bdf); + lock = get_intremap_lock(req_id); + /* only remap interrupt vector when lower 32 bits in ioapic ire changed */ + if ( rte_upper ) + { + delivery_mode = rte->delivery_mode; + vector = rte->vector; + 
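(Illustrative aside, not part of the patch.) The remapping-table constants defined near the top of this file are mutually consistent: INTREMAP_TABLE_ORDER 1 allocates two pages (8 KiB), INTREMAP_LENGTH 0xB gives 1 << 11 = 2048 entries, and update_intremap_entry() writes one 32-bit entry, so 2048 * 4 bytes exactly fills the allocation. A compile-time restatement of that arithmetic, assuming a 4096-byte page; the SKETCH_* names are hypothetical:

/* Sketch of the sizing arithmetic; not part of the patch. */
#define SKETCH_PAGE_SIZE    4096u        /* assumed x86 page size     */
#define SKETCH_TABLE_PAGES  (1u << 1)    /* INTREMAP_TABLE_ORDER = 1  */
#define SKETCH_ENTRIES      (1u << 0xB)  /* INTREMAP_ENTRIES          */
#define SKETCH_ENTRY_SIZE   4u           /* one u32 per entry         */

/* Fails to compile if the table could not hold all entries. */
typedef char sketch_intremap_table_fits[
    (SKETCH_ENTRIES * SKETCH_ENTRY_SIZE ==
     SKETCH_PAGE_SIZE * SKETCH_TABLE_PAGES) ? 1 : -1];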
dest_mode = rte->dest_mode; + dest = rte->dest.logical.logical_dest; + + spin_lock_irqsave(lock, flags); + offset = get_intremap_offset(vector, delivery_mode); + entry = (u32*)get_intremap_entry(req_id, offset); + + update_intremap_entry(entry, vector, delivery_mode, dest_mode, dest); + spin_unlock_irqrestore(lock, flags); + + if ( iommu->enabled ) + { + spin_lock_irqsave(&iommu->lock, flags); + invalidate_interrupt_table(iommu, req_id); + flush_command_buffer(iommu); + spin_unlock_irqrestore(&iommu->lock, flags); + } + } +} + + +extern int nr_ioapic_registers[MAX_IO_APICS]; +extern int nr_ioapics; + +int __init amd_iommu_setup_ioapic_remapping(void) +{ + struct IO_APIC_route_entry rte = {0}; + unsigned long flags; + u32* entry; + int apic, pin; + u8 delivery_mode, dest, vector, dest_mode; + u16 bdf, req_id, bus, devfn; + struct amd_iommu *iommu; + spinlock_t *lock; + int offset; + + /* Read ioapic entries and update interrupt remapping table accordingly */ + for ( apic = 0; apic < nr_ioapics; apic++ ) + { + for ( pin = 0; pin < nr_ioapic_registers[apic]; pin++ ) + { + *(((int *)&rte) + 1) = io_apic_read(apic, 0x11 + 2 * pin); + *(((int *)&rte) + 0) = io_apic_read(apic, 0x10 + 2 * pin); + + if ( rte.mask == 1 ) + continue; + + bdf = ioapic_bdf[IO_APIC_ID(apic)]; + bus = bdf >> 8; + devfn = bdf & 0xFF; + iommu = find_iommu_for_device(bus, devfn); + + if ( !iommu ) + { + AMD_IOMMU_DEBUG("failed to find iommu for ioapic device " + "id = 0x%x\n", bdf); + continue; + } + + req_id = get_intremap_requestor_id(bdf); + lock = get_intremap_lock(req_id); + + delivery_mode = rte.delivery_mode; + vector = rte.vector; + dest_mode = rte.dest_mode; + dest = rte.dest.logical.logical_dest; + + spin_lock_irqsave(lock, flags); + offset = get_intremap_offset(vector, delivery_mode); + entry = (u32*)get_intremap_entry(req_id, offset); + update_intremap_entry(entry, vector, delivery_mode, dest_mode, + dest); + spin_unlock_irqrestore(lock, flags); + + if ( iommu->enabled ) + { + spin_lock_irqsave(&iommu->lock, flags); + invalidate_interrupt_table(iommu, req_id); + flush_command_buffer(iommu); + spin_unlock_irqrestore(&iommu->lock, flags); + } + } + } + + return 0; +} + +void amd_iommu_ioapic_update_ire( + unsigned int apic, unsigned int reg, unsigned int value) +{ + struct IO_APIC_route_entry ioapic_rte = { 0 }; + unsigned int rte_upper = (reg & 1) ? 
1 : 0; + int saved_mask; + u16 bus, devfn, bdf; + struct amd_iommu *iommu; + + *IO_APIC_BASE(apic) = reg; + *(IO_APIC_BASE(apic)+4) = value; + + if ( !iommu_intremap ) + return; + + /* get device id of ioapic devices */ + bdf = ioapic_bdf[IO_APIC_ID(apic)]; + bus = bdf >> 8; + devfn = bdf & 0xFF; + iommu = find_iommu_for_device(bus, devfn); + if ( !iommu ) + { + AMD_IOMMU_DEBUG( + "Fail to find iommu for ioapic device id = 0x%x\n", bdf); + return; + } + + if ( !rte_upper ) + return; + + reg--; + /* read both lower and upper 32-bits of rte entry */ + *IO_APIC_BASE(apic) = reg; + *(((u32 *)&ioapic_rte) + 0) = *(IO_APIC_BASE(apic)+4); + *IO_APIC_BASE(apic) = reg + 1; + *(((u32 *)&ioapic_rte) + 1) = *(IO_APIC_BASE(apic)+4); + + /* mask the interrupt while we change the intremap table */ + saved_mask = ioapic_rte.mask; + ioapic_rte.mask = 1; + *IO_APIC_BASE(apic) = reg; + *(IO_APIC_BASE(apic)+4) = *(((int *)&ioapic_rte)+0); + ioapic_rte.mask = saved_mask; + + + update_intremap_entry_from_ioapic(bdf, iommu, + &ioapic_rte, rte_upper, value); + + /* unmask the interrupt after we have updated the intremap table */ + *IO_APIC_BASE(apic) = reg; + *(IO_APIC_BASE(apic)+4) = *(((u32 *)&ioapic_rte)+0); +} + +static void update_intremap_entry_from_msi_msg( + struct amd_iommu *iommu, struct pci_dev *pdev, + struct msi_desc *msi_desc, struct msi_msg *msg) +{ + unsigned long flags; + u32* entry; + u16 dev_id, alias_id, bus, devfn, req_id; + + u8 delivery_mode, dest, vector, dest_mode; + spinlock_t *lock; + int offset; + + dev_id = (pdev->bus << 8) | pdev->devfn; + bus = pdev->bus; + devfn = pdev->devfn; + req_id = get_dma_requestor_id(dev_id); + alias_id = get_intremap_requestor_id(dev_id); + + if ( msg == NULL ) + { + lock = get_intremap_lock(req_id); + spin_lock_irqsave(lock, flags); + free_intremap_entry(req_id, msi_desc->remap_index); + spin_unlock_irqrestore(lock, flags); + + if ( ( req_id != alias_id ) && + ivrs_mappings[alias_id].intremap_table != NULL ) + { + lock = get_intremap_lock(alias_id); + spin_lock_irqsave(lock, flags); + free_intremap_entry(alias_id, msi_desc->remap_index); + spin_unlock_irqrestore(lock, flags); + } + goto done; + } + + lock = get_intremap_lock(req_id); + + spin_lock_irqsave(lock, flags); + dest_mode = (msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT) & 0x1; + delivery_mode = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x1; + vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) & MSI_DATA_VECTOR_MASK; + dest = (msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff; + offset = get_intremap_offset(vector, delivery_mode); + msi_desc->remap_index = offset; + + entry = (u32*)get_intremap_entry(req_id, offset); + update_intremap_entry(entry, vector, delivery_mode, dest_mode, dest); + spin_unlock_irqrestore(lock, flags); + + /* + * In some special cases, a pci-e device(e.g SATA controller in IDE mode) + * will use alias id to index interrupt remapping table. + * We have to setup a secondary interrupt remapping entry to satisfy those + * devices. 
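 *
 * In that case the entry indexed by the alias id below is programmed with
 * exactly the same vector/delivery/destination values as the primary
 * entry written just above, so the lookup succeeds whichever requestor id
 * the device presents on the bus.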
+ */ + + lock = get_intremap_lock(alias_id); + if ( ( req_id != alias_id ) && + ivrs_mappings[alias_id].intremap_table != NULL ) + { + spin_lock_irqsave(lock, flags); + entry = (u32*)get_intremap_entry(alias_id, offset); + update_intremap_entry(entry, vector, delivery_mode, dest_mode, dest); + spin_unlock_irqrestore(lock, flags); + } + +done: + if ( iommu->enabled ) + { + spin_lock_irqsave(&iommu->lock, flags); + invalidate_interrupt_table(iommu, dev_id); + if ( alias_id != req_id ) + invalidate_interrupt_table(iommu, alias_id); + flush_command_buffer(iommu); + spin_unlock_irqrestore(&iommu->lock, flags); + } + + return; +} + +void amd_iommu_msi_msg_update_ire( + struct msi_desc *msi_desc, struct msi_msg *msg) +{ + struct pci_dev *pdev = msi_desc->dev; + struct amd_iommu *iommu = NULL; + + if ( !iommu_intremap ) + return; + + iommu = find_iommu_for_device(pdev->bus, pdev->devfn); + + if ( !iommu ) + { + AMD_IOMMU_DEBUG( + "Fail to find iommu for MSI device id = 0x%x\n", + (pdev->bus << 8) | pdev->devfn); + return; + } + + update_intremap_entry_from_msi_msg(iommu, pdev, msi_desc, msg); +} + +void __init amd_iommu_free_intremap_table(int bdf) +{ + void *tb = ivrs_mappings[bdf].intremap_table; + + if ( tb ) + { + __free_amd_iommu_tables(tb, INTREMAP_TABLE_ORDER); + ivrs_mappings[bdf].intremap_table = NULL; + } +} + +void* __init amd_iommu_alloc_intremap_table(void) +{ + void *tb; + tb = __alloc_amd_iommu_tables(INTREMAP_TABLE_ORDER); + BUG_ON(tb == NULL); + memset(tb, 0, PAGE_SIZE * (1UL << INTREMAP_TABLE_ORDER)); + return tb; +} + diff -Naurp xen/drivers/passthrough/amd/iommu_map.c xen-redhat/drivers/passthrough/amd/iommu_map.c --- xen/drivers/passthrough/amd/iommu_map.c +++ xen-redhat/drivers/passthrough/amd/iommu_map.c @@ -0,0 +1,659 @@ +/* + * Copyright (C) 2007 Advanced Micro Devices, Inc. + * Author: Leo Duran <leo.duran@amd.com> + * Author: Wei Wang <wei.wang2@amd.com> - adapted to xen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <xen/sched.h> +#include <xen/hvm/iommu.h> +#include <asm/amd-iommu.h> +#include <asm/hvm/svm/amd-iommu-proto.h> + +static int queue_iommu_command(struct amd_iommu *iommu, u32 cmd[]) +{ + u32 tail, head, *cmd_buffer; + int i; + + tail = iommu->cmd_buffer_tail; + if ( ++tail == iommu->cmd_buffer.entries ) + tail = 0; + head = get_field_from_reg_u32( + readl(iommu->mmio_base+IOMMU_CMD_BUFFER_HEAD_OFFSET), + IOMMU_CMD_BUFFER_HEAD_MASK, + IOMMU_CMD_BUFFER_HEAD_SHIFT); + if ( head != tail ) + { + cmd_buffer = (u32 *)(iommu->cmd_buffer.buffer + + (iommu->cmd_buffer_tail * + IOMMU_CMD_BUFFER_ENTRY_SIZE)); + for ( i = 0; i < IOMMU_CMD_BUFFER_U32_PER_ENTRY; i++ ) + cmd_buffer[i] = cmd[i]; + + iommu->cmd_buffer_tail = tail; + return 1; + } + + return 0; +} + +static void commit_iommu_command_buffer(struct amd_iommu *iommu) +{ + u32 tail; + + set_field_in_reg_u32(iommu->cmd_buffer_tail, 0, + IOMMU_CMD_BUFFER_TAIL_MASK, + IOMMU_CMD_BUFFER_TAIL_SHIFT, &tail); + writel(tail, iommu->mmio_base+IOMMU_CMD_BUFFER_TAIL_OFFSET); +} + +int send_iommu_command(struct amd_iommu *iommu, u32 cmd[]) +{ + if ( queue_iommu_command(iommu, cmd) ) + { + commit_iommu_command_buffer(iommu); + return 1; + } + + return 0; +} + +static void invalidate_iommu_page(struct amd_iommu *iommu, + u64 io_addr, u16 domain_id) +{ + u64 addr_lo, addr_hi; + u32 cmd[4], entry; + + addr_lo = io_addr & DMA_32BIT_MASK; + addr_hi = io_addr >> 32; + + set_field_in_reg_u32(domain_id, 0, + IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_MASK, + IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_SHIFT, &entry); + set_field_in_reg_u32(IOMMU_CMD_INVALIDATE_IOMMU_PAGES, entry, + IOMMU_CMD_OPCODE_MASK, IOMMU_CMD_OPCODE_SHIFT, + &entry); + cmd[1] = entry; + + set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, 0, + IOMMU_INV_IOMMU_PAGES_S_FLAG_MASK, + IOMMU_INV_IOMMU_PAGES_S_FLAG_SHIFT, &entry); + set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, entry, + IOMMU_INV_IOMMU_PAGES_PDE_FLAG_MASK, + IOMMU_INV_IOMMU_PAGES_PDE_FLAG_SHIFT, &entry); + set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, entry, + IOMMU_INV_IOMMU_PAGES_ADDR_LOW_MASK, + IOMMU_INV_IOMMU_PAGES_ADDR_LOW_SHIFT, &entry); + cmd[2] = entry; + + set_field_in_reg_u32((u32)addr_hi, 0, + IOMMU_INV_IOMMU_PAGES_ADDR_HIGH_MASK, + IOMMU_INV_IOMMU_PAGES_ADDR_HIGH_SHIFT, &entry); + cmd[3] = entry; + + cmd[0] = 0; + send_iommu_command(iommu, cmd); +} + +void flush_command_buffer(struct amd_iommu *iommu) +{ + u32 cmd[4], status; + int loop_count, comp_wait; + + /* clear 'ComWaitInt' in status register (WIC) */ + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, 0, + IOMMU_STATUS_COMP_WAIT_INT_MASK, + IOMMU_STATUS_COMP_WAIT_INT_SHIFT, &status); + writel(status, iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET); + + /* send an empty COMPLETION_WAIT command to flush command buffer */ + cmd[3] = cmd[2] = 0; + set_field_in_reg_u32(IOMMU_CMD_COMPLETION_WAIT, 0, + IOMMU_CMD_OPCODE_MASK, + IOMMU_CMD_OPCODE_SHIFT, &cmd[1]); + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, 0, + IOMMU_COMP_WAIT_I_FLAG_MASK, + IOMMU_COMP_WAIT_I_FLAG_SHIFT, &cmd[0]); + send_iommu_command(iommu, cmd); + + /* wait for 'ComWaitInt' to signal comp#endifletion? 
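     *
     * A COMPLETION_WAIT command queued with its I flag set should make the
     * IOMMU raise the COMWAITINT status bit once every command ahead of it
     * in the ring has been executed, so the bounded polling loop below
     * effectively drains the command buffer and merely warns if the bit
     * never asserts.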
*/ + loop_count = 1000; + do { + status = readl(iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET); + comp_wait = get_field_from_reg_u32(status, + IOMMU_STATUS_COMP_WAIT_INT_MASK, + IOMMU_STATUS_COMP_WAIT_INT_SHIFT); + --loop_count; + } while ( !comp_wait && loop_count ); + + if ( comp_wait ) + { + /* clear 'ComWaitInt' in status register (WIC) */ + status &= IOMMU_STATUS_COMP_WAIT_INT_MASK; + writel(status, iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET); + return; + } + AMD_IOMMU_DEBUG("Warning: ComWaitInt bit did not assert!\n"); +} + +static void clear_iommu_l1e_present(u64 l2e, unsigned long gfn) +{ + u32 *l1e; + int offset; + void *l1_table; + + l1_table = map_domain_page(l2e >> PAGE_SHIFT); + + offset = gfn & (~PTE_PER_TABLE_MASK); + l1e = (u32*)(l1_table + (offset * IOMMU_PAGE_TABLE_ENTRY_SIZE)); + + /* clear l1 entry */ + l1e[0] = l1e[1] = 0; + + unmap_domain_page(l1_table); +} + +static void set_iommu_l1e_present(u64 l2e, unsigned long gfn, + u64 maddr, int iw, int ir) +{ + u64 addr_lo, addr_hi; + u32 entry; + void *l1_table; + int offset; + u32 *l1e; + + l1_table = map_domain_page(l2e >> PAGE_SHIFT); + + offset = gfn & (~PTE_PER_TABLE_MASK); + l1e = (u32*)((u8*)l1_table + (offset * IOMMU_PAGE_TABLE_ENTRY_SIZE)); + + addr_lo = maddr & DMA_32BIT_MASK; + addr_hi = maddr >> 32; + + set_field_in_reg_u32((u32)addr_hi, 0, + IOMMU_PTE_ADDR_HIGH_MASK, + IOMMU_PTE_ADDR_HIGH_SHIFT, &entry); + set_field_in_reg_u32(iw ? IOMMU_CONTROL_ENABLED : + IOMMU_CONTROL_DISABLED, entry, + IOMMU_PTE_IO_WRITE_PERMISSION_MASK, + IOMMU_PTE_IO_WRITE_PERMISSION_SHIFT, &entry); + set_field_in_reg_u32(ir ? IOMMU_CONTROL_ENABLED : + IOMMU_CONTROL_DISABLED, entry, + IOMMU_PTE_IO_READ_PERMISSION_MASK, + IOMMU_PTE_IO_READ_PERMISSION_SHIFT, &entry); + l1e[1] = entry; + + set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0, + IOMMU_PTE_ADDR_LOW_MASK, + IOMMU_PTE_ADDR_LOW_SHIFT, &entry); + set_field_in_reg_u32(IOMMU_PAGING_MODE_LEVEL_0, entry, + IOMMU_PTE_NEXT_LEVEL_MASK, + IOMMU_PTE_NEXT_LEVEL_SHIFT, &entry); + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, + IOMMU_PTE_PRESENT_MASK, + IOMMU_PTE_PRESENT_SHIFT, &entry); + l1e[0] = entry; + + unmap_domain_page(l1_table); +} + +static void amd_iommu_set_page_directory_entry(u32 *pde, + u64 next_ptr, u8 next_level) +{ + u64 addr_lo, addr_hi; + u32 entry; + + addr_lo = next_ptr & DMA_32BIT_MASK; + addr_hi = next_ptr >> 32; + + /* enable read/write permissions,which will be enforced at the PTE */ + set_field_in_reg_u32((u32)addr_hi, 0, + IOMMU_PDE_ADDR_HIGH_MASK, + IOMMU_PDE_ADDR_HIGH_SHIFT, &entry); + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, + IOMMU_PDE_IO_WRITE_PERMISSION_MASK, + IOMMU_PDE_IO_WRITE_PERMISSION_SHIFT, &entry); + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, + IOMMU_PDE_IO_READ_PERMISSION_MASK, + IOMMU_PDE_IO_READ_PERMISSION_SHIFT, &entry); + pde[1] = entry; + + /* mark next level as 'present' */ + set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0, + IOMMU_PDE_ADDR_LOW_MASK, + IOMMU_PDE_ADDR_LOW_SHIFT, &entry); + set_field_in_reg_u32(next_level, entry, + IOMMU_PDE_NEXT_LEVEL_MASK, + IOMMU_PDE_NEXT_LEVEL_SHIFT, &entry); + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, + IOMMU_PDE_PRESENT_MASK, + IOMMU_PDE_PRESENT_SHIFT, &entry); + pde[0] = entry; +} + +void amd_iommu_set_root_page_table( + u32 *dte, u64 root_ptr, u16 domain_id, u8 paging_mode, u8 valid) +{ + u64 addr_hi, addr_lo; + u32 entry; + set_field_in_reg_u32(domain_id, 0, + IOMMU_DEV_TABLE_DOMAIN_ID_MASK, + IOMMU_DEV_TABLE_DOMAIN_ID_SHIFT, &entry); + dte[2] = entry; + + addr_lo = 
root_ptr & DMA_32BIT_MASK; + addr_hi = root_ptr >> 32; + + set_field_in_reg_u32((u32)addr_hi, 0, + IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_MASK, + IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_SHIFT, &entry); + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, + IOMMU_DEV_TABLE_IO_WRITE_PERMISSION_MASK, + IOMMU_DEV_TABLE_IO_WRITE_PERMISSION_SHIFT, &entry); + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, + IOMMU_DEV_TABLE_IO_READ_PERMISSION_MASK, + IOMMU_DEV_TABLE_IO_READ_PERMISSION_SHIFT, &entry); + dte[1] = entry; + + set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0, + IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_MASK, + IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_SHIFT, &entry); + set_field_in_reg_u32(paging_mode, entry, + IOMMU_DEV_TABLE_PAGING_MODE_MASK, + IOMMU_DEV_TABLE_PAGING_MODE_SHIFT, &entry); + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, + IOMMU_DEV_TABLE_TRANSLATION_VALID_MASK, + IOMMU_DEV_TABLE_TRANSLATION_VALID_SHIFT, &entry); + set_field_in_reg_u32(valid ? IOMMU_CONTROL_ENABLED : + IOMMU_CONTROL_DISABLED, entry, + IOMMU_DEV_TABLE_VALID_MASK, + IOMMU_DEV_TABLE_VALID_SHIFT, &entry); + dte[0] = entry; +} + +void amd_iommu_set_intremap_table(u32 *dte, u64 intremap_ptr, u8 int_valid) +{ + u64 addr_hi, addr_lo; + u32 entry; + + addr_lo = intremap_ptr & DMA_32BIT_MASK; + addr_hi = intremap_ptr >> 32; + + entry = dte[5]; + set_field_in_reg_u32((u32)addr_hi, entry, + IOMMU_DEV_TABLE_INT_TABLE_PTR_HIGH_MASK, + IOMMU_DEV_TABLE_INT_TABLE_PTR_HIGH_SHIFT, &entry); + /* Fixed and arbitrated interrupts remapepd */ + set_field_in_reg_u32(2, entry, + IOMMU_DEV_TABLE_INT_CONTROL_MASK, + IOMMU_DEV_TABLE_INT_CONTROL_SHIFT, &entry); + dte[5] = entry; + + set_field_in_reg_u32((u32)addr_lo >> 6, 0, + IOMMU_DEV_TABLE_INT_TABLE_PTR_LOW_MASK, + IOMMU_DEV_TABLE_INT_TABLE_PTR_LOW_SHIFT, &entry); + /* 2048 entries */ + set_field_in_reg_u32(0xB, entry, + IOMMU_DEV_TABLE_INT_TABLE_LENGTH_MASK, + IOMMU_DEV_TABLE_INT_TABLE_LENGTH_SHIFT, &entry); + /* ignore unmapped interrupts */ + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, + IOMMU_DEV_TABLE_INT_TABLE_IGN_UNMAPPED_MASK, + IOMMU_DEV_TABLE_INT_TABLE_IGN_UNMAPPED_SHIFT, &entry); + set_field_in_reg_u32(int_valid ? IOMMU_CONTROL_ENABLED : + IOMMU_CONTROL_DISABLED, entry, + IOMMU_DEV_TABLE_INT_VALID_MASK, + IOMMU_DEV_TABLE_INT_VALID_SHIFT, &entry); + dte[4] = entry; +} + +void amd_iommu_add_dev_table_entry( + u32 *dte, u8 sys_mgt, u8 dev_ex, u8 lint1_pass, u8 lint0_pass, + u8 nmi_pass, u8 ext_int_pass, u8 init_pass) +{ + u32 entry; + + dte[7] = dte[6] = dte[4] = dte[2] = dte[1] = dte[0] = 0; + + + set_field_in_reg_u32(init_pass ? IOMMU_CONTROL_ENABLED : + IOMMU_CONTROL_DISABLED, 0, + IOMMU_DEV_TABLE_INIT_PASSTHRU_MASK, + IOMMU_DEV_TABLE_INIT_PASSTHRU_SHIFT, &entry); + set_field_in_reg_u32(ext_int_pass ? IOMMU_CONTROL_ENABLED : + IOMMU_CONTROL_DISABLED, entry, + IOMMU_DEV_TABLE_EINT_PASSTHRU_MASK, + IOMMU_DEV_TABLE_EINT_PASSTHRU_SHIFT, &entry); + set_field_in_reg_u32(nmi_pass ? IOMMU_CONTROL_ENABLED : + IOMMU_CONTROL_DISABLED, entry, + IOMMU_DEV_TABLE_NMI_PASSTHRU_MASK, + IOMMU_DEV_TABLE_NMI_PASSTHRU_SHIFT, &entry); + set_field_in_reg_u32(lint0_pass ? IOMMU_CONTROL_ENABLED : + IOMMU_CONTROL_DISABLED, entry, + IOMMU_DEV_TABLE_LINT0_ENABLE_MASK, + IOMMU_DEV_TABLE_LINT0_ENABLE_SHIFT, &entry); + set_field_in_reg_u32(lint1_pass ? 
IOMMU_CONTROL_ENABLED : + IOMMU_CONTROL_DISABLED, entry, + IOMMU_DEV_TABLE_LINT1_ENABLE_MASK, + IOMMU_DEV_TABLE_LINT1_ENABLE_SHIFT, &entry); + dte[5] = entry; + + set_field_in_reg_u32(sys_mgt, 0, + IOMMU_DEV_TABLE_SYS_MGT_MSG_ENABLE_MASK, + IOMMU_DEV_TABLE_SYS_MGT_MSG_ENABLE_SHIFT, &entry); + set_field_in_reg_u32(dev_ex, entry, + IOMMU_DEV_TABLE_ALLOW_EXCLUSION_MASK, + IOMMU_DEV_TABLE_ALLOW_EXCLUSION_SHIFT, &entry); + dte[3] = entry; +} + + + +u64 amd_iommu_get_next_table_from_pte(u32 *entry) +{ + u64 addr_lo, addr_hi, ptr; + + addr_lo = get_field_from_reg_u32( + entry[0], + IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_MASK, + IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_SHIFT); + + addr_hi = get_field_from_reg_u32( + entry[1], + IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_MASK, + IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_SHIFT); + + ptr = (addr_hi << 32) | (addr_lo << PAGE_SHIFT); + return ptr; +} + +static int amd_iommu_is_pte_present(u32 *entry) +{ + return (get_field_from_reg_u32(entry[0], + IOMMU_PDE_PRESENT_MASK, + IOMMU_PDE_PRESENT_SHIFT)); +} + +void invalidate_dev_table_entry(struct amd_iommu *iommu, + u16 device_id) +{ + u32 cmd[4], entry; + + cmd[3] = cmd[2] = 0; + set_field_in_reg_u32(device_id, 0, + IOMMU_INV_DEVTAB_ENTRY_DEVICE_ID_MASK, + IOMMU_INV_DEVTAB_ENTRY_DEVICE_ID_SHIFT, &entry); + cmd[0] = entry; + + set_field_in_reg_u32(IOMMU_CMD_INVALIDATE_DEVTAB_ENTRY, 0, + IOMMU_CMD_OPCODE_MASK, IOMMU_CMD_OPCODE_SHIFT, + &entry); + cmd[1] = entry; + + send_iommu_command(iommu, cmd); +} + +int amd_iommu_is_dte_page_translation_valid(u32 *entry) +{ + return (get_field_from_reg_u32(entry[0], + IOMMU_DEV_TABLE_VALID_MASK, + IOMMU_DEV_TABLE_VALID_SHIFT) && + get_field_from_reg_u32(entry[0], + IOMMU_DEV_TABLE_TRANSLATION_VALID_MASK, + IOMMU_DEV_TABLE_TRANSLATION_VALID_SHIFT)); +} + +static u64 iommu_l2e_from_pfn(struct page_info *table, int level, + unsigned long io_pfn) +{ + unsigned long offset; + void *pde = NULL; + void *table_vaddr; + u64 next_table_maddr = 0; + + BUG_ON( table == NULL || level == 0 ); + + while ( level > 1 ) + { + offset = io_pfn >> ((PTE_PER_TABLE_SHIFT * + (level - IOMMU_PAGING_MODE_LEVEL_1))); + offset &= ~PTE_PER_TABLE_MASK; + + table_vaddr = map_domain_page(page_to_mfn(table)); + pde = table_vaddr + (offset * IOMMU_PAGE_TABLE_ENTRY_SIZE); + next_table_maddr = amd_iommu_get_next_table_from_pte(pde); + + if ( !amd_iommu_is_pte_present(pde) ) + { + if ( next_table_maddr == 0 ) + { + table = alloc_amd_iommu_pgtable(); + if ( table == NULL ) + return 0; + next_table_maddr = page_to_maddr(table); + amd_iommu_set_page_directory_entry( + (u32 *)pde, next_table_maddr, level - 1); + } + else /* should never reach here */ + return 0; + } + + unmap_domain_page(table_vaddr); + table = maddr_to_page(next_table_maddr); + level--; + } + + return next_table_maddr; +} + +int amd_iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn) +{ + u64 iommu_l2e; + struct hvm_iommu *hd = domain_hvm_iommu(d); + int iw = IOMMU_IO_WRITE_ENABLED; + int ir = IOMMU_IO_READ_ENABLED; + + BUG_ON( !hd->root_table ); + + spin_lock(&hd->mapping_lock); + + if ( is_hvm_domain(d) && !hd->p2m_synchronized ) + goto out; + + iommu_l2e = iommu_l2e_from_pfn(hd->root_table, hd->paging_mode, gfn); + if ( iommu_l2e == 0 ) + { + spin_unlock(&hd->mapping_lock); + AMD_IOMMU_DEBUG("Invalid IO pagetable entry gfn = %lx\n", gfn); + return -EFAULT; + } + set_iommu_l1e_present(iommu_l2e, gfn, (u64)mfn << PAGE_SHIFT, iw, ir); + +out: + spin_unlock(&hd->mapping_lock); + return 0; +} + +int amd_iommu_unmap_page(struct domain *d, unsigned 
long gfn) +{ + u64 iommu_l2e; + unsigned long flags; + struct amd_iommu *iommu; + struct hvm_iommu *hd = domain_hvm_iommu(d); + + BUG_ON( !hd->root_table ); + + spin_lock(&hd->mapping_lock); + + if ( is_hvm_domain(d) && !hd->p2m_synchronized ) + { + spin_unlock(&hd->mapping_lock); + return 0; + } + + iommu_l2e = iommu_l2e_from_pfn(hd->root_table, hd->paging_mode, gfn); + + if ( iommu_l2e == 0 ) + { + spin_unlock(&hd->mapping_lock); + AMD_IOMMU_DEBUG("Invalid IO pagetable entry gfn = %lx\n", gfn); + return -EFAULT; + } + + /* mark PTE as 'page not present' */ + clear_iommu_l1e_present(iommu_l2e, gfn); + spin_unlock(&hd->mapping_lock); + + /* send INVALIDATE_IOMMU_PAGES command */ + for_each_amd_iommu ( iommu ) + { + spin_lock_irqsave(&iommu->lock, flags); + invalidate_iommu_page(iommu, (u64)gfn << PAGE_SHIFT, hd->domain_id); + flush_command_buffer(iommu); + spin_unlock_irqrestore(&iommu->lock, flags); + } + + return 0; +} + +int amd_iommu_reserve_domain_unity_map( + struct domain *domain, + unsigned long phys_addr, + unsigned long size, int iw, int ir) +{ + u64 iommu_l2e; + unsigned long npages, i; + struct hvm_iommu *hd = domain_hvm_iommu(domain); + + npages = region_to_pages(phys_addr, size); + + spin_lock(&hd->mapping_lock); + for ( i = 0; i < npages; ++i ) + { + iommu_l2e = iommu_l2e_from_pfn( + hd->root_table, hd->paging_mode, phys_addr >> PAGE_SHIFT); + + if ( iommu_l2e == 0 ) + { + spin_unlock(&hd->mapping_lock); + AMD_IOMMU_DEBUG("Invalid IO pagetable entry phys_addr = %lx\n", + phys_addr); + return -EFAULT; + } + + set_iommu_l1e_present(iommu_l2e, + (phys_addr >> PAGE_SHIFT), phys_addr, iw, ir); + + phys_addr += PAGE_SIZE; + } + spin_unlock(&hd->mapping_lock); + return 0; +} + +int amd_iommu_sync_p2m(struct domain *d) +{ + unsigned long mfn, gfn; + u64 iommu_l2e; + struct page_info *page; + struct hvm_iommu *hd; + int iw = IOMMU_IO_WRITE_ENABLED; + int ir = IOMMU_IO_READ_ENABLED; + + if ( !is_hvm_domain(d) ) + return 0; + + hd = domain_hvm_iommu(d); + + spin_lock(&hd->mapping_lock); + + if ( hd->p2m_synchronized ) + goto out; + + spin_lock(&d->page_alloc_lock); + + list_for_each_entry( page, &d->page_list, list ) + { + mfn = page_to_mfn(page); + gfn = get_gpfn_from_mfn(mfn); + + if ( gfn == INVALID_M2P_ENTRY ) + continue; + + iommu_l2e = iommu_l2e_from_pfn(hd->root_table, hd->paging_mode, gfn); + + if ( iommu_l2e == 0 ) + { + spin_unlock(&d->page_alloc_lock); + spin_unlock(&hd->mapping_lock); + AMD_IOMMU_DEBUG("Invalid IO pagetable entry gfn = %lx\n", gfn); + return -EFAULT; + } + + set_iommu_l1e_present(iommu_l2e, gfn, (u64)mfn << PAGE_SHIFT, iw, ir); + } + + spin_unlock(&d->page_alloc_lock); + + hd->p2m_synchronized = 1; + +out: + spin_unlock(&hd->mapping_lock); + return 0; +} + +void invalidate_all_iommu_pages(struct domain *d) +{ + u32 cmd[4], entry; + unsigned long flags; + struct amd_iommu *iommu; + int domain_id = d->domain_id; + u64 addr_lo = 0x7FFFFFFFFFFFF000ULL & DMA_32BIT_MASK; + u64 addr_hi = 0x7FFFFFFFFFFFF000ULL >> 32; + + set_field_in_reg_u32(domain_id, 0, + IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_MASK, + IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_SHIFT, &entry); + set_field_in_reg_u32(IOMMU_CMD_INVALIDATE_IOMMU_PAGES, entry, + IOMMU_CMD_OPCODE_MASK, IOMMU_CMD_OPCODE_SHIFT, + &entry); + cmd[1] = entry; + + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, 0, + IOMMU_INV_IOMMU_PAGES_S_FLAG_MASK, + IOMMU_INV_IOMMU_PAGES_S_FLAG_SHIFT, &entry); + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, + IOMMU_INV_IOMMU_PAGES_PDE_FLAG_MASK, + IOMMU_INV_IOMMU_PAGES_PDE_FLAG_SHIFT, &entry); + 
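    /*
     * Note on the address used above: with the S (size) flag enabled, the
     * invalidation range is encoded by the position of the lowest clear
     * address bit, so an address with bits 12..62 all set
     * (0x7FFFFFFFFFFFF000) requests a flush of the domain's entire address
     * space.  A self-contained sketch of that decoding, based on a reading
     * of the AMD IOMMU specification rather than on code in this patch:
     *
     *   #include <stdint.h>
     *   #include <stdio.h>
     *
     *   // Returns log2 of the invalidated size in bytes (64 == everything).
     *   static unsigned int ex_inv_range_log2(uint64_t addr, int s_flag)
     *   {
     *       unsigned int bit;
     *
     *       if ( !s_flag )
     *           return 12;                      // S=0: a single 4K page
     *       for ( bit = 12; bit < 64; bit++ )
     *           if ( !(addr & (1ULL << bit)) )
     *               break;                      // first clear bit ends the range
     *       return (bit + 1 > 64) ? 64 : bit + 1;
     *   }
     *
     *   int main(void)
     *   {
     *       printf("%u\n", ex_inv_range_log2(0x7FFFFFFFFFFFF000ULL, 1)); // 64
     *       printf("%u\n", ex_inv_range_log2(0x0000000000001000ULL, 1)); // 14 (16K)
     *       return 0;
     *   }
     */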
set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, entry, + IOMMU_INV_IOMMU_PAGES_ADDR_LOW_MASK, + IOMMU_INV_IOMMU_PAGES_ADDR_LOW_SHIFT, &entry); + cmd[2] = entry; + + set_field_in_reg_u32((u32)addr_hi, 0, + IOMMU_INV_IOMMU_PAGES_ADDR_HIGH_MASK, + IOMMU_INV_IOMMU_PAGES_ADDR_HIGH_SHIFT, &entry); + cmd[3] = entry; + + cmd[0] = 0; + + for_each_amd_iommu ( iommu ) + { + spin_lock_irqsave(&iommu->lock, flags); + send_iommu_command(iommu, cmd); + flush_command_buffer(iommu); + spin_unlock_irqrestore(&iommu->lock, flags); + } +} diff -Naurp xen/drivers/passthrough/amd/Makefile xen-redhat/drivers/passthrough/amd/Makefile --- xen/drivers/passthrough/amd/Makefile +++ xen-redhat/drivers/passthrough/amd/Makefile @@ -0,0 +1,6 @@ +obj-y += iommu_detect.o +obj-y += iommu_init.o +obj-y += iommu_map.o +obj-y += pci_amd_iommu.o +obj-y += iommu_acpi.o +obj-y += iommu_intr.o diff -Naurp xen/drivers/passthrough/amd/pci_amd_iommu.c xen-redhat/drivers/passthrough/amd/pci_amd_iommu.c --- xen/drivers/passthrough/amd/pci_amd_iommu.c +++ xen-redhat/drivers/passthrough/amd/pci_amd_iommu.c @@ -0,0 +1,429 @@ +/* + * Copyright (C) 2007 Advanced Micro Devices, Inc. + * Author: Leo Duran <leo.duran@amd.com> + * Author: Wei Wang <wei.wang2@amd.com> - adapted to xen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <xen/sched.h> +#include <xen/pci.h> +#include <xen/pci_regs.h> +#include <asm/amd-iommu.h> +#include <asm/hvm/svm/amd-iommu-proto.h> + +extern unsigned short ivrs_bdf_entries; +extern struct ivrs_mappings *ivrs_mappings; +extern void *int_remap_table; + +struct amd_iommu *find_iommu_for_device(int bus, int devfn) +{ + u16 bdf = (bus << 8) | devfn; + BUG_ON ( bdf >= ivrs_bdf_entries ); + return ivrs_mappings[bdf].iommu; +} + +/* + * Some devices will use alias id and original device id to index interrupt + * table and I/O page table respectively. Such devices will have + * both alias entry and select entry in IVRS structure. + + * Return original device id, if device has valid interrupt remapping + * table setup for both select entry and alias entry. 
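 *
 * Concretely: get_intremap_requestor_id() always returns the alias
 * (dte_requestor_id), while get_dma_requestor_id() below falls back to the
 * device's own bdf whenever both the device and its alias have an
 * interrupt remapping table, so interrupt remapping and DMA translation
 * may be indexed by different ids for the same device.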
+*/ +int get_dma_requestor_id(u16 bdf) +{ + int req_id; + + BUG_ON ( bdf >= ivrs_bdf_entries ); + req_id = ivrs_mappings[bdf].dte_requestor_id; + if ( (ivrs_mappings[bdf].intremap_table != NULL) && + (ivrs_mappings[req_id].intremap_table != NULL) ) + req_id = bdf; + + return req_id; +} + +static void amd_iommu_setup_domain_device( + struct domain *domain, struct amd_iommu *iommu, int bdf) +{ + void *dte; + unsigned long flags; + int req_id, valid = 1; + struct hvm_iommu *hd = domain_hvm_iommu(domain); + + BUG_ON( !hd->root_table || !hd->paging_mode || !iommu->dev_table.buffer ); + + if ( iommu_passthrough && (domain->domain_id == 0) ) + valid = 0; + + /* get device-table entry */ + req_id = get_dma_requestor_id(bdf); + dte = iommu->dev_table.buffer + (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE); + + spin_lock_irqsave(&iommu->lock, flags); + if ( !amd_iommu_is_dte_page_translation_valid((u32 *)dte) ) + { + /* bind DTE to domain page-tables */ + amd_iommu_set_root_page_table( + (u32 *)dte, page_to_maddr(hd->root_table), hd->domain_id, + hd->paging_mode, valid); + + invalidate_dev_table_entry(iommu, req_id); + flush_command_buffer(iommu); + + AMD_IOMMU_DEBUG("Setup I/O page table at DTE:0x%x, root_table:%" + PRIx64", domain_id:%d, paging_mode:%d\n", req_id, + (u64)page_to_maddr(hd->root_table), hd->domain_id, + hd->paging_mode); + } + spin_unlock_irqrestore(&iommu->lock, flags); +} + +static void amd_iommu_setup_dom0_devices(struct domain *d) +{ + struct amd_iommu *iommu; + struct pci_dev *pdev; + int bus, dev, func; + u32 l; + int bdf; + + spin_lock(&pcidevs_lock); + for ( bus = 0; bus < 256; bus++ ) + { + for ( dev = 0; dev < 32; dev++ ) + { + for ( func = 0; func < 8; func++ ) + { + l = pci_conf_read32(bus, dev, func, PCI_VENDOR_ID); + /* some broken boards return 0 or ~0 if a slot is empty: */ + if ( (l == 0xffffffff) || (l == 0x00000000) || + (l == 0x0000ffff) || (l == 0xffff0000) ) + continue; + + pdev = alloc_pdev(bus, PCI_DEVFN(dev, func)); + pdev->domain = d; + list_add(&pdev->domain_list, &d->arch.pdev_list); + + bdf = (bus << 8) | pdev->devfn; + /* supported device? */ + iommu = (bdf < ivrs_bdf_entries) ? 
+ find_iommu_for_device(bus, pdev->devfn) : NULL; + + if ( iommu ) + amd_iommu_setup_domain_device(d, iommu, bdf); + } + } + } + spin_unlock(&pcidevs_lock); +} + +int amd_iov_detect(void) +{ + INIT_LIST_HEAD(&amd_iommu_head); + + amd_iommu_detect_acpi(); + + if ( !iommu_found() ) + { + printk("AMD-Vi: IOMMU not found!\n"); + return -ENODEV; + } + + if ( amd_iommu_init() != 0 ) + { + printk ("AMD-Vi: Error initialization!\n"); + return -ENODEV; + } + + return 0; +} + +static int allocate_domain_resources(struct hvm_iommu *hd) +{ + /* allocate root table */ + spin_lock(&hd->mapping_lock); + if ( !hd->root_table ) + { + hd->root_table = alloc_amd_iommu_pgtable(); + if ( !hd->root_table ) + { + spin_unlock(&hd->mapping_lock); + return -ENOMEM; + } + } + spin_unlock(&hd->mapping_lock); + return 0; +} + +static int get_paging_mode(unsigned long entries) +{ + int level = 1; + + BUG_ON(!max_page); + + if ( entries > max_page ) + entries = max_page; + + while ( entries > PTE_PER_TABLE_SIZE ) + { + entries = PTE_PER_TABLE_ALIGN(entries) >> PTE_PER_TABLE_SHIFT; + if ( ++level > 6 ) + return -ENOMEM; + } + + return level; +} + +static int amd_iommu_domain_init(struct domain *domain) +{ + struct hvm_iommu *hd = domain_hvm_iommu(domain); + + /* allocate page directroy */ + if ( allocate_domain_resources(hd) != 0 ) + { + if ( hd->root_table ) + free_domheap_page(hd->root_table); + return -ENOMEM; + } + + hd->paging_mode = is_hvm_domain(domain)? + IOMMU_PAGE_TABLE_LEVEL_4 : get_paging_mode(max_page); + + if ( domain->domain_id == 0 ) + { + unsigned long i; + + if ( !iommu_passthrough ) + { + /* setup 1:1 page table for dom0 */ + for ( i = 0; i < max_page; i++ ) + amd_iommu_map_page(domain, i, i); + } + + amd_iommu_setup_dom0_devices(domain); + } + + hd->domain_id = domain->domain_id; + + return 0; +} + +static void amd_iommu_disable_domain_device( + struct domain *domain, struct amd_iommu *iommu, int bdf) +{ + void *dte; + unsigned long flags; + int req_id; + + req_id = get_dma_requestor_id(bdf); + dte = iommu->dev_table.buffer + (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE); + + spin_lock_irqsave(&iommu->lock, flags); + if ( amd_iommu_is_dte_page_translation_valid((u32 *)dte) ) + { + memset (dte, 0, IOMMU_DEV_TABLE_ENTRY_SIZE); + invalidate_dev_table_entry(iommu, req_id); + flush_command_buffer(iommu); + AMD_IOMMU_DEBUG("Disable DTE:0x%x," + " domain_id:%d, paging_mode:%d\n", + req_id, domain_hvm_iommu(domain)->domain_id, + domain_hvm_iommu(domain)->paging_mode); + } + spin_unlock_irqrestore(&iommu->lock, flags); +} + +static int reassign_device( struct domain *source, struct domain *target, + u8 bus, u8 devfn) +{ + struct pci_dev *pdev; + struct amd_iommu *iommu; + int bdf; + + ASSERT(spin_is_locked(&pcidevs_lock)); + pdev = pci_get_pdev_by_domain(source, bus, devfn); + if ( !pdev ) + return -ENODEV; + + bdf = (bus << 8) | devfn; + /* supported device? */ + iommu = (bdf < ivrs_bdf_entries) ? + find_iommu_for_device(bus, pdev->devfn) : NULL; + + if ( !iommu ) + { + AMD_IOMMU_DEBUG("Fail to find iommu." 
+ " %x:%x.%x cannot be assigned to domain %d\n", + bus, PCI_SLOT(devfn), PCI_FUNC(devfn), + target->domain_id); + return -ENODEV; + } + + amd_iommu_disable_domain_device(source, iommu, bdf); + + list_move(&pdev->domain_list, &target->arch.pdev_list); + pdev->domain = target; + + amd_iommu_setup_domain_device(target, iommu, bdf); + AMD_IOMMU_DEBUG("reassign %x:%x.%x domain %d -> domain %d\n", + bus, PCI_SLOT(devfn), PCI_FUNC(devfn), + source->domain_id, target->domain_id); + + return 0; +} + +static int amd_iommu_assign_device(struct domain *d, u8 bus, u8 devfn) +{ + int bdf = (bus << 8) | devfn; + int req_id = get_dma_requestor_id(bdf); + + amd_iommu_sync_p2m(d); + + if ( ivrs_mappings[req_id].unity_map_enable ) + { + amd_iommu_reserve_domain_unity_map( + d, + ivrs_mappings[req_id].addr_range_start, + ivrs_mappings[req_id].addr_range_length, + ivrs_mappings[req_id].write_permission, + ivrs_mappings[req_id].read_permission); + } + + return reassign_device(dom0, d, bus, devfn); +} + +static void deallocate_next_page_table(struct page_info* pg, int level) +{ + void *table_vaddr, *pde; + u64 next_table_maddr; + int index; + + table_vaddr = map_domain_page(page_to_mfn(pg)); + + if ( level > 1 ) + { + for ( index = 0; index < PTE_PER_TABLE_SIZE; index++ ) + { + pde = table_vaddr + (index * IOMMU_PAGE_TABLE_ENTRY_SIZE); + next_table_maddr = amd_iommu_get_next_table_from_pte(pde); + if ( next_table_maddr != 0 ) + { + deallocate_next_page_table( + maddr_to_page(next_table_maddr), level - 1); + } + } + } + + unmap_domain_page(table_vaddr); + free_amd_iommu_pgtable(pg); +} + +static void deallocate_iommu_page_tables(struct domain *d) +{ + struct hvm_iommu *hd = domain_hvm_iommu(d); + + spin_lock(&hd->mapping_lock); + if ( hd->root_table ) + { + deallocate_next_page_table(hd->root_table, hd->paging_mode); + hd->root_table = NULL; + } + spin_unlock(&hd->mapping_lock); +} + + +static void amd_iommu_domain_destroy(struct domain *d) +{ + deallocate_iommu_page_tables(d); + invalidate_all_iommu_pages(d); +} + +static int amd_iommu_return_device( + struct domain *s, struct domain *t, u8 bus, u8 devfn) +{ + return reassign_device(s, t, bus, devfn); +} + +static int amd_iommu_add_device(struct pci_dev *pdev) +{ + struct amd_iommu *iommu; + u16 bdf; + if ( !pdev->domain ) + return -EINVAL; + + bdf = (pdev->bus << 8) | pdev->devfn; + iommu = (bdf < ivrs_bdf_entries) ? + find_iommu_for_device(pdev->bus, pdev->devfn) : NULL; + + if ( !iommu ) + { + AMD_IOMMU_DEBUG("Fail to find iommu." + " %x:%x.%x cannot be assigned to domain %d\n", + pdev->bus, PCI_SLOT(pdev->devfn), + PCI_FUNC(pdev->devfn), pdev->domain->domain_id); + return -ENODEV; + } + + amd_iommu_setup_domain_device(pdev->domain, iommu, bdf); + return 0; +} + +static int amd_iommu_remove_device(struct pci_dev *pdev) +{ + struct amd_iommu *iommu; + u16 bdf; + if ( !pdev->domain ) + return -EINVAL; + + bdf = (pdev->bus << 8) | pdev->devfn; + iommu = (bdf < ivrs_bdf_entries) ? + find_iommu_for_device(pdev->bus, pdev->devfn) : NULL; + + if ( !iommu ) + { + AMD_IOMMU_DEBUG("Fail to find iommu." + " %x:%x.%x cannot be removed from domain %d\n", + pdev->bus, PCI_SLOT(pdev->devfn), + PCI_FUNC(pdev->devfn), pdev->domain->domain_id); + return -ENODEV; + } + + amd_iommu_disable_domain_device(pdev->domain, iommu, bdf); + return 0; +} + +static int amd_iommu_group_id(u8 bus, u8 devfn) +{ + int rt; + int bdf = (bus << 8) | devfn; + rt = ( bdf < ivrs_bdf_entries ) ? 
+ get_dma_requestor_id(bdf) : + bdf; + return rt; +} + +struct iommu_ops amd_iommu_ops = { + .init = amd_iommu_domain_init, + .add_device = amd_iommu_add_device, + .remove_device = amd_iommu_remove_device, + .assign_device = amd_iommu_assign_device, + .teardown = amd_iommu_domain_destroy, + .map_page = amd_iommu_map_page, + .unmap_page = amd_iommu_unmap_page, + .reassign_device = amd_iommu_return_device, + .get_device_group_id = amd_iommu_group_id, + .update_ire_from_apic = amd_iommu_ioapic_update_ire, + .update_ire_from_msi = amd_iommu_msi_msg_update_ire, +}; diff -Naurp xen/drivers/passthrough/io.c xen-redhat/drivers/passthrough/io.c --- xen/drivers/passthrough/io.c +++ xen-redhat/drivers/passthrough/io.c @@ -0,0 +1,436 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) Allen Kay <allen.m.kay@intel.com> + * Copyright (C) Xiaohui Xin <xiaohui.xin@intel.com> + */ + +#include <xen/event.h> +#include <xen/iommu.h> +#include <asm/hvm/irq.h> +#include <asm/hvm/iommu.h> +#include <xen/hvm/irq.h> + +static void pt_irq_time_out(void *data) +{ + struct hvm_mirq_dpci_mapping *irq_map = data; + unsigned int guest_gsi, machine_gsi = 0; + int vector; + struct hvm_irq_dpci *dpci = NULL; + struct dev_intx_gsi_link *digl; + uint32_t device, intx; + + spin_lock(&irq_map->dom->event_lock); + + dpci = domain_get_irq_dpci(irq_map->dom); + ASSERT(dpci); + list_for_each_entry ( digl, &irq_map->digl_list, list ) + { + guest_gsi = digl->gsi; + machine_gsi = dpci->girq[guest_gsi].machine_gsi; + device = digl->device; + intx = digl->intx; + hvm_pci_intx_deassert(irq_map->dom, device, intx); + } + + clear_bit(machine_gsi, dpci->dirq_mask); + vector = domain_irq_to_vector(irq_map->dom, machine_gsi); + dpci->mirq[machine_gsi].pending = 0; + spin_unlock(&irq_map->dom->event_lock); + pirq_guest_eoi(irq_map->dom, machine_gsi); +} + +int pt_irq_create_bind_vtd( + struct domain *d, xen_domctl_bind_pt_irq_t *pt_irq_bind) +{ + struct hvm_irq_dpci *hvm_irq_dpci = NULL; + uint32_t machine_gsi, guest_gsi; + uint32_t device, intx, link; + struct dev_intx_gsi_link *digl; + int rc, pirq = pt_irq_bind->machine_irq; + + if ( pirq < 0 || pirq >= NR_IRQS ) + return -EINVAL; + + spin_lock(&d->event_lock); + + hvm_irq_dpci = domain_get_irq_dpci(d); + if ( hvm_irq_dpci == NULL ) + { + hvm_irq_dpci = xmalloc(struct hvm_irq_dpci); + if ( hvm_irq_dpci == NULL ) + { + spin_unlock(&d->event_lock); + return -ENOMEM; + } + memset(hvm_irq_dpci, 0, sizeof(*hvm_irq_dpci)); + for ( int i = 0; i < NR_IRQS; i++ ) + INIT_LIST_HEAD(&hvm_irq_dpci->mirq[i].digl_list); + } + + if ( domain_set_irq_dpci(d, hvm_irq_dpci) == 0 ) + { + xfree(hvm_irq_dpci); + spin_unlock(&d->event_lock); + return -EINVAL; + } + + if ( pt_irq_bind->irq_type == PT_IRQ_TYPE_MSI ) + { +#ifdef SUPPORT_MSI_REMAPPING + if ( !test_and_set_bit(pirq, hvm_irq_dpci->mapping)) + { + 
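            /*
             * First time this pirq is bound as an MSI: record the guest
             * vector and flags (consumed later by the vMSI injection and
             * EOI paths) before binding the pirq, and unwind that state
             * again if either pirq_guest_bind() or msixtbl_pt_register()
             * fails.
             */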
set_bit(_HVM_IRQ_DPCI_MSI, &hvm_irq_dpci->mirq[pirq].flags); + hvm_irq_dpci->mirq[pirq].gmsi.gvec = pt_irq_bind->u.msi.gvec; + hvm_irq_dpci->mirq[pirq].gmsi.gflags = pt_irq_bind->u.msi.gflags; + hvm_irq_dpci->msi_gvec_pirq[pt_irq_bind->u.msi.gvec] = pirq; + /* bind after hvm_irq_dpci is setup to avoid race with irq handler*/ + rc = pirq_guest_bind(d->vcpu[0], pirq, 0); + if ( rc == 0 ) + { + rc = msixtbl_pt_register(d, pirq); + if ( unlikely(rc) ) + pirq_guest_unbind(d, pirq); + } + if ( unlikely(rc) ) + { + hvm_irq_dpci->msi_gvec_pirq[pt_irq_bind->u.msi.gvec] = 0; + hvm_irq_dpci->mirq[pirq].gmsi.gflags = 0; + hvm_irq_dpci->mirq[pirq].gmsi.gvec = 0; + clear_bit(_HVM_IRQ_DPCI_MSI, &hvm_irq_dpci->mirq[pirq].flags); + clear_bit(pirq, hvm_irq_dpci->mapping); + spin_unlock(&d->event_lock); + return rc; + } + } + else + { + uint32_t old_gvec; + + if ( !test_bit(_HVM_IRQ_DPCI_MSI, &hvm_irq_dpci->mirq[pirq].flags) ) + { + spin_unlock(&d->event_lock); + return -EBUSY; + } + + /* if pirq is already mapped as vmsi, update the guest data/addr */ + old_gvec = hvm_irq_dpci->mirq[pirq].gmsi.gvec; + hvm_irq_dpci->msi_gvec_pirq[old_gvec] = 0; + hvm_irq_dpci->mirq[pirq].gmsi.gvec = pt_irq_bind->u.msi.gvec; + hvm_irq_dpci->mirq[pirq].gmsi.gflags = pt_irq_bind->u.msi.gflags; + hvm_irq_dpci->msi_gvec_pirq[pt_irq_bind->u.msi.gvec] = pirq; + } +#else + return -ENOSYS; +#endif + } + else + { + machine_gsi = pt_irq_bind->machine_irq; + device = pt_irq_bind->u.pci.device; + intx = pt_irq_bind->u.pci.intx; + guest_gsi = hvm_pci_intx_gsi(device, intx); + link = hvm_pci_intx_link(device, intx); + hvm_irq_dpci->link_cnt[link]++; + + digl = xmalloc(struct dev_intx_gsi_link); + if ( !digl ) + { + spin_unlock(&d->event_lock); + return -ENOMEM; + } + + digl->device = device; + digl->intx = intx; + digl->gsi = guest_gsi; + digl->link = link; + list_add_tail(&digl->list, + &hvm_irq_dpci->mirq[machine_gsi].digl_list); + + hvm_irq_dpci->girq[guest_gsi].valid = 1; + hvm_irq_dpci->girq[guest_gsi].device = device; + hvm_irq_dpci->girq[guest_gsi].intx = intx; + hvm_irq_dpci->girq[guest_gsi].machine_gsi = machine_gsi; + + /* Bind the same mirq once in the same domain */ + if ( !test_and_set_bit(machine_gsi, hvm_irq_dpci->mapping)) + { + unsigned int vector = domain_irq_to_vector(d, machine_gsi); + + hvm_irq_dpci->mirq[machine_gsi].dom = d; + + /* Init timer before binding */ + init_timer(&hvm_irq_dpci->hvm_timer[vector], + pt_irq_time_out, &hvm_irq_dpci->mirq[machine_gsi], 0); + /* Deal with gsi for legacy devices */ + rc = pirq_guest_bind(d->vcpu[0], machine_gsi, BIND_PIRQ__WILL_SHARE); + if ( unlikely(rc) ) + { + kill_timer(&hvm_irq_dpci->hvm_timer[vector]); + hvm_irq_dpci->mirq[machine_gsi].dom = NULL; + clear_bit(machine_gsi, hvm_irq_dpci->mapping); + hvm_irq_dpci->girq[guest_gsi].machine_gsi = 0; + hvm_irq_dpci->girq[guest_gsi].intx = 0; + hvm_irq_dpci->girq[guest_gsi].device = 0; + hvm_irq_dpci->girq[guest_gsi].valid = 0; + list_del(&digl->list); + hvm_irq_dpci->link_cnt[link]--; + spin_unlock(&d->event_lock); + xfree(digl); + return rc; + } + } + + gdprintk(XENLOG_INFO VTDPREFIX, + "VT-d irq bind: m_irq = %x device = %x intx = %x\n", + machine_gsi, device, intx); + } + spin_unlock(&d->event_lock); + return 0; +} + +int pt_irq_destroy_bind_vtd( + struct domain *d, xen_domctl_bind_pt_irq_t *pt_irq_bind) +{ + struct hvm_irq_dpci *hvm_irq_dpci = NULL; + uint32_t machine_gsi, guest_gsi; + uint32_t device, intx, link; + struct list_head *digl_list, *tmp; + struct dev_intx_gsi_link *digl; + + machine_gsi = pt_irq_bind->machine_irq; 
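    /*
     * Unbinding mirrors the bind path above: only the dev/intx/gsi link
     * matching this request is removed, and the machine GSI itself is
     * unbound (and its timeout timer killed) only once no guest device
     * link refers to it any more.
     */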
+ device = pt_irq_bind->u.pci.device; + intx = pt_irq_bind->u.pci.intx; + guest_gsi = hvm_pci_intx_gsi(device, intx); + link = hvm_pci_intx_link(device, intx); + + gdprintk(XENLOG_INFO, + "pt_irq_destroy_bind_vtd: machine_gsi=%d " + "guest_gsi=%d, device=%d, intx=%d.\n", + machine_gsi, guest_gsi, device, intx); + spin_lock(&d->event_lock); + + hvm_irq_dpci = domain_get_irq_dpci(d); + + if ( hvm_irq_dpci == NULL ) + { + spin_unlock(&d->event_lock); + return -EINVAL; + } + + hvm_irq_dpci->link_cnt[link]--; + memset(&hvm_irq_dpci->girq[guest_gsi], 0, + sizeof(struct hvm_girq_dpci_mapping)); + + /* clear the mirq info */ + if ( test_bit(machine_gsi, hvm_irq_dpci->mapping)) + { + list_for_each_safe ( digl_list, tmp, + &hvm_irq_dpci->mirq[machine_gsi].digl_list ) + { + digl = list_entry(digl_list, + struct dev_intx_gsi_link, list); + if ( digl->device == device && + digl->intx == intx && + digl->link == link && + digl->gsi == guest_gsi ) + { + list_del(&digl->list); + xfree(digl); + } + } + + if ( list_empty(&hvm_irq_dpci->mirq[machine_gsi].digl_list) ) + { + pirq_guest_unbind(d, machine_gsi); +#ifdef SUPPORT_MSI_REMAPPING + msixtbl_pt_unregister(d, machine_gsi); +#endif + kill_timer(&hvm_irq_dpci->hvm_timer[domain_irq_to_vector(d, machine_gsi)]); + hvm_irq_dpci->mirq[machine_gsi].dom = NULL; + hvm_irq_dpci->mirq[machine_gsi].flags = 0; + clear_bit(machine_gsi, hvm_irq_dpci->mapping); + } + } + spin_unlock(&d->event_lock); + gdprintk(XENLOG_INFO, + "XEN_DOMCTL_irq_unmapping: m_irq = %x device = %x intx = %x\n", + machine_gsi, device, intx); + + return 0; +} + +int hvm_do_IRQ_dpci(struct domain *d, unsigned int mirq) +{ + struct hvm_irq_dpci *dpci = domain_get_irq_dpci(d); + + ASSERT(spin_is_locked(&irq_desc[domain_irq_to_vector(d, mirq)].lock)); + if ( !iommu_enabled || (d == dom0) || !dpci || + !test_bit(mirq, dpci->mapping)) + return 0; + + /* + * Set a timer here to avoid situations where the IRQ line is shared, and + * the device belonging to the pass-through guest is not yet active. In + * this case the guest may not pick up the interrupt (e.g., masked at the + * PIC) and we need to detect that. 
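 * If the guest never services it, pt_irq_time_out() above deasserts the
 * guest's INTx lines and EOIs the machine GSI, so a line shared with
 * another device is not left blocked indefinitely.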
+ */ + set_bit(mirq, dpci->dirq_mask); + if ( !test_bit(_HVM_IRQ_DPCI_MSI, &dpci->mirq[mirq].flags) ) + set_timer(&dpci->hvm_timer[domain_irq_to_vector(d, mirq)], + NOW() + PT_IRQ_TIME_OUT); + vcpu_kick(d->vcpu[0]); + + return 1; +} + +#ifdef SUPPORT_MSI_REMAPPING +void hvm_dpci_msi_eoi(struct domain *d, int vector) +{ + struct hvm_irq_dpci *hvm_irq_dpci = d->arch.hvm_domain.irq.dpci; + irq_desc_t *desc; + int pirq; + + if ( !iommu_enabled || (hvm_irq_dpci == NULL) ) + return; + + spin_lock(&d->event_lock); + pirq = hvm_irq_dpci->msi_gvec_pirq[vector]; + + if ( ( pirq >= 0 ) && (pirq < NR_IRQS) && + test_bit(pirq, hvm_irq_dpci->mapping) && + (test_bit(_HVM_IRQ_DPCI_MSI, &hvm_irq_dpci->mirq[pirq].flags))) + { + BUG_ON(!local_irq_is_enabled()); + desc = domain_spin_lock_irq_desc(d, pirq, NULL); + if (!desc) + { + spin_unlock(&d->event_lock); + return; + } + + desc->status &= ~IRQ_INPROGRESS; + spin_unlock_irq(&desc->lock); + + pirq_guest_eoi(d, pirq); + } + + spin_unlock(&d->event_lock); +} + +extern int vmsi_deliver(struct domain *d, int pirq); +static int hvm_pci_msi_assert(struct domain *d, int pirq) +{ + return vmsi_deliver(d, pirq); +} +#endif + +void hvm_dirq_assist(struct vcpu *v) +{ + unsigned int irq; + uint32_t device, intx; + struct domain *d = v->domain; + struct hvm_irq_dpci *hvm_irq_dpci = d->arch.hvm_domain.irq.dpci; + struct dev_intx_gsi_link *digl; + + if ( !iommu_enabled || (v->vcpu_id != 0) || (hvm_irq_dpci == NULL) ) + return; + + for ( irq = find_first_bit(hvm_irq_dpci->dirq_mask, NR_IRQS); + irq < NR_IRQS; + irq = find_next_bit(hvm_irq_dpci->dirq_mask, NR_IRQS, irq + 1) ) + { + if ( !test_and_clear_bit(irq, &hvm_irq_dpci->dirq_mask) ) + continue; + + spin_lock(&d->event_lock); +#ifdef SUPPORT_MSI_REMAPPING + if ( test_bit(_HVM_IRQ_DPCI_MSI, &hvm_irq_dpci->mirq[irq].flags) ) + { + hvm_pci_msi_assert(d, irq); + spin_unlock(&d->event_lock); + continue; + } +#endif + stop_timer(&hvm_irq_dpci->hvm_timer[domain_irq_to_vector(d, irq)]); + + list_for_each_entry ( digl, &hvm_irq_dpci->mirq[irq].digl_list, list ) + { + device = digl->device; + intx = digl->intx; + hvm_pci_intx_assert(d, device, intx); + hvm_irq_dpci->mirq[irq].pending++; + } + + /* + * Set a timer to see if the guest can finish the interrupt or not. For + * example, the guest OS may unmask the PIC during boot, before the + * guest driver is loaded. hvm_pci_intx_assert() may succeed, but the + * guest will never deal with the irq, then the physical interrupt line + * will never be deasserted. 
+ */ + set_timer(&hvm_irq_dpci->hvm_timer[domain_irq_to_vector(d, irq)], + NOW() + PT_IRQ_TIME_OUT); + spin_unlock(&d->event_lock); + } +} + +void hvm_dpci_eoi(struct domain *d, unsigned int guest_gsi, + union vioapic_redir_entry *ent) +{ + struct hvm_irq_dpci *hvm_irq_dpci = NULL; + uint32_t device, intx, machine_gsi; + + if ( !iommu_enabled) + return; + + if ( guest_gsi < NR_ISAIRQS ) + { + hvm_dpci_isairq_eoi(d, guest_gsi); + return; + } + + spin_lock(&d->event_lock); + hvm_irq_dpci = domain_get_irq_dpci(d); + + if((hvm_irq_dpci == NULL) || + (guest_gsi >= NR_ISAIRQS && + !hvm_irq_dpci->girq[guest_gsi].valid) ) + { + spin_unlock(&d->event_lock); + return; + } + + device = hvm_irq_dpci->girq[guest_gsi].device; + intx = hvm_irq_dpci->girq[guest_gsi].intx; + hvm_pci_intx_deassert(d, device, intx); + + machine_gsi = hvm_irq_dpci->girq[guest_gsi].machine_gsi; + if ( --hvm_irq_dpci->mirq[machine_gsi].pending == 0 ) + { + if ( (ent == NULL) || !ent->fields.mask ) + { + /* + * No need to get vector lock for timer + * since interrupt is still not EOIed + */ + stop_timer(&hvm_irq_dpci->hvm_timer[ + domain_irq_to_vector(d, machine_gsi)]); + pirq_guest_eoi(d, machine_gsi); + } + } + spin_unlock(&d->event_lock); +} diff -Naurp xen/drivers/passthrough/iommu.c xen-redhat/drivers/passthrough/iommu.c --- xen/drivers/passthrough/iommu.c +++ xen-redhat/drivers/passthrough/iommu.c @@ -0,0 +1,310 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + */ + +#include <xen/sched.h> +#include <xen/iommu.h> +#include <asm/hvm/iommu.h> +#include <xen/paging.h> +#include <xen/guest_access.h> + +static void parse_iommu_param(char *s); +int intel_vtd_setup(void); + +/* + * The 'iommu' parameter enables the IOMMU. 
Optional comma separated + * value may contain: + * + * off|no|false|disable Disable IOMMU (default) + * force|required Don't boot unless IOMMU is enabled + * passthrough Bypass VT-d translation for Dom0 + * snoop Utilize the snoop control for IOMMU (default) + * no-snoop Dont utilize the snoop control for IOMMU + * amd-iommu-debug Turn on debug info for AMD IOMMU + */ +custom_param("iommu", parse_iommu_param); +int iommu_enabled = 0; +int force_iommu = 0; +int iommu_passthrough = 0; +int iommu_snoop = 0; +int iommu_intremap = 0; +int iommu_intremap_cmdline = 0; +int amd_iommu_debug=0; +int amd_iommu_perdev_intremap = 0; + +static void __init parse_iommu_param(char *s) +{ + char *ss; + iommu_enabled = 1; + iommu_snoop = 1; + iommu_intremap = 1; + iommu_intremap_cmdline = 1; + amd_iommu_debug = 0; + amd_iommu_perdev_intremap = 0; + + do { + ss = strchr(s, ','); + if ( ss ) + *ss = '\0'; + + if ( !strcmp(s, "off") || !strcmp(s, "no") || !strcmp(s, "false") || + !strcmp(s, "0") || !strcmp(s, "disable") ) + iommu_enabled = 0; + else if ( !strcmp(s, "force") || !strcmp(s, "required") ) + force_iommu = 1; + else if ( !strcmp(s, "passthrough") ) + iommu_passthrough = 1; + else if ( !strcmp(s, "snoop") ) + iommu_snoop = 1; + else if ( !strcmp(s, "no-snoop") ) + iommu_snoop = 0; + else if ( !strcmp(s, "no-intremap") ) + iommu_intremap = iommu_intremap_cmdline = 0; + else if ( !strcmp(s, "amd-iommu-debug") ) + amd_iommu_debug = 1; + else if ( !strcmp(s, "amd-iommu-perdev-intremap") ) + amd_iommu_perdev_intremap = 1; + + s = ss + 1; + } while ( ss ); +} + +int iommu_domain_init(struct domain *domain) +{ + struct hvm_iommu *hd = domain_hvm_iommu(domain); + + spin_lock_init(&hd->mapping_lock); + INIT_LIST_HEAD(&hd->g2m_ioport_list); + + if ( !iommu_enabled ) + return 0; + + hd->platform_ops = iommu_get_ops(); + return hd->platform_ops->init(domain); +} + +int iommu_add_device(struct pci_dev *pdev) +{ + struct hvm_iommu *hd; + + if ( !pdev->domain ) + return -EINVAL; + + ASSERT(spin_is_locked(&pcidevs_lock)); + + hd = domain_hvm_iommu(pdev->domain); + if ( !iommu_enabled || !hd->platform_ops ) + return 0; + + return hd->platform_ops->add_device(pdev); +} + +int iommu_remove_device(struct pci_dev *pdev) +{ + struct hvm_iommu *hd; + if ( !pdev->domain ) + return -EINVAL; + + hd = domain_hvm_iommu(pdev->domain); + if ( !iommu_enabled || !hd->platform_ops ) + return 0; + + return hd->platform_ops->remove_device(pdev); +} + +/* + * If the device isn't owned by dom0, it means it already + * has been assigned to another domain, or it doesn't exist. 
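 * Returns 0 if the device can be handed to d (it is still owned by dom0
 * and the platform hook, where implemented, does not veto the assignment),
 * -EINVAL otherwise.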
+ */ +int device_assignable(struct domain *d, u8 bus, u8 devfn) +{ + struct pci_dev *pdev; + struct hvm_iommu *hd; + + spin_lock(&pcidevs_lock); + pdev = pci_get_pdev_by_domain(dom0, bus, devfn); + if (!pdev) + { + spin_unlock(&pcidevs_lock); + return -EINVAL; + } + spin_unlock(&pcidevs_lock); + + hd = domain_hvm_iommu(d); + if ( hd->platform_ops && hd->platform_ops->assignable && + !hd->platform_ops->assignable(d) ) + return -EINVAL; + + return 0; +} + +int assign_device(struct domain *d, u8 bus, u8 devfn) +{ + struct hvm_iommu *hd = domain_hvm_iommu(d); + int rc = 0; + + if ( !iommu_enabled || !hd->platform_ops ) + return 0; + + spin_lock(&pcidevs_lock); + rc = hd->platform_ops->assign_device(d, bus, devfn); + spin_unlock(&pcidevs_lock); + return rc; +} + +void iommu_domain_destroy(struct domain *d) +{ + struct hvm_iommu *hd = domain_hvm_iommu(d); + struct list_head *ioport_list, *tmp; + struct g2m_ioport *ioport; + + if ( !iommu_enabled || !hd->platform_ops ) + return; + + if ( hd ) + { + list_for_each_safe ( ioport_list, tmp, &hd->g2m_ioport_list ) + { + ioport = list_entry(ioport_list, struct g2m_ioport, list); + list_del(&ioport->list); + xfree(ioport); + } + } + + return hd->platform_ops->teardown(d); +} + +int iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn) +{ + struct hvm_iommu *hd = domain_hvm_iommu(d); + + if ( !iommu_enabled || !hd->platform_ops ) + return 0; + + return hd->platform_ops->map_page(d, gfn, mfn); +} + +int iommu_unmap_page(struct domain *d, unsigned long gfn) +{ + struct hvm_iommu *hd = domain_hvm_iommu(d); + + if ( !iommu_enabled || !hd->platform_ops ) + return 0; + + return hd->platform_ops->unmap_page(d, gfn); +} + +/* caller should hold the pcidevs_lock */ +int deassign_device(struct domain *d, u8 bus, u8 devfn) +{ + struct hvm_iommu *hd = domain_hvm_iommu(d); + struct pci_dev *pdev = NULL; + + if ( !iommu_enabled || !hd->platform_ops ) + return -EINVAL; + + ASSERT(spin_is_locked(&pcidevs_lock)); + pdev = pci_get_pdev(bus, devfn); + if (!pdev) + return -ENODEV; + + if (pdev->domain != d) + { + gdprintk(XENLOG_ERR VTDPREFIX, + "IOMMU: deassign a device not owned\n"); + return -EINVAL; + } + + return hd->platform_ops->reassign_device(d, dom0, bus, devfn); +} + +int iommu_setup(void) +{ + int rc = -ENODEV; + + if ( !iommu_enabled ) + goto out; + + rc = iommu_hardware_setup(); + + iommu_enabled = (rc == 0); + + out: + if ( force_iommu && !iommu_enabled ) + panic("IOMMU setup failed, crash Xen for security purpose!\n"); + + printk("I/O virtualisation %sabled\n", iommu_enabled ? 
"en" : "dis"); + return rc; +} + +int iommu_get_device_group(struct domain *d, u8 bus, u8 devfn, + XEN_GUEST_HANDLE_64(uint32_t) buf, int max_sdevs) +{ + struct hvm_iommu *hd = domain_hvm_iommu(d); + struct pci_dev *pdev; + int group_id, sdev_id; + u32 bdf; + int i = 0; + struct iommu_ops *ops = hd->platform_ops; + + if ( !iommu_enabled || !ops || !ops->get_device_group_id ) + return 0; + + group_id = ops->get_device_group_id(bus, devfn); + + spin_lock(&pcidevs_lock); + for_each_pdev( d, pdev ) + { + if ( (pdev->bus == bus) && (pdev->devfn == devfn) ) + continue; + + sdev_id = ops->get_device_group_id(pdev->bus, pdev->devfn); + if ( (sdev_id == group_id) && (i < max_sdevs) ) + { + bdf = 0; + bdf |= (pdev->bus & 0xff) << 16; + bdf |= (pdev->devfn & 0xff) << 8; + if ( unlikely(copy_to_guest_offset(buf, i, &bdf, 1)) ) + { + spin_unlock(&pcidevs_lock); + return -1; + } + i++; + } + } + spin_unlock(&pcidevs_lock); + + return i; +} + +void iommu_update_ire_from_apic( + unsigned int apic, unsigned int reg, unsigned int value) +{ + struct iommu_ops *ops = iommu_get_ops(); + ops->update_ire_from_apic(apic, reg, value); +} +void iommu_update_ire_from_msi( + struct msi_desc *msi_desc, struct msi_msg *msg) +{ + struct iommu_ops *ops = iommu_get_ops(); + ops->update_ire_from_msi(msi_desc, msg); +} +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */ diff -Naurp xen/drivers/passthrough/Makefile xen-redhat/drivers/passthrough/Makefile --- xen/drivers/passthrough/Makefile +++ xen-redhat/drivers/passthrough/Makefile @@ -0,0 +1,8 @@ +subdir-$(x86_32) += vtd +subdir-$(x86_64) += vtd +subdir-$(x86_32) += amd +subdir-$(x86_64) += amd + +obj-y += iommu.o +obj-y += pci.o +obj-y += io.o diff -Naurp xen/drivers/passthrough/pci.c xen-redhat/drivers/passthrough/pci.c --- xen/drivers/passthrough/pci.c +++ xen-redhat/drivers/passthrough/pci.c @@ -0,0 +1,220 @@ +/* + * Copyright (C) 2008, Netronome Systems, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ */ + +#include <xen/sched.h> +#include <xen/pci.h> +#include <xen/pci_regs.h> +#include <xen/list.h> +#include <xen/prefetch.h> +#include <xen/iommu.h> +#include <asm/hvm/iommu.h> +#include <asm/hvm/irq.h> +#include <xen/delay.h> +#include <xen/keyhandler.h> + + +LIST_HEAD(alldevs_list); +spinlock_t pcidevs_lock = SPIN_LOCK_UNLOCKED; + +struct pci_dev *alloc_pdev(u8 bus, u8 devfn) +{ + struct pci_dev *pdev; + + list_for_each_entry ( pdev, &alldevs_list, alldevs_list ) + if ( pdev->bus == bus && pdev->devfn == devfn ) + return pdev; + + pdev = xmalloc(struct pci_dev); + if ( !pdev ) + return NULL; + memset(pdev, 0, sizeof(struct pci_dev)); + + *((u8*) &pdev->bus) = bus; + *((u8*) &pdev->devfn) = devfn; + pdev->domain = NULL; + INIT_LIST_HEAD(&pdev->msi_list); + list_add(&pdev->alldevs_list, &alldevs_list); + spin_lock_init(&pdev->msix_table_lock); + + return pdev; +} + +void free_pdev(struct pci_dev *pdev) +{ + list_del(&pdev->alldevs_list); + xfree(pdev); +} + +struct pci_dev *pci_get_pdev(int bus, int devfn) +{ + struct pci_dev *pdev = NULL; + + ASSERT(spin_is_locked(&pcidevs_lock)); + + list_for_each_entry ( pdev, &alldevs_list, alldevs_list ) + if ( (pdev->bus == bus || bus == -1) && + (pdev->devfn == devfn || devfn == -1) ) + { + return pdev; + } + + return NULL; +} + +struct pci_dev *pci_get_pdev_by_domain(struct domain *d, int bus, int devfn) +{ + struct pci_dev *pdev = NULL; + + ASSERT(spin_is_locked(&pcidevs_lock)); + + list_for_each_entry ( pdev, &alldevs_list, alldevs_list ) + if ( (pdev->bus == bus || bus == -1) && + (pdev->devfn == devfn || devfn == -1) && + (pdev->domain == d) ) + { + return pdev; + } + + return NULL; +} + +int pci_add_device(u8 bus, u8 devfn) +{ + struct pci_dev *pdev; + int ret = -ENOMEM; + + spin_lock(&pcidevs_lock); + pdev = alloc_pdev(bus, devfn); + if ( !pdev ) + goto out; + + ret = 0; + if ( !pdev->domain ) + { + pdev->domain = dom0; + ret = iommu_add_device(pdev); + if ( ret ) + goto out; + + list_add(&pdev->domain_list, &dom0->arch.pdev_list); + } + +out: + spin_unlock(&pcidevs_lock); + printk(XENLOG_DEBUG "PCI add device %02x:%02x.%x\n", bus, + PCI_SLOT(devfn), PCI_FUNC(devfn)); + return ret; +} + +int pci_remove_device(u8 bus, u8 devfn) +{ + struct pci_dev *pdev; + int ret = -ENODEV;; + + spin_lock(&pcidevs_lock); + list_for_each_entry ( pdev, &alldevs_list, alldevs_list ) + if ( pdev->bus == bus && pdev->devfn == devfn ) + { + ret = iommu_remove_device(pdev); + if ( pdev->domain ) + list_del(&pdev->domain_list); + pci_cleanup_msi(pdev); + free_pdev(pdev); + printk(XENLOG_DEBUG "PCI remove device %02x:%02x.%x\n", bus, + PCI_SLOT(devfn), PCI_FUNC(devfn)); + break; + } + + spin_unlock(&pcidevs_lock); + return ret; +} + +int pci_set_device_msixtbl(u8 bus, u8 devfn, u64 gtable) +{ + struct pci_dev *pdev; + + pdev = pci_get_pdev(bus, devfn); + + if ( !pdev ) + return -ENODEV; + + pdev->msix_table = gtable; + + return 0; +} + +static void pci_clean_dpci_irqs(struct domain *d) +{ + struct hvm_irq_dpci *hvm_irq_dpci = NULL; + uint32_t i; + struct list_head *digl_list, *tmp; + struct dev_intx_gsi_link *digl; + + if ( !iommu_enabled ) + return; + + spin_lock(&d->event_lock); + hvm_irq_dpci = domain_get_irq_dpci(d); + if ( hvm_irq_dpci != NULL ) + { + for ( i = find_first_bit(hvm_irq_dpci->mapping, NR_IRQS); + i < NR_IRQS; + i = find_next_bit(hvm_irq_dpci->mapping, NR_IRQS, i + 1) ) + { + pirq_guest_unbind(d, i); + kill_timer(&hvm_irq_dpci->hvm_timer[irq_to_vector(i)]); + + list_for_each_safe ( digl_list, tmp, + &hvm_irq_dpci->mirq[i].digl_list ) + { + digl 
= list_entry(digl_list, + struct dev_intx_gsi_link, list); + list_del(&digl->list); + xfree(digl); + } + } + + d->arch.hvm_domain.irq.dpci = NULL; + xfree(hvm_irq_dpci); + } + spin_unlock(&d->event_lock); +} + +void pci_release_devices(struct domain *d) +{ + struct pci_dev *pdev; + u8 bus, devfn; + + spin_lock(&pcidevs_lock); + pci_clean_dpci_irqs(d); + while ( (pdev = pci_get_pdev_by_domain(d, -1, -1)) ) + { + pci_cleanup_msi(pdev); + bus = pdev->bus; devfn = pdev->devfn; + deassign_device(d, bus, devfn); + } + spin_unlock(&pcidevs_lock); +} + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */ diff -Naurp xen/drivers/passthrough/vtd/dmar.c xen-redhat/drivers/passthrough/vtd/dmar.c --- xen/drivers/passthrough/vtd/dmar.c +++ xen-redhat/drivers/passthrough/vtd/dmar.c @@ -0,0 +1,548 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) Ashok Raj <ashok.raj@intel.com> + * Copyright (C) Shaohua Li <shaohua.li@intel.com> + * Copyright (C) Allen Kay <allen.m.kay@intel.com> - adapted to xen + */ + +#include <xen/init.h> +#include <xen/bitmap.h> +#include <xen/kernel.h> +#include <xen/acpi.h> +#include <xen/mm.h> +#include <xen/xmalloc.h> +#include <xen/pci.h> +#include <xen/pci_regs.h> +#include <asm/string.h> +#include "dmar.h" +#include "iommu.h" + +int vtd_enabled = 1; + +#undef PREFIX +#define PREFIX VTDPREFIX "ACPI DMAR:" +#define DEBUG + +#define MIN_SCOPE_LEN (sizeof(struct acpi_pci_path) + \ + sizeof(struct acpi_dev_scope)) + +LIST_HEAD(acpi_drhd_units); +LIST_HEAD(acpi_rmrr_units); +LIST_HEAD(acpi_atsr_units); + +u8 dmar_host_address_width; + +void dmar_scope_add_buses(struct dmar_scope *scope, u16 sec_bus, u16 sub_bus) +{ + sub_bus &= 0xff; + if (sec_bus > sub_bus) + return; + + while ( sec_bus <= sub_bus ) + set_bit(sec_bus++, scope->buses); +} + +void dmar_scope_remove_buses(struct dmar_scope *scope, u16 sec_bus, u16 sub_bus) +{ + sub_bus &= 0xff; + if (sec_bus > sub_bus) + return; + + while ( sec_bus <= sub_bus ) + clear_bit(sec_bus++, scope->buses); +} + +static int __init acpi_register_drhd_unit(struct acpi_drhd_unit *drhd) +{ + /* + * add INCLUDE_ALL at the tail, so scan the list will find it at + * the very end. 
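+     * acpi_find_matched_drhd_unit() prefers a unit whose device scope or
+     * bus range explicitly covers a device and only falls back to the
+     * INCLUDE_ALL unit when nothing more specific matches.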
+ */ + if ( drhd->include_all ) + list_add_tail(&drhd->list, &acpi_drhd_units); + else + list_add(&drhd->list, &acpi_drhd_units); + return 0; +} + +static int __init acpi_register_rmrr_unit(struct acpi_rmrr_unit *rmrr) +{ + list_add(&rmrr->list, &acpi_rmrr_units); + return 0; +} + +static void __init disable_all_dmar_units(void) +{ + struct acpi_drhd_unit *drhd, *_drhd; + struct acpi_rmrr_unit *rmrr, *_rmrr; + struct acpi_atsr_unit *atsr, *_atsr; + + list_for_each_entry_safe ( drhd, _drhd, &acpi_drhd_units, list ) + { + list_del(&drhd->list); + xfree(drhd); + } + list_for_each_entry_safe ( rmrr, _rmrr, &acpi_rmrr_units, list ) + { + list_del(&rmrr->list); + xfree(rmrr); + } + list_for_each_entry_safe ( atsr, _atsr, &acpi_atsr_units, list ) + { + list_del(&atsr->list); + xfree(atsr); + } +} + +static int acpi_ioapic_device_match( + struct list_head *ioapic_list, unsigned int apic_id) +{ + struct acpi_ioapic_unit *ioapic; + list_for_each_entry( ioapic, ioapic_list, list ) { + if (ioapic->apic_id == apic_id) + return 1; + } + return 0; +} + +struct acpi_drhd_unit * ioapic_to_drhd(unsigned int apic_id) +{ + struct acpi_drhd_unit *drhd; + list_for_each_entry( drhd, &acpi_drhd_units, list ) + if ( acpi_ioapic_device_match(&drhd->ioapic_list, apic_id) ) + return drhd; + return NULL; +} + +struct iommu * ioapic_to_iommu(unsigned int apic_id) +{ + struct acpi_drhd_unit *drhd; + + list_for_each_entry( drhd, &acpi_drhd_units, list ) + if ( acpi_ioapic_device_match(&drhd->ioapic_list, apic_id) ) + return drhd->iommu; + return NULL; +} + +static int __init acpi_register_atsr_unit(struct acpi_atsr_unit *atsr) +{ + /* + * add ALL_PORTS at the tail, so scan the list will find it at + * the very end. + */ + if ( atsr->all_ports ) + list_add_tail(&atsr->list, &acpi_atsr_units); + else + list_add(&atsr->list, &acpi_atsr_units); + return 0; +} + +struct acpi_drhd_unit * acpi_find_matched_drhd_unit(u8 bus, u8 devfn) +{ + struct acpi_drhd_unit *drhd; + struct acpi_drhd_unit *found = NULL, *include_all = NULL; + int i; + + list_for_each_entry ( drhd, &acpi_drhd_units, list ) + { + for (i = 0; i < drhd->scope.devices_cnt; i++) + if ( drhd->scope.devices[i] == PCI_BDF2(bus, devfn) ) + return drhd; + + if ( test_bit(bus, drhd->scope.buses) ) + found = drhd; + + if ( drhd->include_all ) + include_all = drhd; + } + + return found ? found : include_all; +} + +struct acpi_atsr_unit * acpi_find_matched_atsr_unit(u8 bus, u8 devfn) +{ + struct acpi_atsr_unit *atsr; + struct acpi_atsr_unit *found = NULL, *include_all = NULL; + int i; + + list_for_each_entry ( atsr, &acpi_atsr_units, list ) + { + for (i = 0; i < atsr->scope.devices_cnt; i++) + if ( atsr->scope.devices[i] == PCI_BDF2(bus, devfn) ) + return atsr; + + if ( test_bit(bus, atsr->scope.buses) ) + found = atsr; + + if ( atsr->all_ports ) + include_all = atsr; + } + + return found ? found : include_all; +} + +/* + * Count number of devices in device scope. Do not include PCI sub + * hierarchies. 
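+ * Only ACPI_DEV_ENDPOINT, ACPI_DEV_IOAPIC and ACPI_DEV_MSI_HPET entries
+ * are counted, since those are the ones later recorded as BDFs in
+ * scope->devices; bridge entries only contribute to the bus bitmap.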
+ */ +static int scope_device_count(void *start, void *end) +{ + struct acpi_dev_scope *scope; + int count = 0; + + while ( start < end ) + { + scope = start; + if ( (scope->length < MIN_SCOPE_LEN) || + (scope->dev_type >= ACPI_DEV_ENTRY_COUNT) ) + { + dprintk(XENLOG_WARNING VTDPREFIX, "Invalid device scope.\n"); + return -EINVAL; + } + + if ( scope->dev_type == ACPI_DEV_ENDPOINT || + scope->dev_type == ACPI_DEV_IOAPIC || + scope->dev_type == ACPI_DEV_MSI_HPET ) + count++; + + start += scope->length; + } + + return count; +} + + +static int __init acpi_parse_dev_scope(void *start, void *end, + void *acpi_entry, int type) +{ + struct dmar_scope *scope = acpi_entry; + struct acpi_ioapic_unit *acpi_ioapic_unit; + struct acpi_dev_scope *acpi_scope; + u16 bus, sub_bus, sec_bus; + struct acpi_pci_path *path; + int depth, cnt, didx = 0; + + if ( (cnt = scope_device_count(start, end)) < 0 ) + return cnt; + + scope->devices_cnt = cnt; + if ( cnt > 0 ) + { + scope->devices = xmalloc_array(u16, cnt); + if ( !scope->devices ) + return -ENOMEM; + memset(scope->devices, 0, sizeof(u16) * cnt); + } + + while ( start < end ) + { + acpi_scope = start; + path = (struct acpi_pci_path *)(acpi_scope + 1); + depth = (acpi_scope->length - sizeof(struct acpi_dev_scope)) + / sizeof(struct acpi_pci_path); + bus = acpi_scope->start_bus; + + while ( --depth > 0 ) + { + bus = pci_conf_read8(bus, path->dev, path->fn, PCI_SECONDARY_BUS); + path++; + } + + switch ( acpi_scope->dev_type ) + { + case ACPI_DEV_P2PBRIDGE: + sec_bus = pci_conf_read8( + bus, path->dev, path->fn, PCI_SECONDARY_BUS); + sub_bus = pci_conf_read8( + bus, path->dev, path->fn, PCI_SUBORDINATE_BUS); + dprintk(XENLOG_INFO VTDPREFIX, + "found bridge: bdf = %x:%x.%x sec = %x sub = %x\n", + bus, path->dev, path->fn, sec_bus, sub_bus); + + dmar_scope_add_buses(scope, sec_bus, sub_bus); + break; + + case ACPI_DEV_MSI_HPET: + dprintk(XENLOG_INFO VTDPREFIX, "found MSI HPET: bdf = %x:%x.%x\n", + bus, path->dev, path->fn); + scope->devices[didx++] = PCI_BDF(bus, path->dev, path->fn); + break; + + case ACPI_DEV_ENDPOINT: + dprintk(XENLOG_INFO VTDPREFIX, "found endpoint: bdf = %x:%x.%x\n", + bus, path->dev, path->fn); + scope->devices[didx++] = PCI_BDF(bus, path->dev, path->fn); + break; + + case ACPI_DEV_IOAPIC: + dprintk(XENLOG_INFO VTDPREFIX, "found IOAPIC: bdf = %x:%x.%x\n", + bus, path->dev, path->fn); + + if ( type == DMAR_TYPE ) + { + struct acpi_drhd_unit *drhd = acpi_entry; + acpi_ioapic_unit = xmalloc(struct acpi_ioapic_unit); + if ( !acpi_ioapic_unit ) + return -ENOMEM; + acpi_ioapic_unit->apic_id = acpi_scope->enum_id; + acpi_ioapic_unit->ioapic.bdf.bus = bus; + acpi_ioapic_unit->ioapic.bdf.dev = path->dev; + acpi_ioapic_unit->ioapic.bdf.func = path->fn; + list_add(&acpi_ioapic_unit->list, &drhd->ioapic_list); + } + + scope->devices[didx++] = PCI_BDF(bus, path->dev, path->fn); + break; + } + + start += acpi_scope->length; + } + + return 0; +} + +static int __init +acpi_parse_one_drhd(struct acpi_dmar_entry_header *header) +{ + struct acpi_table_drhd * drhd = (struct acpi_table_drhd *)header; + void *dev_scope_start, *dev_scope_end; + struct acpi_drhd_unit *dmaru; + int ret = 0; + static int include_all = 0; + + dmaru = xmalloc(struct acpi_drhd_unit); + if ( !dmaru ) + return -ENOMEM; + memset(dmaru, 0, sizeof(struct acpi_drhd_unit)); + + dmaru->address = drhd->address; + dmaru->include_all = drhd->flags & 1; /* BIT0: INCLUDE_ALL */ + INIT_LIST_HEAD(&dmaru->ioapic_list); + dprintk(XENLOG_INFO VTDPREFIX, "dmaru->address = %"PRIx64"\n", + dmaru->address); 
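+    /*
+     * The device scope entries follow immediately after the fixed DRHD
+     * header and extend up to header->length; acpi_parse_dev_scope()
+     * records endpoint/IOAPIC/HPET BDFs in dmaru->scope and adds any
+     * bridge secondary..subordinate bus ranges to the scope's bus bitmap.
+     */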
+ + dev_scope_start = (void *)(drhd + 1); + dev_scope_end = ((void *)drhd) + header->length; + ret = acpi_parse_dev_scope(dev_scope_start, dev_scope_end, + dmaru, DMAR_TYPE); + + if ( dmaru->include_all ) + { + dprintk(XENLOG_INFO VTDPREFIX, "found INCLUDE_ALL\n"); + /* Only allow one INCLUDE_ALL */ + if ( include_all ) + { + dprintk(XENLOG_WARNING VTDPREFIX, + "Only one INCLUDE_ALL device scope is allowed\n"); + ret = -EINVAL; + } + include_all = 1; + } + + if ( ret ) + xfree(dmaru); + else + acpi_register_drhd_unit(dmaru); + return ret; +} + +static int __init +acpi_parse_one_rmrr(struct acpi_dmar_entry_header *header) +{ + struct acpi_table_rmrr *rmrr = (struct acpi_table_rmrr *)header; + struct acpi_rmrr_unit *rmrru; + void *dev_scope_start, *dev_scope_end; + int ret = 0; + + if ( rmrr->base_address >= rmrr->end_address ) + { + dprintk(XENLOG_ERR VTDPREFIX, + "RMRR error: base_addr %"PRIx64" end_address %"PRIx64"\n", + rmrr->base_address, rmrr->end_address); + return -EFAULT; + } + + rmrru = xmalloc(struct acpi_rmrr_unit); + if ( !rmrru ) + return -ENOMEM; + memset(rmrru, 0, sizeof(struct acpi_rmrr_unit)); + + rmrru->base_address = rmrr->base_address; + rmrru->end_address = rmrr->end_address; + dev_scope_start = (void *)(rmrr + 1); + dev_scope_end = ((void *)rmrr) + header->length; + ret = acpi_parse_dev_scope(dev_scope_start, dev_scope_end, + rmrru, RMRR_TYPE); + + if ( ret || (rmrru->scope.devices_cnt == 0) ) + xfree(rmrru); + else + acpi_register_rmrr_unit(rmrru); + return ret; +} + +static int __init +acpi_parse_one_atsr(struct acpi_dmar_entry_header *header) +{ + struct acpi_table_atsr *atsr = (struct acpi_table_atsr *)header; + struct acpi_atsr_unit *atsru; + int ret = 0; + static int all_ports; + void *dev_scope_start, *dev_scope_end; + + atsru = xmalloc(struct acpi_atsr_unit); + if ( !atsru ) + return -ENOMEM; + memset(atsru, 0, sizeof(struct acpi_atsr_unit)); + + atsru->all_ports = atsr->flags & 1; /* BIT0: ALL_PORTS */ + if ( !atsru->all_ports ) + { + dev_scope_start = (void *)(atsr + 1); + dev_scope_end = ((void *)atsr) + header->length; + ret = acpi_parse_dev_scope(dev_scope_start, dev_scope_end, + atsru, ATSR_TYPE); + } + else + { + dprintk(XENLOG_INFO VTDPREFIX, "found ALL_PORTS\n"); + /* Only allow one ALL_PORTS */ + if ( all_ports ) + { + dprintk(XENLOG_WARNING VTDPREFIX, + "Only one ALL_PORTS device scope is allowed\n"); + ret = -EINVAL; + } + all_ports = 1; + } + + if ( ret ) + xfree(atsr); + else + acpi_register_atsr_unit(atsru); + return ret; +} + +static int __init acpi_parse_dmar(unsigned long phys_addr, + unsigned long size) +{ + struct acpi_table_dmar *dmar; + struct acpi_dmar_entry_header *entry_header; + int ret = 0; + + if ( !phys_addr || !size ) + return -EINVAL; + + dmar = (struct acpi_table_dmar *)__acpi_map_table(phys_addr, size); + if ( !dmar ) + { + dprintk(XENLOG_WARNING VTDPREFIX, "Unable to map DMAR\n"); + return -ENODEV; + } + + if ( !dmar->haw ) + { + dprintk(XENLOG_WARNING VTDPREFIX, "Zero: Invalid DMAR width\n"); + if ( force_iommu ) + panic("acpi_parse_dmar: Invalid DMAR width," + " crash Xen for security purpose!\n"); + return -EINVAL; + } + + dmar_host_address_width = dmar->haw + 1; + dprintk(XENLOG_INFO VTDPREFIX, "Host address width %d\n", + dmar_host_address_width); + + entry_header = (struct acpi_dmar_entry_header *)(dmar + 1); + while ( ((unsigned long)entry_header) < + (((unsigned long)dmar) + size) ) + { + switch ( entry_header->type ) + { + case ACPI_DMAR_DRHD: + dprintk(XENLOG_INFO VTDPREFIX, "found ACPI_DMAR_DRHD\n"); + ret = 
acpi_parse_one_drhd(entry_header); + break; + case ACPI_DMAR_RMRR: + dprintk(XENLOG_INFO VTDPREFIX, "found ACPI_DMAR_RMRR\n"); + ret = acpi_parse_one_rmrr(entry_header); + break; + case ACPI_DMAR_ATSR: + dprintk(XENLOG_INFO VTDPREFIX, "found ACPI_DMAR_ATSR\n"); + ret = acpi_parse_one_atsr(entry_header); + break; + default: + /* + * Don't panic on an unknown table, just print a warning + * and continue + */ + dprintk(XENLOG_WARNING VTDPREFIX, "Unknown DMAR structure type\n"); + break; + } + if ( ret ) + break; + + entry_header = ((void *)entry_header + entry_header->length); + } + + /* Zap APCI DMAR signature to prevent dom0 using vt-d HW. */ + dmar->header.signature[0] = '\0'; + + if ( ret ) + { + if ( force_iommu ) + panic("acpi_parse_dmar: Failed to parse ACPI DMAR," + " crash Xen for security purpose!\n"); + else + { + printk(XENLOG_WARNING + "Failed to parse ACPI DMAR. Disabling VT-d.\n"); + disable_all_dmar_units(); + } + } + + return ret; +} + +int acpi_dmar_init(void) +{ + int rc; + + rc = -ENODEV; + if ( force_iommu ) + iommu_enabled = 1; + + if ( !iommu_enabled ) + goto fail; + + acpi_table_parse(ACPI_DMAR, acpi_parse_dmar); + + if ( list_empty(&acpi_drhd_units) ) + goto fail; + + printk("Intel VT-d has been enabled\n"); + + return 0; + + fail: + if ( force_iommu ) + panic("acpi_dmar_init: acpi_dmar_init failed," + " crash Xen for security purpose!\n"); + + vtd_enabled = 0; + return rc; +} diff -Naurp xen/drivers/passthrough/vtd/dmar.h xen-redhat/drivers/passthrough/vtd/dmar.h --- xen/drivers/passthrough/vtd/dmar.h +++ xen-redhat/drivers/passthrough/vtd/dmar.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * + * Copyright (C) Ashok Raj <ashok.raj@intel.com> + * Copyright (C) Shaohua Li <shaohua.li@intel.com> + */ + +#ifndef _DMAR_H_ +#define _DMAR_H_ + +#include <xen/list.h> +#include <xen/iommu.h> + +extern u8 dmar_host_address_width; + +/* This one is for interrupt remapping */ +struct acpi_ioapic_unit { + struct list_head list; + int apic_id; + union { + u16 info; + struct { + u16 func: 3, + dev: 5, + bus: 8; + }bdf; + }ioapic; +}; + +struct dmar_scope { + DECLARE_BITMAP(buses, 256); /* buses owned by this unit */ + u16 *devices; /* devices owned by this unit */ + int devices_cnt; +}; + +struct acpi_drhd_unit { + struct dmar_scope scope; /* must be first member of struct */ + struct list_head list; + u64 address; /* register base address of the unit */ + u8 include_all:1; + struct iommu *iommu; + struct list_head ioapic_list; +}; + +struct acpi_rmrr_unit { + struct dmar_scope scope; /* must be first member of struct */ + struct list_head list; + u64 base_address; + u64 end_address; + u8 allow_all:1; +}; + +struct acpi_atsr_unit { + struct dmar_scope scope; /* must be first member of struct */ + struct list_head list; + u8 all_ports:1; +}; + + +#define for_each_drhd_unit(drhd) \ + list_for_each_entry(drhd, &acpi_drhd_units, list) + +#define for_each_rmrr_device(rmrr, bdf, idx) \ + list_for_each_entry(rmrr, &acpi_rmrr_units, list) \ + /* assume there never is a bdf == 0 */ \ + for (idx = 0; (bdf = rmrr->scope.devices[idx]) && \ + idx < rmrr->scope.devices_cnt; idx++) + +struct acpi_drhd_unit * acpi_find_matched_drhd_unit(u8 bus, u8 devfn); +struct acpi_atsr_unit * acpi_find_matched_atsr_unit(u8 bus, u8 devfn); +void dmar_scope_add_buses(struct dmar_scope *scope, u16 sec, u16 sub); +void dmar_scope_remove_buses(struct dmar_scope *scope, u16 sec, u16 sub); + +#define DMAR_TYPE 1 +#define RMRR_TYPE 2 +#define ATSR_TYPE 3 + +#define DMAR_OPERATION_TIMEOUT MILLISECS(1000) + +int vtd_hw_check(void); +void disable_pmr(struct iommu *iommu); +int is_usb_device(u8 bus, u8 devfn); + +#endif /* _DMAR_H_ */ diff -Naurp xen/drivers/passthrough/vtd/extern.h xen-redhat/drivers/passthrough/vtd/extern.h --- xen/drivers/passthrough/vtd/extern.h +++ xen-redhat/drivers/passthrough/vtd/extern.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * + * Copyright (C) Allen Kay <allen.m.kay@intel.com> + * Copyright (C) Weidong Han <weidong.han@intel.com> + */ + +#ifndef _VTD_EXTERN_H_ +#define _VTD_EXTERN_H_ + +#include "dmar.h" + +extern struct qi_ctrl *qi_ctrl; +extern struct ir_ctrl *ir_ctrl; + +void print_iommu_regs(struct acpi_drhd_unit *drhd); +void print_vtd_entries(struct iommu *iommu, int bus, int devfn, u64 gmfn); +void dump_iommu_info(unsigned char key); + +int qinval_setup(struct iommu *iommu); +int intremap_setup(struct iommu *iommu); +int queue_invalidate_context(struct iommu *iommu, + u16 did, u16 source_id, u8 function_mask, u8 granu); +int queue_invalidate_iotlb(struct iommu *iommu, + u8 granu, u8 dr, u8 dw, u16 did, u8 am, u8 ih, u64 addr); +int queue_invalidate_iec(struct iommu *iommu, + u8 granu, u8 im, u16 iidx); +int invalidate_sync(struct iommu *iommu); +int iommu_flush_iec_global(struct iommu *iommu); +int iommu_flush_iec_index(struct iommu *iommu, u8 im, u16 iidx); +struct iommu * ioapic_to_iommu(unsigned int apic_id); +struct acpi_drhd_unit * ioapic_to_drhd(unsigned int apic_id); +void clear_fault_bits(struct iommu *iommu); + +#endif // _VTD_EXTERN_H_ diff -Naurp xen/drivers/passthrough/vtd/intremap.c xen-redhat/drivers/passthrough/vtd/intremap.c --- xen/drivers/passthrough/vtd/intremap.c +++ xen-redhat/drivers/passthrough/vtd/intremap.c @@ -0,0 +1,703 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) Allen Kay <allen.m.kay@intel.com> + * Copyright (C) Xiaohui Xin <xiaohui.xin@intel.com> + */ + +#include <xen/irq.h> +#include <xen/sched.h> +#include <xen/iommu.h> +#include <asm/hvm/iommu.h> +#include <xen/time.h> +#include <xen/list.h> +#include <xen/pci.h> +#include <xen/pci_regs.h> +#include "iommu.h" +#include "dmar.h" +#include "vtd.h" +#include "extern.h" + +/* The max number of IOAPIC (or IOSAPIC) pin. The typical values can be 24 or + * 48 on x86 and Itanium platforms. Here we use a biger number 256. This + * should be big enough. Actually now IREMAP_ENTRY_NR is also 256. 
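+ * Each (IO-APIC id, pin) pair that gets remapped is remembered in the
+ * per-pin lists below, so reprogramming a pin reuses the IRTE index it
+ * was first assigned instead of allocating a fresh entry every time.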
+ */ +#define MAX_IOAPIC_PIN_NUM 256 + +struct ioapicid_pin_intremap_index { + struct list_head list; + unsigned int ioapic_id; + unsigned int pin; + int intremap_index; +}; + +static struct list_head ioapic_pin_to_intremap_index[MAX_IOAPIC_PIN_NUM]; + +static int init_ioapic_pin_intremap_index(void) +{ + static int initialized = 0; + int i; + + if ( initialized == 1 ) + return 0; + + for ( i = 0; i < MAX_IOAPIC_PIN_NUM; i++ ) + INIT_LIST_HEAD(&ioapic_pin_to_intremap_index[i]); + + initialized = 1; + return 0; +} + +static int get_ioapic_pin_intremap_index(unsigned int ioapic_id, + unsigned int pin) +{ + struct ioapicid_pin_intremap_index *entry; + struct list_head *pos, *tmp; + + list_for_each_safe ( pos, tmp, &ioapic_pin_to_intremap_index[pin] ) + { + entry = list_entry(pos, struct ioapicid_pin_intremap_index, list); + if ( entry->ioapic_id == ioapic_id ) + return entry->intremap_index; + } + + return -1; +} + +static int set_ioapic_pin_intremap_index(unsigned int ioapic_id, + unsigned int pin, + int index) +{ + struct ioapicid_pin_intremap_index *entry; + + entry = xmalloc(struct ioapicid_pin_intremap_index); + if ( !entry ) + return -ENOMEM; + + entry->ioapic_id = ioapic_id; + entry->pin = pin; + entry->intremap_index = index; + + list_add_tail(&entry->list, &ioapic_pin_to_intremap_index[pin]); + + return 0; +} + +u16 apicid_to_bdf(int apic_id) +{ + struct acpi_drhd_unit *drhd = ioapic_to_drhd(apic_id); + struct acpi_ioapic_unit *acpi_ioapic_unit; + + list_for_each_entry ( acpi_ioapic_unit, &drhd->ioapic_list, list ) + if ( acpi_ioapic_unit->apic_id == apic_id ) + return acpi_ioapic_unit->ioapic.info; + + dprintk(XENLOG_ERR VTDPREFIX, "Didn't find the bdf for the apic_id!\n"); + return 0; +} + +/* Mark specified intr remap entry as free */ +static void free_remap_entry(struct iommu *iommu, int index) +{ + struct iremap_entry *iremap_entry = NULL, *iremap_entries; + struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); + + if ( index < 0 || index > IREMAP_ENTRY_NR - 1 ) + return; + + ASSERT( spin_is_locked(&ir_ctrl->iremap_lock) ); + + GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, index, + iremap_entries, iremap_entry); + + memset(iremap_entry, 0, sizeof(struct iremap_entry)); + iommu_flush_cache_entry(iremap_entry); + iommu_flush_iec_index(iommu, 0, index); + + unmap_vtd_domain_page(iremap_entries); + ir_ctrl->iremap_num--; +} + +/* + * Look for a free intr remap entry. + * Need hold iremap_lock, and setup returned entry before releasing lock. 
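+ * An entry is free when both of its halves are zero.  If the whole table
+ * is already in use the scan runs off the end and IREMAP_ENTRY_NR is
+ * returned, which callers must treat as an allocation failure.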
+ */ +static int alloc_remap_entry(struct iommu *iommu) +{ + struct iremap_entry *iremap_entries = NULL; + struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); + int i; + + ASSERT( spin_is_locked(&ir_ctrl->iremap_lock) ); + + for ( i = 0; i < IREMAP_ENTRY_NR; i++ ) + { + struct iremap_entry *p; + if ( i % (1 << IREMAP_ENTRY_ORDER) == 0 ) + { + /* This entry across page boundry */ + if ( iremap_entries ) + unmap_vtd_domain_page(iremap_entries); + + GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, i, + iremap_entries, p); + } + else + p = &iremap_entries[i % (1 << IREMAP_ENTRY_ORDER)]; + + if ( p->lo_val == 0 && p->hi_val == 0 ) /* a free entry */ + break; + } + + if ( iremap_entries ) + unmap_vtd_domain_page(iremap_entries); + + if ( i < IREMAP_ENTRY_NR ) + ir_ctrl->iremap_num++; + return i; +} + +static int remap_entry_to_ioapic_rte( + struct iommu *iommu, int index, struct IO_xAPIC_route_entry *old_rte) +{ + struct iremap_entry *iremap_entry = NULL, *iremap_entries; + unsigned long flags; + struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); + + if ( ir_ctrl == NULL ) + { + dprintk(XENLOG_ERR VTDPREFIX, + "remap_entry_to_ioapic_rte: ir_ctl is not ready\n"); + return -EFAULT; + } + + if ( index < 0 || index > IREMAP_ENTRY_NR - 1 ) + { + dprintk(XENLOG_ERR VTDPREFIX, + "%s: index (%d) for remap table is invalid !\n", + __func__, index); + return -EFAULT; + } + + spin_lock_irqsave(&ir_ctrl->iremap_lock, flags); + + GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, index, + iremap_entries, iremap_entry); + + if ( iremap_entry->hi_val == 0 && iremap_entry->lo_val == 0 ) + { + dprintk(XENLOG_ERR VTDPREFIX, + "%s: index (%d) get an empty entry!\n", + __func__, index); + unmap_vtd_domain_page(iremap_entries); + spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); + return -EFAULT; + } + + old_rte->vector = iremap_entry->lo.vector; + old_rte->delivery_mode = iremap_entry->lo.dlm; + old_rte->dest_mode = iremap_entry->lo.dm; + old_rte->trigger = iremap_entry->lo.tm; + old_rte->__reserved_2 = 0; + old_rte->dest.logical.__reserved_1 = 0; + old_rte->dest.logical.logical_dest = iremap_entry->lo.dst >> 8; + + unmap_vtd_domain_page(iremap_entries); + spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); + return 0; +} + +static int ioapic_rte_to_remap_entry(struct iommu *iommu, + int apic_id, unsigned int ioapic_pin, struct IO_xAPIC_route_entry *old_rte, + unsigned int rte_upper, unsigned int value) +{ + struct iremap_entry *iremap_entry = NULL, *iremap_entries; + struct iremap_entry new_ire; + struct IO_APIC_route_remap_entry *remap_rte; + struct IO_xAPIC_route_entry new_rte; + int index; + unsigned long flags; + struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); + + remap_rte = (struct IO_APIC_route_remap_entry *) old_rte; + spin_lock_irqsave(&ir_ctrl->iremap_lock, flags); + + index = get_ioapic_pin_intremap_index(apic_id, ioapic_pin); + if ( index < 0 ) + { + index = alloc_remap_entry(iommu); + if ( index < IREMAP_ENTRY_NR ) + set_ioapic_pin_intremap_index(apic_id, ioapic_pin, index); + } + + if ( index > IREMAP_ENTRY_NR - 1 ) + { + dprintk(XENLOG_ERR VTDPREFIX, + "%s: intremap index (%d) is larger than" + " the maximum index (%d)!\n", + __func__, index, IREMAP_ENTRY_NR - 1); + spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); + return -EFAULT; + } + + GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, index, + iremap_entries, iremap_entry); + + memcpy(&new_ire, iremap_entry, sizeof(struct iremap_entry)); + + if ( rte_upper ) + { +#if defined(__i386__) || defined(__x86_64__) + new_ire.lo.dst = (value >> 24) << 8; +#else /* __ia64__ */ + 
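+        /* ia64: the destination id occupies the top 16 bits of the word. */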
new_ire.lo.dst = value >> 16; +#endif + } + else + { + *(((u32 *)&new_rte) + 0) = value; + new_ire.lo.fpd = 0; + new_ire.lo.dm = new_rte.dest_mode; + new_ire.lo.rh = 0; + new_ire.lo.tm = new_rte.trigger; + new_ire.lo.dlm = new_rte.delivery_mode; + new_ire.lo.avail = 0; + new_ire.lo.res_1 = 0; + new_ire.lo.vector = new_rte.vector; + new_ire.lo.res_2 = 0; + new_ire.hi.sid = apicid_to_bdf(apic_id); + + new_ire.hi.sq = 0; /* comparing all 16-bit of SID */ + new_ire.hi.svt = 1; /* requestor ID verification SID/SQ */ + new_ire.hi.res_1 = 0; + new_ire.lo.p = 1; /* finally, set present bit */ + + /* now construct new ioapic rte entry */ + remap_rte->vector = new_rte.vector; + remap_rte->delivery_mode = 0; /* has to be 0 for remap format */ + remap_rte->index_15 = (index >> 15) & 0x1; + remap_rte->index_0_14 = index & 0x7fff; + + remap_rte->delivery_status = new_rte.delivery_status; + remap_rte->polarity = new_rte.polarity; + remap_rte->irr = new_rte.irr; + remap_rte->trigger = new_rte.trigger; + remap_rte->mask = new_rte.mask; + remap_rte->reserved = 0; + remap_rte->format = 1; /* indicate remap format */ + } + + memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry)); + iommu_flush_cache_entry(iremap_entry); + iommu_flush_iec_index(iommu, 0, index); + invalidate_sync(iommu); + + unmap_vtd_domain_page(iremap_entries); + spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); + return 0; +} + +unsigned int io_apic_read_remap_rte( + unsigned int apic, unsigned int reg) +{ + struct IO_xAPIC_route_entry old_rte = { 0 }; + struct IO_APIC_route_remap_entry *remap_rte; + int rte_upper = (reg & 1) ? 1 : 0; + struct iommu *iommu = ioapic_to_iommu(IO_APIC_ID(apic)); + struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); + unsigned int ioapic_pin = (reg - 0x10) / 2; + int index; + + if ( !iommu || !ir_ctrl || ir_ctrl->iremap_maddr == 0 || + (ir_ctrl->iremap_num == 0) ) + { + *IO_APIC_BASE(apic) = reg; + return *(IO_APIC_BASE(apic)+4); + } + + index = get_ioapic_pin_intremap_index(IO_APIC_ID(apic), ioapic_pin); + if ( index < 0 ) + { + *IO_APIC_BASE(apic) = reg; + return *(IO_APIC_BASE(apic)+4); + } + + if ( rte_upper ) + reg--; + + /* read lower and upper 32-bits of rte entry */ + *IO_APIC_BASE(apic) = reg; + *(((u32 *)&old_rte) + 0) = *(IO_APIC_BASE(apic)+4); + *IO_APIC_BASE(apic) = reg + 1; + *(((u32 *)&old_rte) + 1) = *(IO_APIC_BASE(apic)+4); + + remap_rte = (struct IO_APIC_route_remap_entry *) &old_rte; + + if ( remap_entry_to_ioapic_rte(iommu, index, &old_rte) ) + { + *IO_APIC_BASE(apic) = rte_upper ? (reg + 1) : reg; + return *(IO_APIC_BASE(apic)+4); + } + + if ( rte_upper ) + return (*(((u32 *)&old_rte) + 1)); + else + return (*(((u32 *)&old_rte) + 0)); +} + +void io_apic_write_remap_rte( + unsigned int apic, unsigned int reg, unsigned int value) +{ + unsigned int ioapic_pin = (reg - 0x10) / 2; + struct IO_xAPIC_route_entry old_rte = { 0 }; + struct IO_APIC_route_remap_entry *remap_rte; + unsigned int rte_upper = (reg & 1) ? 
1 : 0; + struct iommu *iommu = ioapic_to_iommu(IO_APIC_ID(apic)); + struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); + int saved_mask; + + if ( !iommu || !ir_ctrl || ir_ctrl->iremap_maddr == 0 ) + { + *IO_APIC_BASE(apic) = reg; + *(IO_APIC_BASE(apic)+4) = value; + return; + } + + if ( rte_upper ) + reg--; + + /* read both lower and upper 32-bits of rte entry */ + *IO_APIC_BASE(apic) = reg; + *(((u32 *)&old_rte) + 0) = *(IO_APIC_BASE(apic)+4); + *IO_APIC_BASE(apic) = reg + 1; + *(((u32 *)&old_rte) + 1) = *(IO_APIC_BASE(apic)+4); + + remap_rte = (struct IO_APIC_route_remap_entry *) &old_rte; + + /* mask the interrupt while we change the intremap table */ + saved_mask = remap_rte->mask; + remap_rte->mask = 1; + *IO_APIC_BASE(apic) = reg; + *(IO_APIC_BASE(apic)+4) = *(((int *)&old_rte)+0); + remap_rte->mask = saved_mask; + + ASSERT(ioapic_pin < MAX_IOAPIC_PIN_NUM); + if ( ioapic_rte_to_remap_entry(iommu, IO_APIC_ID(apic), ioapic_pin, + &old_rte, rte_upper, value) ) + { + *IO_APIC_BASE(apic) = rte_upper ? (reg + 1) : reg; + *(IO_APIC_BASE(apic)+4) = value; + return; + } + + /* write new entry to ioapic */ + *IO_APIC_BASE(apic) = reg; + *(IO_APIC_BASE(apic)+4) = *(((u32 *)&old_rte)+0); + *IO_APIC_BASE(apic) = reg + 1; + *(IO_APIC_BASE(apic)+4) = *(((u32 *)&old_rte)+1); +} + +#if defined(__i386__) || defined(__x86_64__) +static int remap_entry_to_msi_msg( + struct iommu *iommu, struct msi_msg *msg) +{ + struct iremap_entry *iremap_entry = NULL, *iremap_entries; + struct msi_msg_remap_entry *remap_rte; + int index; + unsigned long flags; + struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); + + if ( ir_ctrl == NULL ) + { + dprintk(XENLOG_ERR VTDPREFIX, + "remap_entry_to_msi_msg: ir_ctl == NULL"); + return -EFAULT; + } + + remap_rte = (struct msi_msg_remap_entry *) msg; + index = (remap_rte->address_lo.index_15 << 15) | + remap_rte->address_lo.index_0_14; + + if ( index < 0 || index > IREMAP_ENTRY_NR - 1 ) + { + dprintk(XENLOG_ERR VTDPREFIX, + "%s: index (%d) for remap table is invalid !\n", + __func__, index); + return -EFAULT; + } + + spin_lock_irqsave(&ir_ctrl->iremap_lock, flags); + + GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, index, + iremap_entries, iremap_entry); + + if ( iremap_entry->hi_val == 0 && iremap_entry->lo_val == 0 ) + { + dprintk(XENLOG_ERR VTDPREFIX, + "%s: index (%d) get an empty entry!\n", + __func__, index); + unmap_vtd_domain_page(iremap_entries); + spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); + return -EFAULT; + } + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = + MSI_ADDR_BASE_LO | + ((iremap_entry->lo.dm == 0) ? + MSI_ADDR_DESTMODE_PHYS: + MSI_ADDR_DESTMODE_LOGIC) | + ((iremap_entry->lo.dlm != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | + iremap_entry->lo.dst >> 8; + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((iremap_entry->lo.dlm != dest_LowestPrio) ? 
+ MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | + iremap_entry->lo.vector; + + unmap_vtd_domain_page(iremap_entries); + spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); + return 0; +} + +static int msi_msg_to_remap_entry( + struct iommu *iommu, struct pci_dev *pdev, + struct msi_desc *msi_desc, struct msi_msg *msg) +{ + struct iremap_entry *iremap_entry = NULL, *iremap_entries; + struct iremap_entry new_ire; + struct msi_msg_remap_entry *remap_rte; + unsigned int index; + unsigned long flags; + struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); + + remap_rte = (struct msi_msg_remap_entry *) msg; + spin_lock_irqsave(&ir_ctrl->iremap_lock, flags); + + if ( msg == NULL ) + { + /* Free specified unused IRTE */ + free_remap_entry(iommu, msi_desc->remap_index); + spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); + return 0; + } + + if ( msi_desc->remap_index < 0 ) + { + /* + * TODO: Multiple-vector MSI requires allocating multiple continuous + * entries and configuring addr/data of msi_msg in different way. So + * alloca_remap_entry will be changed if enabling multiple-vector MSI + * in future. + */ + index = alloc_remap_entry(iommu); + msi_desc->remap_index = index; + } + else + index = msi_desc->remap_index; + + if ( index > IREMAP_ENTRY_NR - 1 ) + { + dprintk(XENLOG_ERR VTDPREFIX, + "%s: intremap index (%d) is larger than" + " the maximum index (%d)!\n", + __func__, index, IREMAP_ENTRY_NR - 1); + msi_desc->remap_index = -1; + spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); + return -EFAULT; + } + + GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, index, + iremap_entries, iremap_entry); + + memcpy(&new_ire, iremap_entry, sizeof(struct iremap_entry)); + + /* Set interrupt remapping table entry */ + new_ire.lo.fpd = 0; + new_ire.lo.dm = (msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT) & 0x1; + new_ire.lo.rh = 0; + new_ire.lo.tm = (msg->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1; + new_ire.lo.dlm = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x1; + new_ire.lo.avail = 0; + new_ire.lo.res_1 = 0; + new_ire.lo.vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) & + MSI_DATA_VECTOR_MASK; + new_ire.lo.res_2 = 0; + new_ire.lo.dst = ((msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT) + & 0xff) << 8; + + new_ire.hi.sid = (pdev->bus << 8) | pdev->devfn; + new_ire.hi.sq = 0; + new_ire.hi.svt = 1; + new_ire.hi.res_1 = 0; + new_ire.lo.p = 1; /* finally, set present bit */ + + /* now construct new MSI/MSI-X rte entry */ + remap_rte->address_lo.dontcare = 0; + remap_rte->address_lo.index_15 = (index >> 15) & 0x1; + remap_rte->address_lo.index_0_14 = index & 0x7fff; + remap_rte->address_lo.SHV = 1; + remap_rte->address_lo.format = 1; + + remap_rte->address_hi = 0; + remap_rte->data = 0; + + memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry)); + iommu_flush_cache_entry(iremap_entry); + iommu_flush_iec_index(iommu, 0, index); + invalidate_sync(iommu); + + unmap_vtd_domain_page(iremap_entries); + spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); + return 0; +} + +void msi_msg_read_remap_rte( + struct msi_desc *msi_desc, struct msi_msg *msg) +{ + struct pci_dev *pdev = msi_desc->dev; + struct acpi_drhd_unit *drhd = NULL; + struct iommu *iommu = NULL; + struct ir_ctrl *ir_ctrl; + + drhd = acpi_find_matched_drhd_unit(pdev->bus, pdev->devfn); + if (!drhd) + return; + iommu = drhd->iommu; + + ir_ctrl = iommu_ir_ctrl(iommu); + if ( !iommu || !ir_ctrl || ir_ctrl->iremap_maddr == 0 ) + return; + + remap_entry_to_msi_msg(iommu, msg); +} + +void msi_msg_write_remap_rte( + struct msi_desc *msi_desc, struct msi_msg *msg) 
+{ + struct pci_dev *pdev = msi_desc->dev; + struct acpi_drhd_unit *drhd = NULL; + struct iommu *iommu = NULL; + struct ir_ctrl *ir_ctrl; + + drhd = acpi_find_matched_drhd_unit(pdev->bus, pdev->devfn); + if (!drhd) + return; + iommu = drhd->iommu; + + ir_ctrl = iommu_ir_ctrl(iommu); + if ( !iommu || !ir_ctrl || ir_ctrl->iremap_maddr == 0 ) + return; + + msi_msg_to_remap_entry(iommu, pdev, msi_desc, msg); +} +#elif defined(__ia64__) +void msi_msg_read_remap_rte( + struct msi_desc *msi_desc, struct msi_msg *msg) +{ + /* TODO. */ +} + +void msi_msg_write_remap_rte( + struct msi_desc *msi_desc, struct msi_msg *msg) +{ + /* TODO. */ +} +#endif + +int intremap_setup(struct iommu *iommu) +{ + struct ir_ctrl *ir_ctrl; + s_time_t start_time; + + if ( !ecap_intr_remap(iommu->ecap) ) + return -ENODEV; + + ir_ctrl = iommu_ir_ctrl(iommu); + if ( ir_ctrl->iremap_maddr == 0 ) + { + ir_ctrl->iremap_maddr = alloc_pgtable_maddr(NULL); + if ( ir_ctrl->iremap_maddr == 0 ) + { + dprintk(XENLOG_WARNING VTDPREFIX, + "Cannot allocate memory for ir_ctrl->iremap_maddr\n"); + return -ENOMEM; + } + ir_ctrl->iremap_num = 0; + } + +#if defined(ENABLED_EXTENDED_INTERRUPT_SUPPORT) + /* set extended interrupt mode bit */ + ir_ctrl->iremap_maddr |= + ecap_ext_intr(iommu->ecap) ? (1 << IRTA_REG_EIME_SHIFT) : 0; +#endif + /* set size of the interrupt remapping table */ + ir_ctrl->iremap_maddr |= IRTA_REG_TABLE_SIZE; + dmar_writeq(iommu->reg, DMAR_IRTA_REG, ir_ctrl->iremap_maddr); + + /* set SIRTP */ + iommu->gcmd |= DMA_GCMD_SIRTP; + dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd); + + /* Make sure hardware complete it */ + start_time = NOW(); + while ( !(dmar_readl(iommu->reg, DMAR_GSTS_REG) & DMA_GSTS_SIRTPS) ) + { + if ( NOW() > (start_time + DMAR_OPERATION_TIMEOUT) ) + panic("Cannot set SIRTP field for interrupt remapping\n"); + cpu_relax(); + } + + /* enable comaptiblity format interrupt pass through */ + iommu->gcmd |= DMA_GCMD_CFI; + dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd); + + start_time = NOW(); + while ( !(dmar_readl(iommu->reg, DMAR_GSTS_REG) & DMA_GSTS_CFIS) ) + { + if ( NOW() > (start_time + DMAR_OPERATION_TIMEOUT) ) + panic("Cannot set CFI field for interrupt remapping\n"); + cpu_relax(); + } + + /* enable interrupt remapping hardware */ + iommu->gcmd |= DMA_GCMD_IRE; + dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd); + + start_time = NOW(); + while ( !(dmar_readl(iommu->reg, DMAR_GSTS_REG) & DMA_GSTS_IRES) ) + { + if ( NOW() > (start_time + DMAR_OPERATION_TIMEOUT) ) + { + dprintk(XENLOG_ERR VTDPREFIX, + "Cannot set IRE field for interrupt remapping\n"); + return -ENODEV; + } + cpu_relax(); + } + + /* After set SIRTP, we should do globally invalidate the IEC */ + iommu_flush_iec_global(iommu); + + init_ioapic_pin_intremap_index(); + + return 0; +} diff -Naurp xen/drivers/passthrough/vtd/iommu.c xen-redhat/drivers/passthrough/vtd/iommu.c --- xen/drivers/passthrough/vtd/iommu.c +++ xen-redhat/drivers/passthrough/vtd/iommu.c @@ -0,0 +1,1964 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) Ashok Raj <ashok.raj@intel.com> + * Copyright (C) Shaohua Li <shaohua.li@intel.com> + * Copyright (C) Allen Kay <allen.m.kay@intel.com> - adapted to xen + */ + +#include <xen/irq.h> +#include <xen/sched.h> +#include <xen/xmalloc.h> +#include <xen/domain_page.h> +#include <xen/iommu.h> +#include <asm/hvm/iommu.h> +#include <xen/numa.h> +#include <xen/time.h> +#include <xen/pci.h> +#include <xen/pci_regs.h> +#include <xen/keyhandler.h> +#include "iommu.h" +#include "dmar.h" +#include "extern.h" +#include "vtd.h" + +#define domain_iommu_domid(d) ((d)->arch.hvm_domain.hvm_iommu.iommu_domid) + +static spinlock_t domid_bitmap_lock; /* protect domain id bitmap */ +static int domid_bitmap_size; /* domain id bitmap size in bits */ +static unsigned long *domid_bitmap; /* iommu domain id bitmap */ +static int rwbf_quirk = 0; + +static void setup_dom0_devices(struct domain *d); +static void setup_dom0_rmrr(struct domain *d); + +#define DID_FIELD_WIDTH 16 +#define DID_HIGH_OFFSET 8 +static void context_set_domain_id(struct context_entry *context, + struct domain *d) +{ + domid_t iommu_domid = domain_iommu_domid(d); + + if ( iommu_domid == 0 ) + { + spin_lock(&domid_bitmap_lock); + iommu_domid = find_first_zero_bit(domid_bitmap, domid_bitmap_size); + set_bit(iommu_domid, domid_bitmap); + spin_unlock(&domid_bitmap_lock); + d->arch.hvm_domain.hvm_iommu.iommu_domid = iommu_domid; + } + + context->hi &= (1 << DID_HIGH_OFFSET) - 1; + context->hi |= iommu_domid << DID_HIGH_OFFSET; +} + +static void iommu_domid_release(struct domain *d) +{ + domid_t iommu_domid = domain_iommu_domid(d); + + if ( iommu_domid != 0 ) + { + d->arch.hvm_domain.hvm_iommu.iommu_domid = 0; + clear_bit(iommu_domid, domid_bitmap); + } +} + +static struct intel_iommu *alloc_intel_iommu(void) +{ + struct intel_iommu *intel; + + intel = xmalloc(struct intel_iommu); + if ( intel == NULL ) + return NULL; + memset(intel, 0, sizeof(struct intel_iommu)); + + spin_lock_init(&intel->qi_ctrl.qinval_lock); + spin_lock_init(&intel->qi_ctrl.qinval_poll_lock); + spin_lock_init(&intel->ir_ctrl.iremap_lock); + + return intel; +} + +static void free_intel_iommu(struct intel_iommu *intel) +{ + xfree(intel); +} + +struct qi_ctrl *iommu_qi_ctrl(struct iommu *iommu) +{ + return iommu ? &iommu->intel->qi_ctrl : NULL; +} + +struct ir_ctrl *iommu_ir_ctrl(struct iommu *iommu) +{ + return iommu ? &iommu->intel->ir_ctrl : NULL; +} + +struct iommu_flush *iommu_get_flush(struct iommu *iommu) +{ + return iommu ? 
&iommu->intel->flush : NULL; +} + +static unsigned int clflush_size; +static int iommus_incoherent; +static void __iommu_flush_cache(void *addr, int size) +{ + int i; + + if ( !iommus_incoherent ) + return; + + for ( i = 0; i < size; i += clflush_size ) + cacheline_flush((char *)addr + i); +} + +void iommu_flush_cache_entry(void *addr) +{ + __iommu_flush_cache(addr, 8); +} + +void iommu_flush_cache_page(void *addr) +{ + __iommu_flush_cache(addr, PAGE_SIZE_4K); +} + +int nr_iommus; +/* context entry handling */ +static u64 bus_to_context_maddr(struct iommu *iommu, u8 bus) +{ + struct root_entry *root, *root_entries; + u64 maddr; + + ASSERT(spin_is_locked(&iommu->lock)); + root_entries = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr); + root = &root_entries[bus]; + if ( !root_present(*root) ) + { + maddr = alloc_pgtable_maddr(NULL); + if ( maddr == 0 ) + { + unmap_vtd_domain_page(root_entries); + return 0; + } + set_root_value(*root, maddr); + set_root_present(*root); + iommu_flush_cache_entry(root); + } + maddr = (u64) get_context_addr(*root); + unmap_vtd_domain_page(root_entries); + return maddr; +} + +static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc) +{ + struct hvm_iommu *hd = domain_hvm_iommu(domain); + int addr_width = agaw_to_width(hd->agaw); + struct dma_pte *parent, *pte = NULL; + int level = agaw_to_level(hd->agaw); + int offset; + u64 pte_maddr = 0, maddr; + u64 *vaddr = NULL; + + addr &= (((u64)1) << addr_width) - 1; + ASSERT(spin_is_locked(&hd->mapping_lock)); + if ( hd->pgd_maddr == 0 ) + if ( !alloc || ((hd->pgd_maddr = alloc_pgtable_maddr(domain)) == 0) ) + goto out; + + parent = (struct dma_pte *)map_vtd_domain_page(hd->pgd_maddr); + while ( level > 1 ) + { + offset = address_level_offset(addr, level); + pte = &parent[offset]; + + if ( dma_pte_addr(*pte) == 0 ) + { + if ( !alloc ) + break; + maddr = alloc_pgtable_maddr(domain); + if ( !maddr ) + break; + dma_set_pte_addr(*pte, maddr); + vaddr = map_vtd_domain_page(maddr); + + /* + * high level table always sets r/w, last level + * page table control read/write + */ + dma_set_pte_readable(*pte); + dma_set_pte_writable(*pte); + iommu_flush_cache_entry(pte); + } + else + { + vaddr = map_vtd_domain_page(pte->val); + } + + if ( level == 2 ) + { + pte_maddr = pte->val & PAGE_MASK_4K; + unmap_vtd_domain_page(vaddr); + break; + } + + unmap_vtd_domain_page(parent); + parent = (struct dma_pte *)vaddr; + vaddr = NULL; + level--; + } + + unmap_vtd_domain_page(parent); + out: + return pte_maddr; +} + +static void iommu_flush_write_buffer(struct iommu *iommu) +{ + u32 val; + unsigned long flag; + s_time_t start_time; + + if ( !rwbf_quirk && !cap_rwbf(iommu->cap) ) + return; + val = iommu->gcmd | DMA_GCMD_WBF; + + spin_lock_irqsave(&iommu->register_lock, flag); + dmar_writel(iommu->reg, DMAR_GCMD_REG, val); + + /* Make sure hardware complete it */ + start_time = NOW(); + for ( ; ; ) + { + val = dmar_readl(iommu->reg, DMAR_GSTS_REG); + if ( !(val & DMA_GSTS_WBFS) ) + break; + if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT ) + panic("%s: DMAR hardware is malfunctional," + " please disable IOMMU\n", __func__); + cpu_relax(); + } + spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +/* return value determine if we need a write buffer flush */ +static int flush_context_reg( + void *_iommu, + u16 did, u16 source_id, u8 function_mask, u64 type, + int non_present_entry_flush) +{ + struct iommu *iommu = (struct iommu *) _iommu; + u64 val = 0; + unsigned long flag; + s_time_t start_time; + + /* + * 
In the non-present entry flush case, if hardware doesn't cache + * non-present entry we do nothing and if hardware cache non-present + * entry, we flush entries of domain 0 (the domain id is used to cache + * any non-present entries) + */ + if ( non_present_entry_flush ) + { + if ( !cap_caching_mode(iommu->cap) ) + return 1; + else + did = 0; + } + + /* use register invalidation */ + switch ( type ) + { + case DMA_CCMD_GLOBAL_INVL: + val = DMA_CCMD_GLOBAL_INVL; + break; + case DMA_CCMD_DOMAIN_INVL: + val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); + break; + case DMA_CCMD_DEVICE_INVL: + val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) + |DMA_CCMD_SID(source_id)|DMA_CCMD_FM(function_mask); + break; + default: + BUG(); + } + val |= DMA_CCMD_ICC; + + spin_lock_irqsave(&iommu->register_lock, flag); + dmar_writeq(iommu->reg, DMAR_CCMD_REG, val); + + /* Make sure hardware complete it */ + start_time = NOW(); + for ( ; ; ) + { + val = dmar_readq(iommu->reg, DMAR_CCMD_REG); + if ( !(val & DMA_CCMD_ICC) ) + break; + if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT ) + panic("%s: DMAR hardware is malfunctional," + " please disable IOMMU\n", __func__); + cpu_relax(); + } + spin_unlock_irqrestore(&iommu->register_lock, flag); + /* flush context entry will implicitly flush write buffer */ + return 0; +} + +static int inline iommu_flush_context_global( + struct iommu *iommu, int non_present_entry_flush) +{ + struct iommu_flush *flush = iommu_get_flush(iommu); + return flush->context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL, + non_present_entry_flush); +} + +static int inline iommu_flush_context_domain( + struct iommu *iommu, u16 did, int non_present_entry_flush) +{ + struct iommu_flush *flush = iommu_get_flush(iommu); + return flush->context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL, + non_present_entry_flush); +} + +static int inline iommu_flush_context_device( + struct iommu *iommu, u16 did, u16 source_id, + u8 function_mask, int non_present_entry_flush) +{ + struct iommu_flush *flush = iommu_get_flush(iommu); + return flush->context(iommu, did, source_id, function_mask, + DMA_CCMD_DEVICE_INVL, + non_present_entry_flush); +} + +/* return value determine if we need a write buffer flush */ +static int flush_iotlb_reg(void *_iommu, u16 did, + u64 addr, unsigned int size_order, u64 type, + int non_present_entry_flush) +{ + struct iommu *iommu = (struct iommu *) _iommu; + int tlb_offset = ecap_iotlb_offset(iommu->ecap); + u64 val = 0, val_iva = 0; + unsigned long flag; + s_time_t start_time; + + /* + * In the non-present entry flush case, if hardware doesn't cache + * non-present entry we do nothing and if hardware cache non-present + * entry, we flush entries of domain 0 (the domain id is used to cache + * any non-present entries) + */ + if ( non_present_entry_flush ) + { + if ( !cap_caching_mode(iommu->cap) ) + return 1; + else + did = 0; + } + + /* use register invalidation */ + switch ( type ) + { + case DMA_TLB_GLOBAL_FLUSH: + /* global flush doesn't need set IVA_REG */ + val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; + break; + case DMA_TLB_DSI_FLUSH: + val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); + break; + case DMA_TLB_PSI_FLUSH: + val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); + /* Note: always flush non-leaf currently */ + val_iva = size_order | addr; + break; + default: + BUG(); + } + /* Note: set drain read/write */ + if ( cap_read_drain(iommu->cap) ) + val |= DMA_TLB_READ_DRAIN; + if ( cap_write_drain(iommu->cap) ) + val |= DMA_TLB_WRITE_DRAIN; + + spin_lock_irqsave(&iommu->register_lock, flag); + 
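+    /*
+     * For a page-selective flush val_iva carries the 4K-aligned address
+     * with its size-order in the low bits (e.g. a naturally aligned
+     * 16-page flush at 0x12340000 yields 0x12340004); it goes into the
+     * IVA register at tlb_offset, the command into the IOTLB register
+     * 8 bytes above it, and completion is polled via DMA_TLB_IVT below.
+     */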
/* Note: Only uses first TLB reg currently */ + if ( val_iva ) + dmar_writeq(iommu->reg, tlb_offset, val_iva); + dmar_writeq(iommu->reg, tlb_offset + 8, val); + + /* Make sure hardware complete it */ + start_time = NOW(); + for ( ; ; ) + { + val = dmar_readq(iommu->reg, tlb_offset + 8); + if ( !(val & DMA_TLB_IVT) ) + break; + if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT ) + panic("%s: DMAR hardware is malfunctional," + " please disable IOMMU\n", __func__); + cpu_relax(); + } + spin_unlock_irqrestore(&iommu->register_lock, flag); + + /* check IOTLB invalidation granularity */ + if ( DMA_TLB_IAIG(val) == 0 ) + dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: flush IOTLB failed\n"); + + /* flush iotlb entry will implicitly flush write buffer */ + return 0; +} + +static int inline iommu_flush_iotlb_global(struct iommu *iommu, + int non_present_entry_flush) +{ + struct iommu_flush *flush = iommu_get_flush(iommu); + return flush->iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH, + non_present_entry_flush); +} + +static int inline iommu_flush_iotlb_dsi(struct iommu *iommu, u16 did, + int non_present_entry_flush) +{ + struct iommu_flush *flush = iommu_get_flush(iommu); + return flush->iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH, + non_present_entry_flush); +} + +static int inline get_alignment(u64 base, unsigned int size) +{ + int t = 0; + u64 end; + + end = base + size - 1; + while ( base != end ) + { + t++; + base >>= 1; + end >>= 1; + } + return t; +} + +static int inline iommu_flush_iotlb_psi( + struct iommu *iommu, u16 did, + u64 addr, unsigned int pages, int non_present_entry_flush) +{ + unsigned int align; + struct iommu_flush *flush = iommu_get_flush(iommu); + + ASSERT(!(addr & (~PAGE_MASK_4K))); + ASSERT(pages > 0); + + /* Fallback to domain selective flush if no PSI support */ + if ( !cap_pgsel_inv(iommu->cap) ) + return iommu_flush_iotlb_dsi(iommu, did, + non_present_entry_flush); + + /* + * PSI requires page size is 2 ^ x, and the base address is naturally + * aligned to the size + */ + align = get_alignment(addr >> PAGE_SHIFT_4K, pages); + /* Fallback to domain selective flush if size is too big */ + if ( align > cap_max_amask_val(iommu->cap) ) + return iommu_flush_iotlb_dsi(iommu, did, + non_present_entry_flush); + + addr >>= PAGE_SHIFT_4K + align; + addr <<= PAGE_SHIFT_4K + align; + + return flush->iotlb(iommu, did, addr, align, + DMA_TLB_PSI_FLUSH, non_present_entry_flush); +} + +void iommu_flush_all(void) +{ + struct acpi_drhd_unit *drhd; + struct iommu *iommu; + + flush_all_cache(); + for_each_drhd_unit ( drhd ) + { + iommu = drhd->iommu; + iommu_flush_context_global(iommu, 0); + iommu_flush_iotlb_global(iommu, 0); + } +} + +/* clear one page's page table */ +static void dma_pte_clear_one(struct domain *domain, u64 addr) +{ + struct hvm_iommu *hd = domain_hvm_iommu(domain); + struct acpi_drhd_unit *drhd; + struct iommu *iommu; + struct dma_pte *page = NULL, *pte = NULL; + u64 pg_maddr; + + spin_lock(&hd->mapping_lock); + /* get last level pte */ + pg_maddr = addr_to_dma_page_maddr(domain, addr, 0); + if ( pg_maddr == 0 ) + { + spin_unlock(&hd->mapping_lock); + return; + } + + page = (struct dma_pte *)map_vtd_domain_page(pg_maddr); + pte = page + address_level_offset(addr, 1); + + if ( !dma_pte_present(*pte) ) + { + spin_unlock(&hd->mapping_lock); + unmap_vtd_domain_page(page); + return; + } + + dma_clear_pte(*pte); + spin_unlock(&hd->mapping_lock); + iommu_flush_cache_entry(pte); + + /* No need pcidevs_lock here since do that on assign/deassign device*/ + for_each_drhd_unit ( drhd ) + { + 
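+        /*
+         * Only IOMMUs relevant to this domain (tracked in hd->iommu_bitmap)
+         * are flushed.  A page-selective invalidation of the single 4K page
+         * is issued; a non-zero return means a write-buffer flush is still
+         * needed.
+         */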
iommu = drhd->iommu; + if ( test_bit(iommu->index, &hd->iommu_bitmap) ) + if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(domain), + addr, 1, 0)) + iommu_flush_write_buffer(iommu); + } + + unmap_vtd_domain_page(page); +} + +static void iommu_free_pagetable(u64 pt_maddr, int level) +{ + int i; + struct dma_pte *pt_vaddr, *pte; + int next_level = level - 1; + + if ( pt_maddr == 0 ) + return; + + pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr); + + for ( i = 0; i < PTE_NUM; i++ ) + { + pte = &pt_vaddr[i]; + if ( !dma_pte_present(*pte) ) + continue; + + if ( next_level >= 1 ) + iommu_free_pagetable(dma_pte_addr(*pte), next_level); + + dma_clear_pte(*pte); + iommu_flush_cache_entry(pte); + } + + unmap_vtd_domain_page(pt_vaddr); + free_pgtable_maddr(pt_maddr); +} + +static int iommu_set_root_entry(struct iommu *iommu) +{ + u32 cmd, sts; + unsigned long flags; + s_time_t start_time; + + spin_lock(&iommu->lock); + + if ( iommu->root_maddr == 0 ) + iommu->root_maddr = alloc_pgtable_maddr(NULL); + if ( iommu->root_maddr == 0 ) + { + spin_unlock(&iommu->lock); + return -ENOMEM; + } + + spin_unlock(&iommu->lock); + spin_lock_irqsave(&iommu->register_lock, flags); + dmar_writeq(iommu->reg, DMAR_RTADDR_REG, iommu->root_maddr); + cmd = iommu->gcmd | DMA_GCMD_SRTP; + dmar_writel(iommu->reg, DMAR_GCMD_REG, cmd); + + /* Make sure hardware complete it */ + start_time = NOW(); + for ( ; ; ) + { + sts = dmar_readl(iommu->reg, DMAR_GSTS_REG); + if ( sts & DMA_GSTS_RTPS ) + break; + if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT ) + panic("%s: DMAR hardware is malfunctional," + " please disable IOMMU\n", __func__); + cpu_relax(); + } + + spin_unlock_irqrestore(&iommu->register_lock, flags); + + return 0; +} + +static void iommu_enable_translation(struct iommu *iommu) +{ + u32 sts; + unsigned long flags; + s_time_t start_time; + + dprintk(XENLOG_INFO VTDPREFIX, + "iommu_enable_translation: iommu->reg = %p\n", iommu->reg); + spin_lock_irqsave(&iommu->register_lock, flags); + iommu->gcmd |= DMA_GCMD_TE; + dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd); + /* Make sure hardware complete it */ + start_time = NOW(); + for ( ; ; ) + { + sts = dmar_readl(iommu->reg, DMAR_GSTS_REG); + if ( sts & DMA_GSTS_TES ) + break; + if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT ) + panic("%s: DMAR hardware is malfunctional," + " please disable IOMMU\n", __func__); + cpu_relax(); + } + + /* Disable PMRs when VT-d engine takes effect per spec definition */ + disable_pmr(iommu); + spin_unlock_irqrestore(&iommu->register_lock, flags); +} + +int iommu_disable_translation(struct iommu *iommu) +{ + u32 sts; + unsigned long flags; + s_time_t start_time; + + spin_lock_irqsave(&iommu->register_lock, flags); + iommu->gcmd &= ~ DMA_GCMD_TE; + dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd); + + /* Make sure hardware complete it */ + start_time = NOW(); + for ( ; ; ) + { + sts = dmar_readl(iommu->reg, DMAR_GSTS_REG); + if ( !(sts & DMA_GSTS_TES) ) + break; + if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT ) + panic("%s: DMAR hardware is malfunctional," + " please disable IOMMU\n", __func__); + cpu_relax(); + } + spin_unlock_irqrestore(&iommu->register_lock, flags); + return 0; +} + +static struct iommu *vector_to_iommu[NR_VECTORS]; +static int iommu_page_fault_do_one(struct iommu *iommu, int type, + u8 fault_reason, u16 source_id, u64 addr) +{ + dprintk(XENLOG_WARNING VTDPREFIX, + "iommu_fault:%s: %x:%x.%x addr %"PRIx64" REASON %x " + "iommu->reg = %p\n", + (type ? 
"DMA Read" : "DMA Write"), (source_id >> 8), + PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr, + fault_reason, iommu->reg); + +#ifndef __i386__ /* map_domain_page() cannot be used in this context */ + if ( fault_reason < 0x20 ) + print_vtd_entries(iommu, (source_id >> 8), + (source_id & 0xff), (addr >> PAGE_SHIFT)); +#endif + + return 0; +} + +static void iommu_fault_status(u32 fault_status) +{ + if ( fault_status & DMA_FSTS_PFO ) + dprintk(XENLOG_ERR VTDPREFIX, + "iommu_fault_status: Fault Overflow\n"); + if ( fault_status & DMA_FSTS_PPF ) + dprintk(XENLOG_ERR VTDPREFIX, + "iommu_fault_status: Primary Pending Fault\n"); + if ( fault_status & DMA_FSTS_AFO ) + dprintk(XENLOG_ERR VTDPREFIX, + "iommu_fault_status: Advanced Fault Overflow\n"); + if ( fault_status & DMA_FSTS_APF ) + dprintk(XENLOG_ERR VTDPREFIX, + "iommu_fault_status: Advanced Pending Fault\n"); + if ( fault_status & DMA_FSTS_IQE ) + dprintk(XENLOG_ERR VTDPREFIX, + "iommu_fault_status: Invalidation Queue Error\n"); + if ( fault_status & DMA_FSTS_ICE ) + dprintk(XENLOG_ERR VTDPREFIX, + "iommu_fault_status: Invalidation Completion Error\n"); + if ( fault_status & DMA_FSTS_ITE ) + dprintk(XENLOG_ERR VTDPREFIX, + "iommu_fault_status: Invalidation Time-out Error\n"); +} + +#define PRIMARY_FAULT_REG_LEN (16) +static void iommu_page_fault(int vector, void *dev_id, + struct cpu_user_regs *regs) +{ + struct iommu *iommu = dev_id; + int reg, fault_index; + u32 fault_status; + unsigned long flags; + + dprintk(XENLOG_WARNING VTDPREFIX, + "iommu_page_fault: iommu->reg = %p\n", iommu->reg); + + fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG); + + iommu_fault_status(fault_status); + + /* FIXME: ignore advanced fault log */ + if ( !(fault_status & DMA_FSTS_PPF) ) + goto clear_overflow; + + fault_index = dma_fsts_fault_record_index(fault_status); + reg = cap_fault_reg_offset(iommu->cap); + while (1) + { + u8 fault_reason; + u16 source_id, cword; + u32 data; + u64 guest_addr; + int type; + + /* highest 32 bits */ + spin_lock_irqsave(&iommu->register_lock, flags); + data = dmar_readl(iommu->reg, reg + + fault_index * PRIMARY_FAULT_REG_LEN + 12); + if ( !(data & DMA_FRCD_F) ) + { + spin_unlock_irqrestore(&iommu->register_lock, flags); + break; + } + + fault_reason = dma_frcd_fault_reason(data); + type = dma_frcd_type(data); + + data = dmar_readl(iommu->reg, reg + + fault_index * PRIMARY_FAULT_REG_LEN + 8); + source_id = dma_frcd_source_id(data); + + guest_addr = dmar_readq(iommu->reg, reg + + fault_index * PRIMARY_FAULT_REG_LEN); + guest_addr = dma_frcd_page_addr(guest_addr); + /* clear the fault */ + dmar_writel(iommu->reg, reg + + fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F); + spin_unlock_irqrestore(&iommu->register_lock, flags); + + iommu_page_fault_do_one(iommu, type, fault_reason, + source_id, guest_addr); + + /* Tell the device to stop DMAing; we can't rely on the guest to + * control it for us. 
*/ + cword = pci_conf_read16(PCI_BUS(source_id), PCI_SLOT(source_id), + PCI_FUNC(source_id), PCI_COMMAND); + pci_conf_write16(PCI_BUS(source_id), PCI_SLOT(source_id), + PCI_FUNC(source_id), PCI_COMMAND, + cword & ~PCI_COMMAND_MASTER); + + fault_index++; + if ( fault_index > cap_num_fault_regs(iommu->cap) ) + fault_index = 0; + } +clear_overflow: + /* clear primary fault overflow */ + fault_status = readl(iommu->reg + DMAR_FSTS_REG); + if ( fault_status & DMA_FSTS_PFO ) + { + spin_lock_irqsave(&iommu->register_lock, flags); + dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO); + spin_unlock_irqrestore(&iommu->register_lock, flags); + } +} + +static void dma_msi_unmask(unsigned int vector) +{ + struct iommu *iommu = vector_to_iommu[vector]; + unsigned long flags; + + /* unmask it */ + spin_lock_irqsave(&iommu->register_lock, flags); + dmar_writel(iommu->reg, DMAR_FECTL_REG, 0); + spin_unlock_irqrestore(&iommu->register_lock, flags); +} + +static void dma_msi_mask(unsigned int vector) +{ + unsigned long flags; + struct iommu *iommu = vector_to_iommu[vector]; + + /* mask it */ + spin_lock_irqsave(&iommu->register_lock, flags); + dmar_writel(iommu->reg, DMAR_FECTL_REG, DMA_FECTL_IM); + spin_unlock_irqrestore(&iommu->register_lock, flags); +} + +static unsigned int dma_msi_startup(unsigned int vector) +{ + dma_msi_unmask(vector); + return 0; +} + +static void dma_msi_end(unsigned int vector) +{ + dma_msi_unmask(vector); + ack_APIC_irq(); +} + +static void dma_msi_data_init(struct iommu *iommu, int vector) +{ + u32 msi_data = 0; + unsigned long flags; + + /* Fixed, edge, assert mode. Follow MSI setting */ + msi_data |= vector & 0xff; + msi_data |= 1 << 14; + + spin_lock_irqsave(&iommu->register_lock, flags); + dmar_writel(iommu->reg, DMAR_FEDATA_REG, msi_data); + spin_unlock_irqrestore(&iommu->register_lock, flags); +} + +#ifdef SUPPORT_MSI_REMAPPING +static void dma_msi_addr_init(struct iommu *iommu, int phy_cpu) +{ + u64 msi_address; + unsigned long flags; + + /* Physical, dedicated cpu. 
Follow MSI setting */ + msi_address = (MSI_ADDRESS_HEADER << (MSI_ADDRESS_HEADER_SHIFT + 8)); + msi_address |= MSI_PHYSICAL_MODE << 2; + msi_address |= MSI_REDIRECTION_HINT_MODE << 3; + msi_address |= phy_cpu << MSI_TARGET_CPU_SHIFT; + + spin_lock_irqsave(&iommu->register_lock, flags); + dmar_writel(iommu->reg, DMAR_FEADDR_REG, (u32)msi_address); + dmar_writel(iommu->reg, DMAR_FEUADDR_REG, (u32)(msi_address >> 32)); + spin_unlock_irqrestore(&iommu->register_lock, flags); +} +#else +static void dma_msi_addr_init(struct iommu *iommu, int phy_cpu) +{ + /* ia64: TODO */ +} +#endif + +static void dma_msi_set_affinity(unsigned int vector, cpumask_t dest) +{ + struct iommu *iommu = vector_to_iommu[vector]; + dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(dest))); +} + +static struct hw_interrupt_type dma_msi_type = { + .typename = "DMA_MSI", + .startup = dma_msi_startup, + .shutdown = dma_msi_mask, + .enable = dma_msi_unmask, + .disable = dma_msi_mask, + .ack = dma_msi_mask, + .end = dma_msi_end, + .set_affinity = dma_msi_set_affinity, +}; + +int iommu_set_interrupt(struct iommu *iommu) +{ + int vector, ret; + + vector = assign_irq_vector(AUTO_ASSIGN); + vector_to_iommu[vector] = iommu; + + if ( !vector ) + { + gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no vectors\n"); + return -EINVAL; + } + + irq_desc[vector].handler = &dma_msi_type; + ret = request_irq(vector, iommu_page_fault, 0, "dmar", iommu); + if ( ret ) + gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: can't request irq\n"); + return vector; +} + +static int iommu_alloc(struct acpi_drhd_unit *drhd) +{ + struct iommu *iommu; + unsigned long sagaw; + int agaw; + + if ( nr_iommus > MAX_IOMMUS ) + { + gdprintk(XENLOG_ERR VTDPREFIX, + "IOMMU: nr_iommus %d > MAX_IOMMUS\n", nr_iommus); + return -ENOMEM; + } + + iommu = xmalloc(struct iommu); + if ( iommu == NULL ) + return -ENOMEM; + memset(iommu, 0, sizeof(struct iommu)); + + iommu->intel = alloc_intel_iommu(); + if ( iommu->intel == NULL ) + { + xfree(iommu); + return -ENOMEM; + } + + iommu->reg = map_to_nocache_virt(nr_iommus, drhd->address); + iommu->index = nr_iommus++; + + iommu->cap = dmar_readq(iommu->reg, DMAR_CAP_REG); + iommu->ecap = dmar_readq(iommu->reg, DMAR_ECAP_REG); + + /* Calculate number of pagetable levels: between 2 and 4. */ + sagaw = cap_sagaw(iommu->cap); + for ( agaw = level_to_agaw(4); agaw >= 0; agaw-- ) + if ( test_bit(agaw, &sagaw) ) + break; + if ( agaw < 0 ) + { + gdprintk(XENLOG_ERR VTDPREFIX, + "IOMMU: unsupported sagaw %lx\n", sagaw); + xfree(iommu); + return -ENODEV; + } + iommu->nr_pt_levels = agaw_to_level(agaw); + + if ( !ecap_coherent(iommu->ecap) ) + iommus_incoherent = 1; + + spin_lock_init(&iommu->lock); + spin_lock_init(&iommu->register_lock); + + drhd->iommu = iommu; + return 0; +} + +static void iommu_free(struct acpi_drhd_unit *drhd) +{ + struct iommu *iommu = drhd->iommu; + + if ( iommu == NULL ) + return; + + if ( iommu->root_maddr != 0 ) + { + free_pgtable_maddr(iommu->root_maddr); + iommu->root_maddr = 0; + } + + if ( iommu->reg ) + iounmap(iommu->reg); + + free_intel_iommu(iommu->intel); + free_irq(iommu->vector); + xfree(iommu); + + drhd->iommu = NULL; +} + +#define guestwidth_to_adjustwidth(gaw) ({ \ + int agaw, r = (gaw - 12) % 9; \ + agaw = (r == 0) ? 
gaw : (gaw + 9 - r); \ + if ( agaw > 64 ) \ + agaw = 64; \ + agaw; }) + +static int intel_iommu_domain_init(struct domain *d) +{ + struct hvm_iommu *hd = domain_hvm_iommu(d); + u64 i, j, tmp; + struct acpi_drhd_unit *drhd; + + hd->agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH); + + if ( d->domain_id == 0 ) + { + extern int xen_in_range(paddr_t start, paddr_t end); + + /* + * Set up 1:1 page table for dom0 except the critical segments + * like Xen. + */ + for ( i = 0; i < max_page; i++ ) + { + if ( xen_in_range(i << PAGE_SHIFT, (i + 1) << PAGE_SHIFT) ) + continue; + + tmp = 1 << (PAGE_SHIFT - PAGE_SHIFT_4K); + for ( j = 0; j < tmp; j++ ) + iommu_map_page(d, (i*tmp+j), (i*tmp+j)); + } + + setup_dom0_devices(d); + setup_dom0_rmrr(d); + + iommu_flush_all(); + + for_each_drhd_unit ( drhd ) + iommu_enable_translation(drhd->iommu); + } + + return 0; +} + +static int domain_context_mapping_one( + struct domain *domain, + struct iommu *iommu, + u8 bus, u8 devfn) +{ + struct hvm_iommu *hd = domain_hvm_iommu(domain); + struct context_entry *context, *context_entries; + u64 maddr, pgd_maddr; + struct pci_dev *pdev = NULL; + int agaw; + + ASSERT(spin_is_locked(&pcidevs_lock)); + spin_lock(&iommu->lock); + maddr = bus_to_context_maddr(iommu, bus); + context_entries = (struct context_entry *)map_vtd_domain_page(maddr); + context = &context_entries[devfn]; + + if ( context_present(*context) ) + { + int res = 0; + + pdev = pci_get_pdev(bus, devfn); + if (!pdev) + res = -ENODEV; + else if (pdev->domain != domain) + res = -EINVAL; + unmap_vtd_domain_page(context_entries); + spin_unlock(&iommu->lock); + return res; + } + + if ( iommu_passthrough && + ecap_pass_thru(iommu->ecap) && (domain->domain_id == 0) ) + { + context_set_translation_type(*context, CONTEXT_TT_PASS_THRU); + agaw = level_to_agaw(iommu->nr_pt_levels); + } + else + { + spin_lock(&hd->mapping_lock); + + /* Ensure we have pagetables allocated down to leaf PTE. */ + if ( hd->pgd_maddr == 0 ) + { + addr_to_dma_page_maddr(domain, 0, 1); + if ( hd->pgd_maddr == 0 ) + { + nomem: + spin_unlock(&hd->mapping_lock); + spin_unlock(&iommu->lock); + unmap_vtd_domain_page(context_entries); + return -ENOMEM; + } + } + + /* Skip top levels of page tables for 2- and 3-level DRHDs. */ + pgd_maddr = hd->pgd_maddr; + for ( agaw = level_to_agaw(4); + agaw != level_to_agaw(iommu->nr_pt_levels); + agaw-- ) + { + struct dma_pte *p = map_vtd_domain_page(pgd_maddr); + pgd_maddr = dma_pte_addr(*p); + unmap_vtd_domain_page(p); + if ( pgd_maddr == 0 ) + goto nomem; + } + + context_set_address_root(*context, pgd_maddr); + context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL); + spin_unlock(&hd->mapping_lock); + } + + /* + * domain_id 0 is not valid on Intel's IOMMU, force domain_id to + * be 1 based as required by intel's iommu hw. + */ + context_set_domain_id(context, domain); + context_set_address_width(*context, agaw); + context_set_fault_enable(*context); + context_set_present(*context); + iommu_flush_cache_entry(context); + spin_unlock(&iommu->lock); + + /* Context entry was previously non-present (with domid 0). 
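 * (Editor's illustrative aside, not part of the original patch: on
 * caching-mode hardware the non-present entry may be cached under
 * domain id 0, which is why the device-selective context flush below
 * passes did 0 together with source-id (bus << 8) | devfn -- e.g. bus
 * 0x1a, devfn 0x10 gives source-id 0x1a10 -- and why a domain-selective
 * IOTLB flush for did 0 follows when the context flush was actually
 * performed; only when nothing needed flushing is a plain write-buffer
 * flush enough.)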
*/ + if ( iommu_flush_context_device(iommu, 0, (((u16)bus) << 8) | devfn, + DMA_CCMD_MASK_NOBIT, 1) ) + iommu_flush_write_buffer(iommu); + else + iommu_flush_iotlb_dsi(iommu, 0, 1); + + set_bit(iommu->index, &hd->iommu_bitmap); + + unmap_vtd_domain_page(context_entries); + + return 0; +} + +#define PCI_BASE_CLASS_BRIDGE 0x06 +#define PCI_CLASS_BRIDGE_PCI 0x0604 + +enum { + DEV_TYPE_PCIe_ENDPOINT, + DEV_TYPE_PCIe_BRIDGE, // PCIe root port, switch + DEV_TYPE_PCI_BRIDGE, // PCIe-to-PCI/PCIx bridge, PCI-to-PCI bridge + DEV_TYPE_PCI, +}; + +int pdev_type(u8 bus, u8 devfn) +{ + u16 class_device; + u16 status, creg; + int pos; + u8 d = PCI_SLOT(devfn), f = PCI_FUNC(devfn); + + class_device = pci_conf_read16(bus, d, f, PCI_CLASS_DEVICE); + if ( class_device == PCI_CLASS_BRIDGE_PCI ) + { + pos = pci_find_next_cap(bus, devfn, + PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP); + if ( !pos ) + return DEV_TYPE_PCI_BRIDGE; + creg = pci_conf_read16(bus, d, f, pos + PCI_EXP_FLAGS); + return ((creg & PCI_EXP_FLAGS_TYPE) >> 4) == PCI_EXP_TYPE_PCI_BRIDGE ? + DEV_TYPE_PCI_BRIDGE : DEV_TYPE_PCIe_BRIDGE; + } + + status = pci_conf_read16(bus, d, f, PCI_STATUS); + if ( !(status & PCI_STATUS_CAP_LIST) ) + return DEV_TYPE_PCI; + + if ( pci_find_next_cap(bus, devfn, PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP) ) + return DEV_TYPE_PCIe_ENDPOINT; + + return DEV_TYPE_PCI; +} + +#define MAX_BUSES 256 +static DEFINE_SPINLOCK(bus2bridge_lock); +static struct { u8 map, bus, devfn; } bus2bridge[MAX_BUSES]; + +static int _find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus) +{ + int cnt = 0; + *secbus = *bus; + + ASSERT(spin_is_locked(&bus2bridge_lock)); + if ( !bus2bridge[*bus].map ) + return 0; + + while ( bus2bridge[*bus].map ) + { + *secbus = *bus; + *devfn = bus2bridge[*bus].devfn; + *bus = bus2bridge[*bus].bus; + if ( cnt++ >= MAX_BUSES ) + return 0; + } + + return 1; +} + +static int find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus) +{ + int ret = 0; + + if ( *bus == 0 ) + /* assume integrated PCI devices in RC have valid requester-id */ + return 1; + + spin_lock(&bus2bridge_lock); + ret = _find_pcie_endpoint(bus, devfn, secbus); + spin_unlock(&bus2bridge_lock); + + return ret; +} + +static int domain_context_mapping(struct domain *domain, u8 bus, u8 devfn) +{ + struct acpi_drhd_unit *drhd; + int ret = 0; + u16 sec_bus, sub_bus; + u32 type; + u8 secbus, secdevfn; + + drhd = acpi_find_matched_drhd_unit(bus, devfn); + if ( !drhd ) + return -ENODEV; + + ASSERT(spin_is_locked(&pcidevs_lock)); + + type = pdev_type(bus, devfn); + switch ( type ) + { + case DEV_TYPE_PCIe_BRIDGE: + break; + + case DEV_TYPE_PCI_BRIDGE: + sec_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn), + PCI_SECONDARY_BUS); + sub_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn), + PCI_SUBORDINATE_BUS); + + spin_lock(&bus2bridge_lock); + for ( sub_bus &= 0xff; sec_bus <= sub_bus; sec_bus++ ) + { + bus2bridge[sec_bus].map = 1; + bus2bridge[sec_bus].bus = bus; + bus2bridge[sec_bus].devfn = devfn; + } + spin_unlock(&bus2bridge_lock); + break; + + case DEV_TYPE_PCIe_ENDPOINT: + gdprintk(XENLOG_INFO VTDPREFIX, + "domain_context_mapping:PCIe: bdf = %x:%x.%x\n", + bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn); + break; + + case DEV_TYPE_PCI: + gdprintk(XENLOG_INFO VTDPREFIX, + "domain_context_mapping:PCI: bdf = %x:%x.%x\n", + bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + + ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn); + if ( ret ) + break; + + secbus = bus; + secdevfn = devfn; + /* 
dependent devices mapping */ + while ( bus2bridge[bus].map ) + { + secbus = bus; + secdevfn = devfn; + devfn = bus2bridge[bus].devfn; + bus = bus2bridge[bus].bus; + ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn); + if ( ret ) + return ret; + } + + if ( (secbus != bus) && (secdevfn != 0) ) + /* + * The source-id for transactions on non-PCIe buses seem + * to originate from devfn=0 on the secondary bus behind + * the bridge. Map that id as well. The id to use in + * these scanarios is not particularly well documented + * anywhere. + */ + ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0); + break; + + default: + gdprintk(XENLOG_ERR VTDPREFIX, + "domain_context_mapping:unknown type : bdf = %x:%x.%x\n", + bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + ret = -EINVAL; + break; + } + + return ret; +} + +static int domain_context_unmap_one( + struct domain *domain, + struct iommu *iommu, + u8 bus, u8 devfn) +{ + struct context_entry *context, *context_entries; + u64 maddr; + + ASSERT(spin_is_locked(&pcidevs_lock)); + spin_lock(&iommu->lock); + + maddr = bus_to_context_maddr(iommu, bus); + context_entries = (struct context_entry *)map_vtd_domain_page(maddr); + context = &context_entries[devfn]; + + if ( !context_present(*context) ) + { + spin_unlock(&iommu->lock); + unmap_vtd_domain_page(context_entries); + return 0; + } + + context_clear_present(*context); + context_clear_entry(*context); + iommu_flush_cache_entry(context); + + if ( iommu_flush_context_device(iommu, domain_iommu_domid(domain), + (((u16)bus) << 8) | devfn, + DMA_CCMD_MASK_NOBIT, 0) ) + iommu_flush_write_buffer(iommu); + else + iommu_flush_iotlb_dsi(iommu, domain_iommu_domid(domain), 0); + + spin_unlock(&iommu->lock); + unmap_vtd_domain_page(context_entries); + + return 0; +} + +static int domain_context_unmap(struct domain *domain, u8 bus, u8 devfn) +{ + struct acpi_drhd_unit *drhd; + int ret = 0; + u32 type; + u8 secbus, secdevfn; + + drhd = acpi_find_matched_drhd_unit(bus, devfn); + if ( !drhd ) + return -ENODEV; + + type = pdev_type(bus, devfn); + switch ( type ) + { + case DEV_TYPE_PCIe_BRIDGE: + case DEV_TYPE_PCI_BRIDGE: + break; + + case DEV_TYPE_PCIe_ENDPOINT: + gdprintk(XENLOG_INFO VTDPREFIX, + "domain_context_unmap:PCIe: bdf = %x:%x.%x\n", + bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn); + break; + + case DEV_TYPE_PCI: + gdprintk(XENLOG_INFO VTDPREFIX, + "domain_context_unmap:PCI: bdf = %x:%x.%x\n", + bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn); + if ( ret ) + break; + + secbus = bus; + secdevfn = devfn; + /* dependent devices unmapping */ + while ( bus2bridge[bus].map ) + { + secbus = bus; + secdevfn = devfn; + devfn = bus2bridge[bus].devfn; + bus = bus2bridge[bus].bus; + ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn); + if ( ret ) + return ret; + } + + if ( (secbus != bus) && (secdevfn != 0) ) + ret = domain_context_unmap_one(domain, drhd->iommu, secbus, 0); + break; + + default: + gdprintk(XENLOG_ERR VTDPREFIX, + "domain_context_unmap:unknown type: bdf = %x:%x:%x\n", + bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + ret = -EINVAL; + break; + } + + return ret; +} + +static int intel_iommu_assignable(const struct domain *d) +{ + if (d != dom0 && !iommu_intremap && iommu_intremap_cmdline) { + printk("Interrupt Remapping hardware not found, passing devices\n"); + printk("to unprivileged domains is insecure. 
If you really want\n"); + printk("to do this, please boot with \"iommu=no-intremap\".\n"); + return 0; + } + return 1; +} + +static int reassign_device_ownership( + struct domain *source, + struct domain *target, + u8 bus, u8 devfn) +{ + struct hvm_iommu *source_hd = domain_hvm_iommu(source); + struct pci_dev *pdev; + struct acpi_drhd_unit *drhd; + struct iommu *pdev_iommu; + int ret, found = 0; + + ASSERT(spin_is_locked(&pcidevs_lock)); + pdev = pci_get_pdev_by_domain(source, bus, devfn); + + if (!pdev || !intel_iommu_assignable(target)) + return -ENODEV; + + drhd = acpi_find_matched_drhd_unit(bus, devfn); + if ( !drhd ) + return -ENODEV; + pdev_iommu = drhd->iommu; + domain_context_unmap(source, bus, devfn); + + ret = domain_context_mapping(target, bus, devfn); + if ( ret ) + return ret; + + list_move(&pdev->domain_list, &target->arch.pdev_list); + pdev->domain = target; + + for_each_pdev ( source, pdev ) + { + drhd = acpi_find_matched_drhd_unit(pdev->bus, pdev->devfn); + if ( drhd && drhd->iommu == pdev_iommu ) + { + found = 1; + break; + } + } + + if ( !found ) + clear_bit(pdev_iommu->index, &source_hd->iommu_bitmap); + + return ret; +} + +void iommu_domain_teardown(struct domain *d) +{ + struct hvm_iommu *hd = domain_hvm_iommu(d); + + if ( list_empty(&acpi_drhd_units) ) + return; + + spin_lock(&hd->mapping_lock); + iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw)); + hd->pgd_maddr = 0; + spin_unlock(&hd->mapping_lock); + + iommu_domid_release(d); +} + +int intel_iommu_map_page( + struct domain *d, unsigned long gfn, unsigned long mfn) +{ + struct hvm_iommu *hd = domain_hvm_iommu(d); + struct acpi_drhd_unit *drhd; + struct iommu *iommu; + struct dma_pte *page = NULL, *pte = NULL; + u64 pg_maddr; + int pte_present; + + drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list); + iommu = drhd->iommu; + + /* do nothing if dom0 and iommu supports pass thru */ + if ( iommu_passthrough && + ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) ) + return 0; + + spin_lock(&hd->mapping_lock); + + pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, 1); + if ( pg_maddr == 0 ) + { + spin_unlock(&hd->mapping_lock); + return -ENOMEM; + } + page = (struct dma_pte *)map_vtd_domain_page(pg_maddr); + pte = page + (gfn & LEVEL_MASK); + pte_present = dma_pte_present(*pte); + dma_set_pte_addr(*pte, (paddr_t)mfn << PAGE_SHIFT_4K); + dma_set_pte_prot(*pte, DMA_PTE_READ | DMA_PTE_WRITE); + + /* Set the SNP on leaf page table if Snoop Control available */ + if ( iommu_snoop ) + dma_set_pte_snp(*pte); + + iommu_flush_cache_entry(pte); + spin_unlock(&hd->mapping_lock); + unmap_vtd_domain_page(page); + + /* + * No need pcideves_lock here because we have flush + * when assign/deassign device + */ + for_each_drhd_unit ( drhd ) + { + iommu = drhd->iommu; + + if ( !test_bit(iommu->index, &hd->iommu_bitmap) ) + continue; + + if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(d), + (paddr_t)gfn << PAGE_SHIFT_4K, 1, + !pte_present) ) + iommu_flush_write_buffer(iommu); + } + + return 0; +} + +int intel_iommu_unmap_page(struct domain *d, unsigned long gfn) +{ + struct acpi_drhd_unit *drhd; + struct iommu *iommu; + + drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list); + iommu = drhd->iommu; + + /* do nothing if dom0 and iommu supports pass thru */ + if ( iommu_passthrough && + ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) ) + return 0; + + dma_pte_clear_one(d, (paddr_t)gfn << PAGE_SHIFT_4K); + + return 0; +} + +static int iommu_prepare_rmrr_dev(struct domain *d, + struct 
acpi_rmrr_unit *rmrr, + u8 bus, u8 devfn) +{ + int ret = 0; + u64 base, end; + unsigned long base_pfn, end_pfn; + + ASSERT(spin_is_locked(&pcidevs_lock)); + ASSERT(rmrr->base_address < rmrr->end_address); + + base = rmrr->base_address & PAGE_MASK_4K; + base_pfn = base >> PAGE_SHIFT_4K; + end = PAGE_ALIGN_4K(rmrr->end_address); + end_pfn = end >> PAGE_SHIFT_4K; + + while ( base_pfn < end_pfn ) + { + intel_iommu_map_page(d, base_pfn, base_pfn); + base_pfn++; + } + + ret = domain_context_mapping(d, bus, devfn); + + return ret; +} + +static int intel_iommu_add_device(struct pci_dev *pdev) +{ + struct acpi_rmrr_unit *rmrr; + u16 bdf; + int ret, i; + + ASSERT(spin_is_locked(&pcidevs_lock)); + + if ( !pdev->domain ) + return -EINVAL; + + ret = domain_context_mapping(pdev->domain, pdev->bus, pdev->devfn); + if ( ret ) + { + gdprintk(XENLOG_ERR VTDPREFIX, + "intel_iommu_add_device: context mapping failed\n"); + return ret; + } + + for_each_rmrr_device ( rmrr, bdf, i ) + { + if ( PCI_BUS(bdf) == pdev->bus && PCI_DEVFN2(bdf) == pdev->devfn ) + { + ret = iommu_prepare_rmrr_dev(pdev->domain, rmrr, + pdev->bus, pdev->devfn); + if ( ret ) + gdprintk(XENLOG_ERR VTDPREFIX, + "intel_iommu_add_device: RMRR mapping failed\n"); + break; + } + } + + return ret; +} + +static int intel_iommu_remove_device(struct pci_dev *pdev) +{ + struct acpi_rmrr_unit *rmrr; + u16 bdf; + int i; + + if ( !pdev->domain ) + return -EINVAL; + + /* If the device belongs to dom0, and it has RMRR, don't remove it + * from dom0, because BIOS may use RMRR at booting time. + */ + if ( pdev->domain->domain_id == 0 ) + { + for_each_rmrr_device ( rmrr, bdf, i ) + { + if ( PCI_BUS(bdf) == pdev->bus && + PCI_DEVFN2(bdf) == pdev->devfn ) + return 0; + } + } + + return domain_context_unmap(pdev->domain, pdev->bus, pdev->devfn); +} + +static void setup_dom0_devices(struct domain *d) +{ + struct hvm_iommu *hd; + struct pci_dev *pdev; + int bus, dev, func; + u32 l; + + hd = domain_hvm_iommu(d); + + spin_lock(&pcidevs_lock); + for ( bus = 0; bus < 256; bus++ ) + { + for ( dev = 0; dev < 32; dev++ ) + { + for ( func = 0; func < 8; func++ ) + { + l = pci_conf_read32(bus, dev, func, PCI_VENDOR_ID); + /* some broken boards return 0 or ~0 if a slot is empty: */ + if ( (l == 0xffffffff) || (l == 0x00000000) || + (l == 0x0000ffff) || (l == 0xffff0000) ) + continue; + + pdev = alloc_pdev(bus, PCI_DEVFN(dev, func)); + pdev->domain = d; + list_add(&pdev->domain_list, &d->arch.pdev_list); + domain_context_mapping(d, pdev->bus, pdev->devfn); + } + } + } + spin_unlock(&pcidevs_lock); +} + +void clear_fault_bits(struct iommu *iommu) +{ + u64 val; + + val = dmar_readq( + iommu->reg, + cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+0x8); + dmar_writeq( + iommu->reg, + cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+8, + val); + dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_FAULTS); +} + +static int init_vtd_hw(void) +{ + struct acpi_drhd_unit *drhd; + struct iommu *iommu; + struct iommu_flush *flush = NULL; + int vector; + int ret; + + for_each_drhd_unit ( drhd ) + { + iommu = drhd->iommu; + ret = iommu_set_root_entry(iommu); + if ( ret ) + { + gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: set root entry failed\n"); + return -EIO; + } + + vector = iommu_set_interrupt(iommu); + dma_msi_data_init(iommu, vector); + dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(cpu_online_map))); + iommu->vector = vector; + clear_fault_bits(iommu); + dmar_writel(iommu->reg, DMAR_FECTL_REG, 0); + + /* initialize flush functions */ + flush = iommu_get_flush(iommu); 
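/*
 * Editor's illustrative aside, not part of the original patch: the two
 * register-based callbacks installed just below are only defaults;
 * qinval_setup() later swaps in flush_context_qi/flush_iotlb_qi when the
 * unit supports queued invalidation.  Callers never pick an
 * implementation themselves, they always go through the indirection,
 * roughly (hypothetical helper name):
 *
 *     static int example_flush_iotlb_global(struct iommu *iommu)
 *     {
 *         struct iommu_flush *f = iommu_get_flush(iommu);
 *         return f->iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH, 0);
 *     }
 */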
+ flush->context = flush_context_reg; + flush->iotlb = flush_iotlb_reg; + } + + for_each_drhd_unit ( drhd ) + { + iommu = drhd->iommu; + if ( qinval_setup(iommu) != 0 ) + dprintk(XENLOG_INFO VTDPREFIX, + "Queued Invalidation hardware not found\n"); + } + + if (iommu_intremap) { + for_each_drhd_unit ( drhd ) + { + iommu = drhd->iommu; + if ( intremap_setup(iommu) != 0 ) { + iommu_intremap = 0; + dprintk(XENLOG_INFO VTDPREFIX, + "Interrupt Remapping hardware not found\n"); + dprintk(XENLOG_INFO VTDPREFIX, + "Device assignment will be disabled for security reasons (CVE-2011-1898).\n"); + dprintk(XENLOG_INFO VTDPREFIX, + "Use iommu=no-intremap to override.\n"); + } + } + } + + return 0; +} + +static void setup_dom0_rmrr(struct domain *d) +{ + struct acpi_rmrr_unit *rmrr; + u16 bdf; + int ret, i; + + spin_lock(&pcidevs_lock); + for_each_rmrr_device ( rmrr, bdf, i ) + { + ret = iommu_prepare_rmrr_dev(d, rmrr, PCI_BUS(bdf), PCI_DEVFN2(bdf)); + if ( ret ) + gdprintk(XENLOG_ERR VTDPREFIX, + "IOMMU: mapping reserved region failed\n"); + } + spin_unlock(&pcidevs_lock); +} + +static void platform_quirks(void) +{ + u32 id; + + /* Mobile 4 Series Chipset neglects to set RWBF capability, + * but needs it + */ + gdprintk(XENLOG_INFO VTDPREFIX, "DMAR: Forcing write-buffer flush\n"); + id = pci_conf_read32(0, 0, 0, 0); + if ( id == 0x2a408086 ) + rwbf_quirk = 1; +} + +int intel_vtd_setup(void) +{ + struct acpi_drhd_unit *drhd; + struct iommu *iommu; + + if ( !vtd_enabled ) + return -ENODEV; + + platform_quirks(); + + spin_lock_init(&domid_bitmap_lock); + clflush_size = get_cache_line_size(); + + for_each_drhd_unit ( drhd ) + if ( iommu_alloc(drhd) != 0 ) + goto error; + + /* Allocate IO page directory page for the domain. */ + drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list); + iommu = drhd->iommu; + + /* Allocate domain id bitmap, and set bit 0 as reserved */ + domid_bitmap_size = cap_ndoms(iommu->cap); + domid_bitmap = xmalloc_array(unsigned long, + BITS_TO_LONGS(domid_bitmap_size)); + if ( domid_bitmap == NULL ) + goto error; + memset(domid_bitmap, 0, domid_bitmap_size / 8); + set_bit(0, domid_bitmap); + + if ( init_vtd_hw() ) + goto error; + + /* Giving that all devices within guest use same io page table, + * enable snoop control only if all VT-d engines support it. + */ + if ( iommu_snoop ) + { + for_each_drhd_unit ( drhd ) + { + iommu = drhd->iommu; + if ( !ecap_snp_ctl(iommu->ecap) ) { + iommu_snoop = 0; + break; + } + } + } + + printk("Intel VT-d snoop control %sabled\n", iommu_snoop ? "en" : "dis"); + return 0; + + error: + for_each_drhd_unit ( drhd ) + iommu_free(drhd); + vtd_enabled = 0; + iommu_snoop = 0; + return -ENOMEM; +} + +int intel_iommu_assign_device(struct domain *d, u8 bus, u8 devfn) +{ + struct acpi_rmrr_unit *rmrr; + int ret = 0, i; + struct pci_dev *pdev; + u16 bdf; + + if ( list_empty(&acpi_drhd_units) ) + return -ENODEV; + + ASSERT(spin_is_locked(&pcidevs_lock)); + pdev = pci_get_pdev(bus, devfn); + if (!pdev) + return -ENODEV; + + if (pdev->domain != dom0) + { + gdprintk(XENLOG_ERR VTDPREFIX, + "IOMMU: assign a assigned device\n"); + return -EBUSY; + } + + ret = reassign_device_ownership(dom0, d, bus, devfn); + if ( ret ) + goto done; + + /* Setup rmrr identity mapping */ + for_each_rmrr_device( rmrr, bdf, i ) + { + if ( PCI_BUS(bdf) == bus && PCI_DEVFN2(bdf) == devfn ) + { + /* FIXME: Because USB RMRR conflicts with guest bios region, + * ignore USB RMRR temporarily. 
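 * (Editor's illustrative aside, not part of the original patch:
 * is_usb_device() in utils.c keys off PCI_CLASS_DEVICE == 0x0c03, the
 * serial-bus/USB host-controller class, so e.g. a UHCI or EHCI
 * controller whose full class code is 0x0c03xx is skipped here and no
 * RMRR identity mapping is set up for it during assignment.)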
+ */ + if ( is_usb_device(bus, devfn) ) + { + ret = 0; + goto done; + } + + ret = iommu_prepare_rmrr_dev(d, rmrr, bus, devfn); + if ( ret ) + gdprintk(XENLOG_ERR VTDPREFIX, + "IOMMU: mapping reserved region failed\n"); + goto done; + } + } + +done: + return ret; +} + +static int intel_iommu_group_id(u8 bus, u8 devfn) +{ + u8 secbus; + if ( !bus2bridge[bus].map || find_pcie_endpoint(&bus, &devfn, &secbus) ) + return PCI_BDF2(bus, devfn); + else + return -1; +} + +static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS]; +void iommu_suspend(void) +{ + struct acpi_drhd_unit *drhd; + struct iommu *iommu; + u32 i; + + if ( !vtd_enabled ) + return; + + iommu_flush_all(); + + for_each_drhd_unit ( drhd ) + { + iommu = drhd->iommu; + i = iommu->index; + + iommu_state[i][DMAR_FECTL_REG] = + (u32) dmar_readl(iommu->reg, DMAR_FECTL_REG); + iommu_state[i][DMAR_FEDATA_REG] = + (u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG); + iommu_state[i][DMAR_FEADDR_REG] = + (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG); + iommu_state[i][DMAR_FEUADDR_REG] = + (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG); + } +} + +void iommu_resume(void) +{ + struct acpi_drhd_unit *drhd; + struct iommu *iommu; + u32 i; + + if ( !vtd_enabled ) + return; + + iommu_flush_all(); + + if ( init_vtd_hw() != 0 && force_iommu ) + panic("IOMMU setup failed, crash Xen for security purpose!\n"); + + for_each_drhd_unit ( drhd ) + { + iommu = drhd->iommu; + i = iommu->index; + + dmar_writel(iommu->reg, DMAR_FECTL_REG, + (u32) iommu_state[i][DMAR_FECTL_REG]); + dmar_writel(iommu->reg, DMAR_FEDATA_REG, + (u32) iommu_state[i][DMAR_FEDATA_REG]); + dmar_writel(iommu->reg, DMAR_FEADDR_REG, + (u32) iommu_state[i][DMAR_FEADDR_REG]); + dmar_writel(iommu->reg, DMAR_FEUADDR_REG, + (u32) iommu_state[i][DMAR_FEUADDR_REG]); + iommu_enable_translation(iommu); + } +} + +struct iommu_ops intel_iommu_ops = { + .init = intel_iommu_domain_init, + .add_device = intel_iommu_add_device, + .remove_device = intel_iommu_remove_device, + .assignable = intel_iommu_assignable, + .assign_device = intel_iommu_assign_device, + .teardown = iommu_domain_teardown, + .map_page = intel_iommu_map_page, + .unmap_page = intel_iommu_unmap_page, + .reassign_device = reassign_device_ownership, + .get_device_group_id = intel_iommu_group_id, + .update_ire_from_apic = io_apic_write_remap_rte, + .update_ire_from_msi = msi_msg_write_remap_rte, +}; + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -Naurp xen/drivers/passthrough/vtd/iommu.h xen-redhat/drivers/passthrough/vtd/iommu.h --- xen/drivers/passthrough/vtd/iommu.h +++ xen-redhat/drivers/passthrough/vtd/iommu.h @@ -0,0 +1,497 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * + * Copyright (C) Ashok Raj <ashok.raj@intel.com> + */ + +#ifndef _INTEL_IOMMU_H_ +#define _INTEL_IOMMU_H_ + +#include <xen/types.h> + +/* + * Intel IOMMU register specification per version 1.0 public spec. + */ + +#define DMAR_VER_REG 0x0 /* Arch version supported by this IOMMU */ +#define DMAR_CAP_REG 0x8 /* Hardware supported capabilities */ +#define DMAR_ECAP_REG 0x10 /* Extended capabilities supported */ +#define DMAR_GCMD_REG 0x18 /* Global command register */ +#define DMAR_GSTS_REG 0x1c /* Global status register */ +#define DMAR_RTADDR_REG 0x20 /* Root entry table */ +#define DMAR_CCMD_REG 0x28 /* Context command reg */ +#define DMAR_FSTS_REG 0x34 /* Fault Status register */ +#define DMAR_FECTL_REG 0x38 /* Fault control register */ +#define DMAR_FEDATA_REG 0x3c /* Fault event interrupt data register */ +#define DMAR_FEADDR_REG 0x40 /* Fault event interrupt addr register */ +#define DMAR_FEUADDR_REG 0x44 /* Upper address register */ +#define DMAR_AFLOG_REG 0x58 /* Advanced Fault control */ +#define DMAR_PMEN_REG 0x64 /* Enable Protected Memory Region */ +#define DMAR_PLMBASE_REG 0x68 /* PMRR Low addr */ +#define DMAR_PLMLIMIT_REG 0x6c /* PMRR low limit */ +#define DMAR_PHMBASE_REG 0x70 /* pmrr high base addr */ +#define DMAR_PHMLIMIT_REG 0x78 /* pmrr high limit */ +#define DMAR_IQH_REG 0x80 /* invalidation queue head */ +#define DMAR_IQT_REG 0x88 /* invalidation queue tail */ +#define DMAR_IQA_REG 0x90 /* invalidation queue addr */ +#define DMAR_IRTA_REG 0xB8 /* intr remap */ + +#define OFFSET_STRIDE (9) +#define dmar_readl(dmar, reg) readl(dmar + reg) +#define dmar_writel(dmar, reg, val) writel(val, dmar + reg) +#define dmar_readq(dmar, reg) ({ \ + u32 lo, hi; \ + lo = dmar_readl(dmar, reg); \ + hi = dmar_readl(dmar, reg + 4); \ + (((u64) hi) << 32) + lo; }) +#define dmar_writeq(dmar, reg, val) do {\ + dmar_writel(dmar, reg, (u32)val); \ + dmar_writel(dmar, reg + 4, (u32)((u64) val >> 32)); \ + } while (0) + +#define VER_MAJOR(v) (((v) & 0xf0) >> 4) +#define VER_MINOR(v) ((v) & 0x0f) + +/* + * Decoding Capability Register + */ +#define cap_read_drain(c) (((c) >> 55) & 1) +#define cap_write_drain(c) (((c) >> 54) & 1) +#define cap_max_amask_val(c) (((c) >> 48) & 0x3f) +#define cap_num_fault_regs(c) ((((c) >> 40) & 0xff) + 1) +#define cap_pgsel_inv(c) (((c) >> 39) & 1) + +#define cap_super_page_val(c) (((c) >> 34) & 0xf) +#define cap_super_offset(c) (((find_first_bit(&cap_super_page_val(c), 4)) \ + * OFFSET_STRIDE) + 21) + +#define cap_fault_reg_offset(c) ((((c) >> 24) & 0x3ff) * 16) + +#define cap_isoch(c) (((c) >> 23) & 1) +#define cap_qos(c) (((c) >> 22) & 1) +#define cap_mgaw(c) ((((c) >> 16) & 0x3f) + 1) +#define cap_sagaw(c) (((c) >> 8) & 0x1f) +#define cap_caching_mode(c) (((c) >> 7) & 1) +#define cap_phmr(c) (((c) >> 6) & 1) +#define cap_plmr(c) (((c) >> 5) & 1) +#define cap_rwbf(c) (((c) >> 4) & 1) +#define cap_afl(c) (((c) >> 3) & 1) +#define cap_ndoms(c) (1 << (4 + 2 * ((c) & 0x7))) + +/* + * Extended Capability Register + */ + +#define ecap_niotlb_iunits(e) ((((e) >> 24) & 0xff) + 1) +#define ecap_iotlb_offset(e) ((((e) >> 8) & 0x3ff) * 16) +#define ecap_coherent(e) ((e >> 0) & 0x1) +#define ecap_queued_inval(e) ((e >> 1) & 0x1) +#define ecap_dev_iotlb(e) ((e >> 2) & 0x1) +#define ecap_intr_remap(e) ((e >> 3) & 0x1) +#define ecap_ext_intr(e) ((e >> 4) & 0x1) +#define ecap_cache_hints(e) ((e >> 5) & 0x1) +#define ecap_pass_thru(e) ((e >> 6) & 0x1) +#define ecap_snp_ctl(e) ((e >> 7) & 0x1) + +/* IOTLB_REG */ +#define DMA_TLB_FLUSH_GRANU_OFFSET 60 +#define 
DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60) +#define DMA_TLB_DSI_FLUSH (((u64)2) << 60) +#define DMA_TLB_PSI_FLUSH (((u64)3) << 60) +#define DMA_TLB_IIRG(x) (((x) >> 60) & 7) +#define DMA_TLB_IAIG(val) (((val) >> 57) & 7) +#define DMA_TLB_DID(x) (((u64)(x & 0xffff)) << 32) + +#define DMA_TLB_READ_DRAIN (((u64)1) << 49) +#define DMA_TLB_WRITE_DRAIN (((u64)1) << 48) +#define DMA_TLB_IVT (((u64)1) << 63) + +#define DMA_TLB_IVA_ADDR(x) ((((u64)x) >> 12) << 12) +#define DMA_TLB_IVA_HINT(x) ((((u64)x) & 1) << 6) + +/* GCMD_REG */ +#define DMA_GCMD_TE (((u64)1) << 31) +#define DMA_GCMD_SRTP (((u64)1) << 30) +#define DMA_GCMD_SFL (((u64)1) << 29) +#define DMA_GCMD_EAFL (((u64)1) << 28) +#define DMA_GCMD_WBF (((u64)1) << 27) +#define DMA_GCMD_QIE (((u64)1) << 26) +#define DMA_GCMD_IRE (((u64)1) << 25) +#define DMA_GCMD_SIRTP (((u64)1) << 24) +#define DMA_GCMD_CFI (((u64)1) << 23) + +/* GSTS_REG */ +#define DMA_GSTS_TES (((u64)1) << 31) +#define DMA_GSTS_RTPS (((u64)1) << 30) +#define DMA_GSTS_FLS (((u64)1) << 29) +#define DMA_GSTS_AFLS (((u64)1) << 28) +#define DMA_GSTS_WBFS (((u64)1) << 27) +#define DMA_GSTS_QIES (((u64)1) <<26) +#define DMA_GSTS_IRES (((u64)1) <<25) +#define DMA_GSTS_SIRTPS (((u64)1) << 24) +#define DMA_GSTS_CFIS (((u64)1) <<23) + +/* PMEN_REG */ +#define DMA_PMEN_EPM (((u32)1) << 31) +#define DMA_PMEN_PRS (((u32)1) << 0) + +/* CCMD_REG */ +#define DMA_CCMD_INVL_GRANU_OFFSET 61 +#define DMA_CCMD_ICC (((u64)1) << 63) +#define DMA_CCMD_GLOBAL_INVL (((u64)1) << 61) +#define DMA_CCMD_DOMAIN_INVL (((u64)2) << 61) +#define DMA_CCMD_DEVICE_INVL (((u64)3) << 61) +#define DMA_CCMD_FM(m) (((u64)((m) & 0x3)) << 32) +#define DMA_CCMD_CIRG(x) ((((u64)3) << 61) & x) +#define DMA_CCMD_MASK_NOBIT 0 +#define DMA_CCMD_MASK_1BIT 1 +#define DMA_CCMD_MASK_2BIT 2 +#define DMA_CCMD_MASK_3BIT 3 +#define DMA_CCMD_SID(s) (((u64)((s) & 0xffff)) << 16) +#define DMA_CCMD_DID(d) ((u64)((d) & 0xffff)) + +#define DMA_CCMD_CAIG_MASK(x) (((u64)x) & ((u64) 0x3 << 59)) + +/* FECTL_REG */ +#define DMA_FECTL_IM (((u64)1) << 31) + +/* FSTS_REG */ +#define DMA_FSTS_PFO ((u64)1 << 0) +#define DMA_FSTS_PPF ((u64)1 << 1) +#define DMA_FSTS_AFO ((u64)1 << 2) +#define DMA_FSTS_APF ((u64)1 << 3) +#define DMA_FSTS_IQE ((u64)1 << 4) +#define DMA_FSTS_ICE ((u64)1 << 5) +#define DMA_FSTS_ITE ((u64)1 << 6) +#define DMA_FSTS_FAULTS DMA_FSTS_PFO | DMA_FSTS_PPF | DMA_FSTS_AFO | DMA_FSTS_APF | DMA_FSTS_IQE | DMA_FSTS_ICE | DMA_FSTS_ITE +#define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff) + +/* FRCD_REG, 32 bits access */ +#define DMA_FRCD_F (((u64)1) << 31) +#define dma_frcd_type(d) ((d >> 30) & 1) +#define dma_frcd_fault_reason(c) (c & 0xff) +#define dma_frcd_source_id(c) (c & 0xffff) +#define dma_frcd_page_addr(d) (d & (((u64)-1) << 12)) /* low 64 bit */ + +/* + * 0: Present + * 1-11: Reserved + * 12-63: Context Ptr (12 - (haw-1)) + * 64-127: Reserved + */ +struct root_entry { + u64 val; + u64 rsvd1; +}; +#define root_present(root) ((root).val & 1) +#define set_root_present(root) do {(root).val |= 1;} while(0) +#define get_context_addr(root) ((root).val & PAGE_MASK_4K) +#define set_root_value(root, value) \ + do {(root).val |= ((value) & PAGE_MASK_4K);} while(0) + +struct context_entry { + u64 lo; + u64 hi; +}; +#define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry)) +#define context_present(c) ((c).lo & 1) +#define context_fault_disable(c) (((c).lo >> 1) & 1) +#define context_translation_type(c) (((c).lo >> 2) & 3) +#define context_address_root(c) ((c).lo & PAGE_MASK_4K) +#define context_address_width(c) ((c).hi & 7) 
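/*
 * Editor's illustrative aside, not part of the original patch: how the
 * root- and context-entry accessors above chain together for one lookup.
 * The helper name and the flat table parameters are hypothetical; the
 * real code maps each table with map_vtd_domain_page() first (see
 * print_vtd_entries() in utils.c).
 */
static inline int example_ctx_lookup(struct root_entry *root_table,
                                     struct context_entry *ctx_table,
                                     unsigned int bus, unsigned int devfn,
                                     u64 *pgtable_maddr, int *levels)
{
    if ( !root_present(root_table[bus]) || !context_present(ctx_table[devfn]) )
        return 0;
    *pgtable_maddr = context_address_root(ctx_table[devfn]);
    *levels = (int)context_address_width(ctx_table[devfn]) + 2; /* agaw_to_level() */
    return 1;
}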
+#define context_domain_id(c) (((c).hi >> 8) & ((1 << 16) - 1))
+
+#define context_set_present(c) do {(c).lo |= 1;} while(0)
+#define context_clear_present(c) do {(c).lo &= ~1;} while(0)
+#define context_set_fault_enable(c) \
+    do {(c).lo &= (((u64)-1) << 2) | 1;} while(0)
+
+#define context_set_translation_type(c, val) do { \
+        (c).lo &= (((u64)-1) << 4) | 3; \
+        (c).lo |= (val & 3) << 2; \
+    } while(0)
+#define CONTEXT_TT_MULTI_LEVEL 0
+#define CONTEXT_TT_DEV_IOTLB 1
+#define CONTEXT_TT_PASS_THRU 2
+
+#define context_set_address_root(c, val) \
+    do {(c).lo &= 0xfff; (c).lo |= (val) & PAGE_MASK_4K ;} while(0)
+#define context_set_address_width(c, val) \
+    do {(c).hi &= 0xfffffff8; (c).hi |= (val) & 7;} while(0)
+#define context_clear_entry(c) do {(c).lo = 0; (c).hi = 0;} while(0)
+
+/* page table handling */
+#define LEVEL_STRIDE        (9)
+#define LEVEL_MASK          ((1 << LEVEL_STRIDE) - 1)
+#define PTE_NUM             (1 << LEVEL_STRIDE)
+#define level_to_agaw(val)  ((val) - 2)
+#define agaw_to_level(val)  ((val) + 2)
+#define agaw_to_width(val)  (30 + val * LEVEL_STRIDE)
+#define width_to_agaw(w)    ((w - 30)/LEVEL_STRIDE)
+#define level_to_offset_bits(l) (12 + (l - 1) * LEVEL_STRIDE)
+#define address_level_offset(addr, level) \
+    ((addr >> level_to_offset_bits(level)) & LEVEL_MASK)
+#define level_mask(l) (((u64)(-1)) << level_to_offset_bits(l))
+#define level_size(l) (1 << level_to_offset_bits(l))
+#define align_to_level(addr, l) ((addr + level_size(l) - 1) & level_mask(l))
+
+/*
+ * 0: readable
+ * 1: writable
+ * 2-6: reserved
+ * 7: super page
+ * 8-11: available
+ * 12-63: Host physical address
+ */
+struct dma_pte {
+    u64 val;
+};
+#define DMA_PTE_READ (1)
+#define DMA_PTE_WRITE (2)
+#define DMA_PTE_SNP  (1 << 11)
+
+#define dma_clear_pte(p)    do {(p).val = 0;} while(0)
+#define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while(0)
+#define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while(0)
+#define dma_set_pte_superpage(p) do {(p).val |= (1 << 7);} while(0)
+#define dma_set_pte_snp(p)  do {(p).val |= DMA_PTE_SNP;} while(0)
+
+#define dma_set_pte_prot(p, prot) \
+    do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0)
+#define dma_pte_addr(p) ((p).val & PAGE_MASK_4K)
+#define dma_set_pte_addr(p, addr) do {\
+    (p).val |= ((addr) & PAGE_MASK_4K); } while (0)
+#define dma_pte_present(p) (((p).val & 3) != 0)
+
+/* interrupt remap entry */
+struct iremap_entry {
+  union {
+    u64 lo_val;
+    struct {
+        u64 p       : 1,
+            fpd     : 1,
+            dm      : 1,
+            rh      : 1,
+            tm      : 1,
+            dlm     : 3,
+            avail   : 4,
+            res_1   : 4,
+            vector  : 8,
+            res_2   : 8,
+            dst     : 32;
+    }lo;
+  };
+  union {
+    u64 hi_val;
+    struct {
+        u64 sid     : 16,
+            sq      : 2,
+            svt     : 2,
+            res_1   : 44;
+    }hi;
+  };
+};
+
+/* Max intr remapping table page order is 8, as max number of IRTEs is 64K */
+#define IREMAP_PAGE_ORDER  8
+
+/*
+ * VTd engine handles 4K page, while CPU may have different page size on
+ * different arch. E.g. 16K on IPF.
+ */
+#define IREMAP_ARCH_PAGE_ORDER  (IREMAP_PAGE_ORDER + PAGE_SHIFT_4K - PAGE_SHIFT)
+#define IREMAP_ARCH_PAGE_NR     ( IREMAP_ARCH_PAGE_ORDER < 0 ?
\ + 1 : \ + 1 << IREMAP_ARCH_PAGE_ORDER ) + +/* Each entry is 16 bytes, so 2^8 entries per 4K page */ +#define IREMAP_ENTRY_ORDER ( PAGE_SHIFT - 4 ) +#define IREMAP_ENTRY_NR ( 1 << ( IREMAP_PAGE_ORDER + 8 ) ) + +#define iremap_present(v) ((v).lo & 1) +#define iremap_fault_disable(v) (((v).lo >> 1) & 1) + +#define iremap_set_present(v) do {(v).lo |= 1;} while(0) +#define iremap_clear_present(v) do {(v).lo &= ~1;} while(0) + +/* + * Get the intr remap entry: + * maddr - machine addr of the table + * index - index of the entry + * entries - return addr of the page holding this entry, need unmap it + * entry - return required entry + */ +#define GET_IREMAP_ENTRY(maddr, index, entries, entry) \ +do { \ + entries = (struct iremap_entry *)map_vtd_domain_page( \ + (maddr) + (( (index) >> IREMAP_ENTRY_ORDER ) << PAGE_SHIFT ) ); \ + entry = &entries[(index) % (1 << IREMAP_ENTRY_ORDER)]; \ +} while(0) + +/* queue invalidation entry */ +struct qinval_entry { + union { + struct { + u64 lo; + u64 hi; + }val; + struct { + struct { + u64 type : 4, + granu : 2, + res_1 : 10, + did : 16, + sid : 16, + fm : 2, + res_2 : 14; + }lo; + struct { + u64 res; + }hi; + }cc_inv_dsc; + struct { + struct { + u64 type : 4, + granu : 2, + dw : 1, + dr : 1, + res_1 : 8, + did : 16, + res_2 : 32; + }lo; + struct { + u64 am : 6, + ih : 1, + res_1 : 5, + addr : 52; + }hi; + }iotlb_inv_dsc; + struct { + struct { + u64 type : 4, + res_1 : 12, + max_invs_pend: 5, + res_2 : 11, + sid : 16, + res_3 : 16; + }lo; + struct { + u64 size : 1, + res_1 : 11, + addr : 52; + }hi; + }dev_iotlb_inv_dsc; + struct { + struct { + u64 type : 4, + granu : 1, + res_1 : 22, + im : 5, + iidx : 16, + res_2 : 16; + }lo; + struct { + u64 res; + }hi; + }iec_inv_dsc; + struct { + struct { + u64 type : 4, + iflag : 1, + sw : 1, + fn : 1, + res_1 : 25, + sdata : 32; + }lo; + struct { + u64 res_1 : 2, + saddr : 62; + }hi; + }inv_wait_dsc; + }q; +}; + +struct poll_info { + u64 saddr; + u32 udata; +}; + +#define QINVAL_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct qinval_entry)) +#define qinval_present(v) ((v).lo & 1) +#define qinval_fault_disable(v) (((v).lo >> 1) & 1) + +#define qinval_set_present(v) do {(v).lo |= 1;} while(0) +#define qinval_clear_present(v) do {(v).lo &= ~1;} while(0) + +#define RESERVED_VAL 0 + +#define TYPE_INVAL_CONTEXT 0x1 +#define TYPE_INVAL_IOTLB 0x2 +#define TYPE_INVAL_DEVICE_IOTLB 0x3 +#define TYPE_INVAL_IEC 0x4 +#define TYPE_INVAL_WAIT 0x5 + +#define NOTIFY_TYPE_POLL 1 +#define NOTIFY_TYPE_INTR 1 +#define INTERRUTP_FLAG 1 +#define STATUS_WRITE 1 +#define FENCE_FLAG 1 + +#define IEC_GLOBAL_INVL 0 +#define IEC_INDEX_INVL 1 +#define IRTA_REG_EIME_SHIFT 11 +#define IRTA_REG_TABLE_SIZE 7 // 4k page = 256 * 16 byte entries + // 2^^(IRTA_REG_TABLE_SIZE + 1) = 256 + // IRTA_REG_TABLE_SIZE = 7 + +#define VTD_PAGE_TABLE_LEVEL_3 3 +#define VTD_PAGE_TABLE_LEVEL_4 4 + +#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48 +#define MAX_IOMMU_REGS 0xc0 + +extern struct list_head acpi_drhd_units; +extern struct list_head acpi_rmrr_units; +extern struct list_head acpi_ioapic_units; + +struct qi_ctrl { + u64 qinval_maddr; /* queue invalidation page machine address */ + int qinval_index; /* queue invalidation index */ + spinlock_t qinval_lock; /* lock for queue invalidation page */ + spinlock_t qinval_poll_lock; /* lock for queue invalidation poll addr */ + volatile u32 qinval_poll_status; /* used by poll methord to sync */ +}; + +struct ir_ctrl { + u64 iremap_maddr; /* interrupt remap table machine address */ + int iremap_num; /* total num of used interrupt remap entry 
*/ + spinlock_t iremap_lock; /* lock for irq remappping table */ +}; + +struct iommu_flush { + int (*context)(void *iommu, u16 did, u16 source_id, + u8 function_mask, u64 type, int non_present_entry_flush); + int (*iotlb)(void *iommu, u16 did, u64 addr, unsigned int size_order, + u64 type, int non_present_entry_flush); +}; + +struct intel_iommu { + struct qi_ctrl qi_ctrl; + struct ir_ctrl ir_ctrl; + struct iommu_flush flush; +}; + +#endif diff -Naurp xen/drivers/passthrough/vtd/Makefile xen-redhat/drivers/passthrough/vtd/Makefile --- xen/drivers/passthrough/vtd/Makefile +++ xen-redhat/drivers/passthrough/vtd/Makefile @@ -0,0 +1,8 @@ +subdir-$(x86_32) += x86 +subdir-$(x86_64) += x86 + +obj-y += iommu.o +obj-y += dmar.o +obj-y += utils.o +obj-y += qinval.o +obj-y += intremap.o diff -Naurp xen/drivers/passthrough/vtd/qinval.c xen-redhat/drivers/passthrough/vtd/qinval.c --- xen/drivers/passthrough/vtd/qinval.c +++ xen-redhat/drivers/passthrough/vtd/qinval.c @@ -0,0 +1,463 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) Allen Kay <allen.m.kay@intel.com> + * Copyright (C) Xiaohui Xin <xiaohui.xin@intel.com> + */ + + +#include <xen/sched.h> +#include <xen/iommu.h> +#include <xen/time.h> +#include <xen/pci.h> +#include <xen/pci_regs.h> +#include "iommu.h" +#include "dmar.h" +#include "vtd.h" +#include "extern.h" + +static void print_qi_regs(struct iommu *iommu) +{ + u64 val; + + val = dmar_readq(iommu->reg, DMAR_IQA_REG); + printk("DMAR_IQA_REG = %"PRIx64"\n", val); + + val = dmar_readq(iommu->reg, DMAR_IQH_REG); + printk("DMAR_IQH_REG = %"PRIx64"\n", val); + + val = dmar_readq(iommu->reg, DMAR_IQT_REG); + printk("DMAR_IQT_REG = %"PRIx64"\n", val); +} + +static int qinval_next_index(struct iommu *iommu) +{ + u64 val; + val = dmar_readq(iommu->reg, DMAR_IQT_REG); + return (val >> 4); +} + +static int qinval_update_qtail(struct iommu *iommu, int index) +{ + u64 val; + + /* Need an ASSERT to insure that we have got register lock */ + val = (index < (QINVAL_ENTRY_NR-1)) ? 
(index + 1) : 0; + dmar_writeq(iommu->reg, DMAR_IQT_REG, (val << 4)); + return 0; +} + +static int gen_cc_inv_dsc(struct iommu *iommu, int index, + u16 did, u16 source_id, u8 function_mask, u8 granu) +{ + unsigned long flags; + struct qinval_entry *qinval_entry = NULL, *qinval_entries; + struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu); + + spin_lock_irqsave(&qi_ctrl->qinval_lock, flags); + qinval_entries = + (struct qinval_entry *)map_vtd_domain_page(qi_ctrl->qinval_maddr); + qinval_entry = &qinval_entries[index]; + qinval_entry->q.cc_inv_dsc.lo.type = TYPE_INVAL_CONTEXT; + qinval_entry->q.cc_inv_dsc.lo.granu = granu; + qinval_entry->q.cc_inv_dsc.lo.res_1 = 0; + qinval_entry->q.cc_inv_dsc.lo.did = did; + qinval_entry->q.cc_inv_dsc.lo.sid = source_id; + qinval_entry->q.cc_inv_dsc.lo.fm = function_mask; + qinval_entry->q.cc_inv_dsc.lo.res_2 = 0; + qinval_entry->q.cc_inv_dsc.hi.res = 0; + + unmap_vtd_domain_page(qinval_entries); + spin_unlock_irqrestore(&qi_ctrl->qinval_lock, flags); + + return 0; +} + +int queue_invalidate_context(struct iommu *iommu, + u16 did, u16 source_id, u8 function_mask, u8 granu) +{ + int ret = -1; + unsigned long flags; + int index = -1; + + spin_lock_irqsave(&iommu->register_lock, flags); + index = qinval_next_index(iommu); + if ( index == -1 ) + return -EBUSY; + ret = gen_cc_inv_dsc(iommu, index, did, source_id, + function_mask, granu); + ret |= qinval_update_qtail(iommu, index); + spin_unlock_irqrestore(&iommu->register_lock, flags); + return ret; +} + +static int gen_iotlb_inv_dsc(struct iommu *iommu, int index, + u8 granu, u8 dr, u8 dw, u16 did, u8 am, u8 ih, u64 addr) +{ + unsigned long flags; + struct qinval_entry *qinval_entry = NULL, *qinval_entries; + struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu); + + if ( index == -1 ) + return -1; + spin_lock_irqsave(&qi_ctrl->qinval_lock, flags); + + qinval_entries = + (struct qinval_entry *)map_vtd_domain_page(qi_ctrl->qinval_maddr); + qinval_entry = &qinval_entries[index]; + qinval_entry->q.iotlb_inv_dsc.lo.type = TYPE_INVAL_IOTLB; + qinval_entry->q.iotlb_inv_dsc.lo.granu = granu; + qinval_entry->q.iotlb_inv_dsc.lo.dr = 0; + qinval_entry->q.iotlb_inv_dsc.lo.dw = 0; + qinval_entry->q.iotlb_inv_dsc.lo.res_1 = 0; + qinval_entry->q.iotlb_inv_dsc.lo.did = did; + qinval_entry->q.iotlb_inv_dsc.lo.res_2 = 0; + + qinval_entry->q.iotlb_inv_dsc.hi.am = am; + qinval_entry->q.iotlb_inv_dsc.hi.ih = ih; + qinval_entry->q.iotlb_inv_dsc.hi.res_1 = 0; + qinval_entry->q.iotlb_inv_dsc.hi.addr = addr; + + unmap_vtd_domain_page(qinval_entries); + spin_unlock_irqrestore(&qi_ctrl->qinval_lock, flags); + return 0; +} + +int queue_invalidate_iotlb(struct iommu *iommu, + u8 granu, u8 dr, u8 dw, u16 did, u8 am, u8 ih, u64 addr) +{ + int ret = -1; + unsigned long flags; + int index = -1; + + spin_lock_irqsave(&iommu->register_lock, flags); + + index = qinval_next_index(iommu); + ret = gen_iotlb_inv_dsc(iommu, index, granu, dr, dw, did, + am, ih, addr); + ret |= qinval_update_qtail(iommu, index); + spin_unlock_irqrestore(&iommu->register_lock, flags); + return ret; +} + +static int gen_wait_dsc(struct iommu *iommu, int index, + u8 iflag, u8 sw, u8 fn, u32 sdata, volatile u32 *saddr) +{ + unsigned long flags; + struct qinval_entry *qinval_entry = NULL, *qinval_entries; + struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu); + + if ( index == -1 ) + return -1; + spin_lock_irqsave(&qi_ctrl->qinval_lock, flags); + qinval_entries = + (struct qinval_entry *)map_vtd_domain_page(qi_ctrl->qinval_maddr); + qinval_entry = &qinval_entries[index]; + 
qinval_entry->q.inv_wait_dsc.lo.type = TYPE_INVAL_WAIT; + qinval_entry->q.inv_wait_dsc.lo.iflag = iflag; + qinval_entry->q.inv_wait_dsc.lo.sw = sw; + qinval_entry->q.inv_wait_dsc.lo.fn = fn; + qinval_entry->q.inv_wait_dsc.lo.res_1 = 0; + qinval_entry->q.inv_wait_dsc.lo.sdata = sdata; + qinval_entry->q.inv_wait_dsc.hi.res_1 = 0; + qinval_entry->q.inv_wait_dsc.hi.saddr = virt_to_maddr(saddr) >> 2; + unmap_vtd_domain_page(qinval_entries); + spin_unlock_irqrestore(&qi_ctrl->qinval_lock, flags); + return 0; +} + +static int queue_invalidate_wait(struct iommu *iommu, + u8 iflag, u8 sw, u8 fn, u32 sdata, volatile u32 *saddr) +{ + unsigned long flags; + s_time_t start_time; + int index = -1; + int ret = -1; + struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu); + + spin_lock_irqsave(&qi_ctrl->qinval_poll_lock, flags); + spin_lock(&iommu->register_lock); + index = qinval_next_index(iommu); + if ( *saddr == 1 ) + *saddr = 0; + ret = gen_wait_dsc(iommu, index, iflag, sw, fn, sdata, saddr); + ret |= qinval_update_qtail(iommu, index); + spin_unlock(&iommu->register_lock); + + /* Now we don't support interrupt method */ + if ( sw ) + { + /* In case all wait descriptor writes to same addr with same data */ + start_time = NOW(); + while ( *saddr != 1 ) + { + if ( NOW() > (start_time + DMAR_OPERATION_TIMEOUT) ) + { + print_qi_regs(iommu); + panic("queue invalidate wait descriptor was not executed\n"); + } + cpu_relax(); + } + } + spin_unlock_irqrestore(&qi_ctrl->qinval_poll_lock, flags); + return ret; +} + +int invalidate_sync(struct iommu *iommu) +{ + int ret = -1; + struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu); + + if ( qi_ctrl->qinval_maddr != 0 ) + { + ret = queue_invalidate_wait(iommu, + 0, 1, 1, 1, &qi_ctrl->qinval_poll_status); + return ret; + } + return 0; +} + +static int gen_dev_iotlb_inv_dsc(struct iommu *iommu, int index, + u32 max_invs_pend, u16 sid, u16 size, u64 addr) +{ + unsigned long flags; + struct qinval_entry *qinval_entry = NULL, *qinval_entries; + struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu); + + if ( index == -1 ) + return -1; + spin_lock_irqsave(&qi_ctrl->qinval_lock, flags); + + qinval_entries = + (struct qinval_entry *)map_vtd_domain_page(qi_ctrl->qinval_maddr); + qinval_entry = &qinval_entries[index]; + qinval_entry->q.dev_iotlb_inv_dsc.lo.type = TYPE_INVAL_DEVICE_IOTLB; + qinval_entry->q.dev_iotlb_inv_dsc.lo.res_1 = 0; + qinval_entry->q.dev_iotlb_inv_dsc.lo.max_invs_pend = max_invs_pend; + qinval_entry->q.dev_iotlb_inv_dsc.lo.res_2 = 0; + qinval_entry->q.dev_iotlb_inv_dsc.lo.sid = sid; + qinval_entry->q.dev_iotlb_inv_dsc.lo.res_3 = 0; + + qinval_entry->q.dev_iotlb_inv_dsc.hi.size = size; + qinval_entry->q.dev_iotlb_inv_dsc.hi.res_1 = 0; + qinval_entry->q.dev_iotlb_inv_dsc.hi.addr = addr >> PAGE_SHIFT_4K; + + unmap_vtd_domain_page(qinval_entries); + spin_unlock_irqrestore(&qi_ctrl->qinval_lock, flags); + return 0; +} + +int qinval_device_iotlb(struct iommu *iommu, + u32 max_invs_pend, u16 sid, u16 size, u64 addr) +{ + int ret = -1; + unsigned long flags; + int index = -1; + + spin_lock_irqsave(&iommu->register_lock, flags); + index = qinval_next_index(iommu); + ret = gen_dev_iotlb_inv_dsc(iommu, index, max_invs_pend, + sid, size, addr); + ret |= qinval_update_qtail(iommu, index); + spin_unlock_irqrestore(&iommu->register_lock, flags); + return ret; +} + +static int gen_iec_inv_dsc(struct iommu *iommu, int index, + u8 granu, u8 im, u16 iidx) +{ + unsigned long flags; + struct qinval_entry *qinval_entry = NULL, *qinval_entries; + struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu); + + 
if ( index == -1 ) + return -1; + spin_lock_irqsave(&qi_ctrl->qinval_lock, flags); + + qinval_entries = + (struct qinval_entry *)map_vtd_domain_page(qi_ctrl->qinval_maddr); + qinval_entry = &qinval_entries[index]; + qinval_entry->q.iec_inv_dsc.lo.type = TYPE_INVAL_IEC; + qinval_entry->q.iec_inv_dsc.lo.granu = granu; + qinval_entry->q.iec_inv_dsc.lo.res_1 = 0; + qinval_entry->q.iec_inv_dsc.lo.im = im; + qinval_entry->q.iec_inv_dsc.lo.iidx = iidx; + qinval_entry->q.iec_inv_dsc.lo.res_2 = 0; + qinval_entry->q.iec_inv_dsc.hi.res = 0; + + unmap_vtd_domain_page(qinval_entries); + spin_unlock_irqrestore(&qi_ctrl->qinval_lock, flags); + return 0; +} + +int queue_invalidate_iec(struct iommu *iommu, u8 granu, u8 im, u16 iidx) +{ + int ret; + unsigned long flags; + int index = -1; + + spin_lock_irqsave(&iommu->register_lock, flags); + index = qinval_next_index(iommu); + ret = gen_iec_inv_dsc(iommu, index, granu, im, iidx); + ret |= qinval_update_qtail(iommu, index); + spin_unlock_irqrestore(&iommu->register_lock, flags); + return ret; +} + +int __iommu_flush_iec(struct iommu *iommu, u8 granu, u8 im, u16 iidx) +{ + int ret; + ret = queue_invalidate_iec(iommu, granu, im, iidx); + ret |= invalidate_sync(iommu); + + /* + * reading vt-d architecture register will ensure + * draining happens in implementation independent way. + */ + (void)dmar_readq(iommu->reg, DMAR_CAP_REG); + return ret; +} + +int iommu_flush_iec_global(struct iommu *iommu) +{ + return __iommu_flush_iec(iommu, IEC_GLOBAL_INVL, 0, 0); +} + +int iommu_flush_iec_index(struct iommu *iommu, u8 im, u16 iidx) +{ + return __iommu_flush_iec(iommu, IEC_INDEX_INVL, im, iidx); +} + +static int flush_context_qi( + void *_iommu, u16 did, u16 sid, u8 fm, u64 type, + int non_present_entry_flush) +{ + int ret = 0; + struct iommu *iommu = (struct iommu *)_iommu; + struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu); + + /* + * In the non-present entry flush case, if hardware doesn't cache + * non-present entry we do nothing and if hardware cache non-present + * entry, we flush entries of domain 0 (the domain id is used to cache + * any non-present entries) + */ + if ( non_present_entry_flush ) + { + if ( !cap_caching_mode(iommu->cap) ) + return 1; + else + did = 0; + } + + if ( qi_ctrl->qinval_maddr != 0 ) + { + ret = queue_invalidate_context(iommu, did, sid, fm, + type >> DMA_CCMD_INVL_GRANU_OFFSET); + ret |= invalidate_sync(iommu); + } + return ret; +} + +static int flush_iotlb_qi( + void *_iommu, u16 did, + u64 addr, unsigned int size_order, u64 type, + int non_present_entry_flush) +{ + u8 dr = 0, dw = 0; + int ret = 0; + struct iommu *iommu = (struct iommu *)_iommu; + struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu); + + /* + * In the non-present entry flush case, if hardware doesn't cache + * non-present entry we do nothing and if hardware cache non-present + * entry, we flush entries of domain 0 (the domain id is used to cache + * any non-present entries) + */ + if ( non_present_entry_flush ) + { + if ( !cap_caching_mode(iommu->cap) ) + return 1; + else + did = 0; + } + + if ( qi_ctrl->qinval_maddr != 0 ) + { + /* use queued invalidation */ + if (cap_write_drain(iommu->cap)) + dw = 1; + if (cap_read_drain(iommu->cap)) + dr = 1; + /* Need to conside the ih bit later */ + ret = queue_invalidate_iotlb(iommu, + (type >> DMA_TLB_FLUSH_GRANU_OFFSET), dr, + dw, did, (u8)size_order, 0, addr); + ret |= invalidate_sync(iommu); + } + return ret; +} + +int qinval_setup(struct iommu *iommu) +{ + s_time_t start_time; + struct qi_ctrl *qi_ctrl; + struct iommu_flush 
*flush;
+
+    qi_ctrl = iommu_qi_ctrl(iommu);
+    flush = iommu_get_flush(iommu);
+
+    if ( !ecap_queued_inval(iommu->ecap) )
+        return -ENODEV;
+
+    if ( qi_ctrl->qinval_maddr == 0 )
+    {
+        qi_ctrl->qinval_maddr = alloc_pgtable_maddr(NULL);
+        if ( qi_ctrl->qinval_maddr == 0 )
+        {
+            dprintk(XENLOG_WARNING VTDPREFIX,
+                    "Cannot allocate memory for qi_ctrl->qinval_maddr\n");
+            return -ENOMEM;
+        }
+        flush->context = flush_context_qi;
+        flush->iotlb = flush_iotlb_qi;
+    }
+
+    /* Set up the Invalidation Queue Address (IQA) register with the
+     * address of the page we just allocated.  The QS field at
+     * bits[2:0] indicates the queue size; one 4KB page gives
+     * 256 entries.  The Queue Head (IQH) and Queue Tail (IQT)
+     * registers are automatically reset to 0 by a write
+     * to the IQA register.
+     */
+    dmar_writeq(iommu->reg, DMAR_IQA_REG, qi_ctrl->qinval_maddr);
+
+    /* enable queued invalidation hardware */
+    iommu->gcmd |= DMA_GCMD_QIE;
+    dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
+
+    /* Make sure hardware complete it */
+    start_time = NOW();
+    while ( !(dmar_readl(iommu->reg, DMAR_GSTS_REG) & DMA_GSTS_QIES) )
+    {
+        if ( NOW() > (start_time + DMAR_OPERATION_TIMEOUT) )
+            panic("Cannot set QIE field for queue invalidation\n");
+        cpu_relax();
+    }
+
+    return 0;
+}
diff -Naurp xen/drivers/passthrough/vtd/utils.c xen-redhat/drivers/passthrough/vtd/utils.c
--- xen/drivers/passthrough/vtd/utils.c
+++ xen-redhat/drivers/passthrough/vtd/utils.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Allen Kay <allen.m.kay@intel.com>
+ */
+
+#include <xen/sched.h>
+#include <xen/delay.h>
+#include <xen/iommu.h>
+#include <xen/time.h>
+#include <xen/pci.h>
+#include <xen/pci_regs.h>
+#include "iommu.h"
+#include "dmar.h"
+#include "vtd.h"
+#include "extern.h"
+
+int is_usb_device(u8 bus, u8 devfn)
+{
+    u16 class = pci_conf_read16(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+                                PCI_CLASS_DEVICE);
+    return (class == 0xc03);
+}
+
+/* Disable vt-d protected memory registers.
*/ +void disable_pmr(struct iommu *iommu) +{ + s_time_t start_time; + unsigned int val; + + val = dmar_readl(iommu->reg, DMAR_PMEN_REG); + if ( !(val & DMA_PMEN_PRS) ) + return; + + dmar_writel(iommu->reg, DMAR_PMEN_REG, val & ~DMA_PMEN_EPM); + start_time = NOW(); + + for ( ; ; ) + { + val = dmar_readl(iommu->reg, DMAR_PMEN_REG); + if ( (val & DMA_PMEN_PRS) == 0 ) + break; + + if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT ) + panic("Disable PMRs timeout\n"); + + cpu_relax(); + } + + dprintk(XENLOG_INFO VTDPREFIX, + "Disabled protected memory registers\n"); +} + +void print_iommu_regs(struct acpi_drhd_unit *drhd) +{ + struct iommu *iommu = drhd->iommu; + + printk("---- print_iommu_regs ----\n"); + printk("print_iommu_regs: drhd->address = %"PRIx64"\n", drhd->address); + printk("print_iommu_regs: DMAR_VER_REG = %x\n", + dmar_readl(iommu->reg,DMAR_VER_REG)); + printk("print_iommu_regs: DMAR_CAP_REG = %"PRIx64"\n", + dmar_readq(iommu->reg,DMAR_CAP_REG)); + printk("print_iommu_regs: n_fault_reg = %"PRIx64"\n", + cap_num_fault_regs(dmar_readq(iommu->reg, DMAR_CAP_REG))); + printk("print_iommu_regs: fault_recording_offset_l = %"PRIx64"\n", + cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG))); + printk("print_iommu_regs: fault_recording_offset_h = %"PRIx64"\n", + cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG)) + 8); + printk("print_iommu_regs: fault_recording_reg_l = %"PRIx64"\n", + dmar_readq(iommu->reg, + cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG)))); + printk("print_iommu_regs: fault_recording_reg_h = %"PRIx64"\n", + dmar_readq(iommu->reg, + cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG)) + 8)); + printk("print_iommu_regs: DMAR_ECAP_REG = %"PRIx64"\n", + dmar_readq(iommu->reg,DMAR_ECAP_REG)); + printk("print_iommu_regs: DMAR_GCMD_REG = %x\n", + dmar_readl(iommu->reg,DMAR_GCMD_REG)); + printk("print_iommu_regs: DMAR_GSTS_REG = %x\n", + dmar_readl(iommu->reg,DMAR_GSTS_REG)); + printk("print_iommu_regs: DMAR_RTADDR_REG = %"PRIx64"\n", + dmar_readq(iommu->reg,DMAR_RTADDR_REG)); + printk("print_iommu_regs: DMAR_CCMD_REG = %"PRIx64"\n", + dmar_readq(iommu->reg,DMAR_CCMD_REG)); + printk("print_iommu_regs: DMAR_FSTS_REG = %x\n", + dmar_readl(iommu->reg,DMAR_FSTS_REG)); + printk("print_iommu_regs: DMAR_FECTL_REG = %x\n", + dmar_readl(iommu->reg,DMAR_FECTL_REG)); + printk("print_iommu_regs: DMAR_FEDATA_REG = %x\n", + dmar_readl(iommu->reg,DMAR_FEDATA_REG)); + printk("print_iommu_regs: DMAR_FEADDR_REG = %x\n", + dmar_readl(iommu->reg,DMAR_FEADDR_REG)); + printk("print_iommu_regs: DMAR_FEUADDR_REG = %x\n", + dmar_readl(iommu->reg,DMAR_FEUADDR_REG)); +} + +u32 get_level_index(unsigned long gmfn, int level) +{ + while ( --level ) + gmfn = gmfn >> LEVEL_STRIDE; + + return gmfn & LEVEL_MASK; +} + +void print_vtd_entries(struct iommu *iommu, int bus, int devfn, u64 gmfn) +{ + struct context_entry *ctxt_entry; + struct root_entry *root_entry; + struct dma_pte pte; + u64 *l; + u32 l_index, level; + + printk("print_vtd_entries: iommu = %p bdf = %x:%x:%x gmfn = %"PRIx64"\n", + iommu, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), gmfn); + + if ( iommu->root_maddr == 0 ) + { + printk(" iommu->root_maddr = 0\n"); + return; + } + + root_entry = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr); + + printk(" root_entry = %p\n", root_entry); + printk(" root_entry[%x] = %"PRIx64"\n", bus, root_entry[bus].val); + if ( !root_present(root_entry[bus]) ) + { + unmap_vtd_domain_page(root_entry); + printk(" root_entry[%x] not present\n", bus); + return; + } + + ctxt_entry = + 
(struct context_entry *)map_vtd_domain_page(root_entry[bus].val); + if ( ctxt_entry == NULL ) + { + unmap_vtd_domain_page(root_entry); + printk(" ctxt_entry == NULL\n"); + return; + } + + printk(" context = %p\n", ctxt_entry); + printk(" context[%x] = %"PRIx64"_%"PRIx64"\n", + devfn, ctxt_entry[devfn].hi, ctxt_entry[devfn].lo); + if ( !context_present(ctxt_entry[devfn]) ) + { + unmap_vtd_domain_page(ctxt_entry); + unmap_vtd_domain_page(root_entry); + printk(" ctxt_entry[%x] not present\n", devfn); + return; + } + + level = agaw_to_level(context_address_width(ctxt_entry[devfn])); + if ( level != VTD_PAGE_TABLE_LEVEL_3 && + level != VTD_PAGE_TABLE_LEVEL_4) + { + unmap_vtd_domain_page(ctxt_entry); + unmap_vtd_domain_page(root_entry); + printk("Unsupported VTD page table level (%d)!\n", level); + } + + l = maddr_to_virt(ctxt_entry[devfn].lo); + do + { + l = (u64*)(((unsigned long)l >> PAGE_SHIFT_4K) << PAGE_SHIFT_4K); + printk(" l%d = %p\n", level, l); + if ( l == NULL ) + { + unmap_vtd_domain_page(ctxt_entry); + unmap_vtd_domain_page(root_entry); + printk(" l%d == NULL\n", level); + break; + } + l_index = get_level_index(gmfn, level); + printk(" l%d_index = %x\n", level, l_index); + printk(" l%d[%x] = %"PRIx64"\n", level, l_index, l[l_index]); + + pte.val = l[l_index]; + if ( !dma_pte_present(pte) ) + { + unmap_vtd_domain_page(ctxt_entry); + unmap_vtd_domain_page(root_entry); + printk(" l%d[%x] not present\n", level, l_index); + break; + } + + l = maddr_to_virt(l[l_index]); + } while ( --level ); +} + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */ diff -Naurp xen/drivers/passthrough/vtd/vtd.h xen-redhat/drivers/passthrough/vtd/vtd.h --- xen/drivers/passthrough/vtd/vtd.h +++ xen-redhat/drivers/passthrough/vtd/vtd.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) Allen Kay <allen.m.kay@intel.com> + * Copyright (C) Weidong Han <weidong.han@intel.com> + */ + +#ifndef _VTD_H_ +#define _VTD_H_ +
+#include <xen/iommu.h> + +/* Accommodate both IOAPIC and IOSAPIC.
*/ +struct IO_xAPIC_route_entry { + __u32 vector : 8, + delivery_mode : 3, /* 000: FIXED + * 001: lowest prio + * 111: ExtINT + */ + dest_mode : 1, /* 0: physical, 1: logical */ + delivery_status : 1, + polarity : 1, + irr : 1, + trigger : 1, /* 0: edge, 1: level */ + mask : 1, /* 0: enabled, 1: disabled */ + __reserved_2 : 15; + + union { + struct { __u32 + __reserved_1 : 24, + physical_dest : 4, + __reserved_2 : 4; + } physical; + + struct { __u32 + __reserved_1 : 24, + logical_dest : 8; + } logical; + +#ifdef __ia64__ + struct { __u32 + __reserved_1 : 16, + dest_id : 16; + }; +#endif + } dest; + +} __attribute__ ((packed)); + +struct IO_APIC_route_remap_entry { + union { + u64 val; + struct { + u64 vector:8, + delivery_mode:3, + index_15:1, + delivery_status:1, + polarity:1, + irr:1, + trigger:1, + mask:1, + reserved:31, + format:1, + index_0_14:15; + }; + }; +}; + +struct msi_msg_remap_entry { + union { + u32 val; + struct { + u32 dontcare:2, + index_15:1, + SHV:1, + format:1, + index_0_14:15, + addr_id_val:12; /* Interrupt address identifier value, + must be 0FEEh */ + }; + } address_lo; /* low 32 bits of msi message address */ + + u32 address_hi; /* high 32 bits of msi message address */ + u32 data; /* msi message data */ +}; + +unsigned int get_cache_line_size(void); +void cacheline_flush(char *); +void flush_all_cache(void); +void *map_to_nocache_virt(int nr_iommus, u64 maddr); +u64 alloc_pgtable_maddr(struct domain *d); +void free_pgtable_maddr(u64 maddr); +void *map_vtd_domain_page(u64 maddr); +void unmap_vtd_domain_page(void *va); + +void iommu_flush_cache_entry(void *addr); +void iommu_flush_cache_page(void *addr); + +#endif // _VTD_H_ diff -Naurp xen/drivers/passthrough/vtd/x86/Makefile xen-redhat/drivers/passthrough/vtd/x86/Makefile --- xen/drivers/passthrough/vtd/x86/Makefile +++ xen-redhat/drivers/passthrough/vtd/x86/Makefile @@ -0,0 +1 @@ +obj-y += vtd.o diff -Naurp xen/drivers/passthrough/vtd/x86/vtd.c xen-redhat/drivers/passthrough/vtd/x86/vtd.c --- xen/drivers/passthrough/vtd/x86/vtd.c +++ xen-redhat/drivers/passthrough/vtd/x86/vtd.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2008, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * + * Copyright (C) Allen Kay <allen.m.kay@intel.com> + * Copyright (C) Weidong Han <weidong.han@intel.com> + */ + +#include <xen/sched.h> +#include <xen/domain_page.h> +#include <asm/paging.h> +#include <xen/iommu.h> +#include <xen/numa.h> +#include "../iommu.h" +#include "../dmar.h" +#include "../vtd.h" + +void *map_vtd_domain_page(u64 maddr) +{ + return map_domain_page(maddr >> PAGE_SHIFT_4K); +} + +void unmap_vtd_domain_page(void *va) +{ + unmap_domain_page(va); +} + +/* Allocate page table, return its machine address */ +u64 alloc_pgtable_maddr(struct domain *d) +{ + struct page_info *pg; + u64 *vaddr; + unsigned long mfn; + + pg = alloc_domheap_page(NULL); + if ( !pg ) + return 0; + mfn = page_to_mfn(pg); + vaddr = map_domain_page(mfn); + memset(vaddr, 0, PAGE_SIZE); + + iommu_flush_cache_page(vaddr); + unmap_domain_page(vaddr); + + return (u64)mfn << PAGE_SHIFT_4K; +} + +void free_pgtable_maddr(u64 maddr) +{ + if ( maddr != 0 ) + free_domheap_page(maddr_to_page(maddr)); +} + +unsigned int get_cache_line_size(void) +{ + return ((cpuid_ebx(1) >> 8) & 0xff) * 8; +} + +void cacheline_flush(char * addr) +{ + clflush(addr); +} + +void flush_all_cache() +{ + wbinvd(); +} + +void *map_to_nocache_virt(int nr_iommus, u64 maddr) +{ + set_fixmap_nocache(FIX_IOMMU_REGS_BASE_0 + nr_iommus, maddr); + return (void *)fix_to_virt(FIX_IOMMU_REGS_BASE_0 + nr_iommus); +} + +struct hvm_irq_dpci *domain_get_irq_dpci(struct domain *domain) +{ + if ( !domain ) + return NULL; + + return domain->arch.hvm_domain.irq.dpci; +} + +int domain_set_irq_dpci(struct domain *domain, struct hvm_irq_dpci *dpci) +{ + if ( !domain || !dpci ) + return 0; + + domain->arch.hvm_domain.irq.dpci = dpci; + return 1; +} + +void hvm_dpci_isairq_eoi(struct domain *d, unsigned int isairq) +{ + struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; + struct hvm_irq_dpci *dpci = NULL; + struct dev_intx_gsi_link *digl, *tmp; + int i; + + ASSERT(isairq < NR_ISAIRQS); + if ( !vtd_enabled) + return; + + spin_lock(&d->event_lock); + + dpci = domain_get_irq_dpci(d); + + if ( !dpci || !test_bit(isairq, dpci->isairq_map) ) + { + spin_unlock(&d->event_lock); + return; + } + /* Multiple mirq may be mapped to one isa irq */ + for ( i = find_first_bit(dpci->mapping, NR_IRQS); + i < NR_IRQS; + i = find_next_bit(dpci->mapping, NR_IRQS, i + 1) ) + { + list_for_each_entry_safe ( digl, tmp, + &dpci->mirq[i].digl_list, list ) + { + if ( hvm_irq->pci_link.route[digl->link] == isairq ) + { + hvm_pci_intx_deassert(d, digl->device, digl->intx); + if ( --dpci->mirq[i].pending == 0 ) + { + stop_timer(&dpci->hvm_timer[domain_irq_to_vector(d, i)]); + pirq_guest_eoi(d, i); + } + } + } + } + spin_unlock(&d->event_lock); +} diff -Naurp xen/drivers/pci/Makefile xen-redhat/drivers/pci/Makefile --- xen/drivers/pci/Makefile +++ xen-redhat/drivers/pci/Makefile @@ -0,0 +1 @@ +obj-y += pci.o diff -Naurp xen/drivers/pci/pci.c xen-redhat/drivers/pci/pci.c --- xen/drivers/pci/pci.c +++ xen-redhat/drivers/pci/pci.c @@ -0,0 +1,64 @@ +/****************************************************************************** + * pci.c + * + * Architecture-independent PCI access functions. 
+ */ + +#include <xen/pci.h> +#include <xen/pci_regs.h> + +int pci_find_cap_offset(u8 bus, u8 dev, u8 func, u8 cap) +{ + u8 id; + int max_cap = 48; + u8 pos = PCI_CAPABILITY_LIST; + u16 status; + + status = pci_conf_read16(bus, dev, func, PCI_STATUS); + if ( (status & PCI_STATUS_CAP_LIST) == 0 ) + return 0; + + while ( max_cap-- ) + { + pos = pci_conf_read8(bus, dev, func, pos); + if ( pos < 0x40 ) + break; + + pos &= ~3; + id = pci_conf_read8(bus, dev, func, pos + PCI_CAP_LIST_ID); + + if ( id == 0xff ) + break; + else if ( id == cap ) + return pos; + + pos += PCI_CAP_LIST_NEXT; + } + + return 0; +} + +int pci_find_next_cap(u8 bus, unsigned int devfn, u8 pos, int cap) +{ + u8 id; + int ttl = 48; + + while ( ttl-- ) + { + pos = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn), pos); + if ( pos < 0x40 ) + break; + + pos &= ~3; + id = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn), + pos + PCI_CAP_LIST_ID); + + if ( id == 0xff ) + break; + if ( id == cap ) + return pos; + + pos += PCI_CAP_LIST_NEXT; + } + return 0; +} diff -Naurp xen/drivers/video/vesa.c xen-redhat/drivers/video/vesa.c --- xen/drivers/video/vesa.c +++ xen-redhat/drivers/video/vesa.c @@ -146,10 +146,20 @@ void __init vesa_init(void) xfree(text_buf); } -void __init vesa_endboot(void) +void __init vesa_endboot(bool_t keep) { - xpos = 0; - vga_puts = vesa_scroll_puts; + if ( keep ) + { + xpos = 0; + vga_puts = vesa_scroll_puts; + } + else + { + unsigned int i, bpp = (vlfb_info.bits_per_pixel + 7) >> 3; + for ( i = 0; i < vlfb_info.height; i++ ) + memset(lfb + i * vlfb_info.bytes_per_line, 0, + vlfb_info.width * bpp); + } } #if defined(CONFIG_X86) diff -Naurp xen/drivers/video/vga.c xen-redhat/drivers/video/vga.c --- xen/drivers/video/vga.c +++ xen-redhat/drivers/video/vga.c @@ -57,10 +57,10 @@ static unsigned int columns, lines; #ifdef CONFIG_X86_64 void vesa_early_init(void); -void vesa_endboot(void); +void vesa_endboot(bool_t keep); #else #define vesa_early_init() ((void)0) -#define vesa_endboot() ((void)0) +#define vesa_endboot(x) ((void)0) #endif void __init vga_init(void) @@ -99,16 +99,27 @@ void __init vga_init(void) void __init vga_endboot(void) { - if ( vga_puts == vga_noop_puts ) + if ( !vga_console_info.video_type ) return; printk("Xen is %s VGA console.\n", vgacon_keep ? 
"keeping" : "relinquishing"); - vesa_endboot(); - if ( !vgacon_keep ) vga_puts = vga_noop_puts; + + switch ( vga_console_info.video_type ) + { + case XEN_VGATYPE_TEXT_MODE_3: + if ( !vgacon_keep ) + memset(video, 0, columns * lines * 2); + break; + case XEN_VGATYPE_VESA_LFB: + vesa_endboot(vgacon_keep); + break; + default: + BUG(); + } } static void vga_text_puts(const char *s) diff -Naurp xen/include/asm-ia64/bundle.h xen-redhat/include/asm-ia64/bundle.h --- xen/include/asm-ia64/bundle.h +++ xen-redhat/include/asm-ia64/bundle.h @@ -33,6 +33,11 @@ typedef union U_INST64_B9 { struct { unsigned long qp:6, imm20:20, :1, x6:6, :3, i:1, major:4; }; } INST64_B9; +typedef union U_INST64_I18 { + IA64_INST inst; + struct { unsigned long qp:6, imm20:20, y:1, x6:6, x3:3, i:1, major:4; }; +} INST64_I18; + typedef union U_INST64_I19 { IA64_INST inst; struct { unsigned long qp:6, imm20:20, :1, x6:6, x3:3, i:1, major:4; }; @@ -191,6 +196,7 @@ typedef union U_INST64 { INST64_B4 B4; // used in build_hypercall_bundle only INST64_B8 B8; // rfi, bsw.[01] INST64_B9 B9; // break.b + INST64_I18 I18; // nop.i used in build_fpswa_hypercall_bundle only INST64_I19 I19; // used in build_hypercall_bundle only INST64_I26 I26; // mov register to ar (I unit) INST64_I27 I27; // mov immediate to ar (I unit) diff -Naurp xen/include/asm-ia64/config.h xen-redhat/include/asm-ia64/config.h --- xen/include/asm-ia64/config.h +++ xen-redhat/include/asm-ia64/config.h @@ -282,4 +282,6 @@ struct screen_info { }; /* Define CONFIG_PRIVIFY to support privified OS (deprecated). */ #undef CONFIG_PRIVIFY +#define ARCH_CRASH_SAVE_VMCOREINFO + #endif /* _IA64_CONFIG_H_ */ diff -Naurp xen/include/asm-ia64/debugger.h xen-redhat/include/asm-ia64/debugger.h --- xen/include/asm-ia64/debugger.h +++ xen-redhat/include/asm-ia64/debugger.h @@ -56,13 +56,6 @@ show_execution_state(struct cpu_user_reg #ifdef CRASH_DEBUG // crash_debug=y -/* The main trap handlers use these helper macros which include early bail. */ -static inline int debugger_trap_entry( - unsigned int vector, struct cpu_user_regs *regs) -{ - return 0; -} - extern int __trap_to_cdb(struct cpu_user_regs *r); static inline int debugger_trap_fatal( unsigned int vector, struct cpu_user_regs *regs) @@ -73,23 +66,18 @@ static inline int debugger_trap_fatal( #define ____debugger_trap_immediate(b) __asm__ __volatile__ ("break.m "#b"\n") #define __debugger_trap_immediate(b) ____debugger_trap_immediate(b) -#define debugger_trap_immediate() __debugger_trap_immediate(CDB_BREAK_NUM) +#define debugger_trap_immediate() \ +do { \ + if ( gdb_ctx->serhnd >= 0 ) \ + __debugger_trap_immediate(CDB_BREAK_NUM); \ +} while (0) //XXX temporal work around #ifndef CONFIG_SMP #define smp_send_stop() /* nothing */ #endif -#elif defined DOMU_DEBUG -// domu_debug=y -#warning "domu_debug is not implemented yet." -/* The main trap handlers use these helper macros which include early bail. */ -static inline int debugger_trap_entry( - unsigned int vector, struct cpu_user_regs *regs) -{ - return 0; -} - +#else static inline int debugger_trap_fatal( unsigned int vector, struct cpu_user_regs *regs) { @@ -97,22 +85,21 @@ static inline int debugger_trap_fatal( } #define debugger_trap_immediate() ((void)0) -#else -/* The main trap handlers use these helper macros which include early bail. 
*/ +#endif + static inline int debugger_trap_entry( unsigned int vector, struct cpu_user_regs *regs) { - return 0; -} + struct vcpu *v = current; + + if (guest_kernel_mode(regs) && v->domain->debugger_attached) { + domain_pause_for_debugger(); + return 1; + } -static inline int debugger_trap_fatal( - unsigned int vector, struct cpu_user_regs *regs) -{ return 0; } -#define debugger_trap_immediate() ((void)0) -#endif #endif // __ASSEMBLLY__ #endif /* __ASM_DEBUGGER_H__ */ diff -Naurp xen/include/asm-ia64/domain.h xen-redhat/include/asm-ia64/domain.h --- xen/include/asm-ia64/domain.h +++ xen-redhat/include/asm-ia64/domain.h @@ -18,7 +18,6 @@ struct p2m_entry; struct tlb_track; #endif -extern void domain_relinquish_resources(struct domain *); struct vcpu; extern void relinquish_vcpu_resources(struct vcpu *v); extern void vcpu_share_privregs_with_guest(struct vcpu *v); @@ -132,6 +131,19 @@ struct arch_domain { #ifdef CONFIG_XEN_IA64_TLB_TRACK struct tlb_track* tlb_track; #endif + + /* for domctl_destroy_domain continuation */ + enum { + RELRES_not_started, + RELRES_mm_teardown, + RELRES_xen, + RELRES_dom, + RELRES_done, + } relres; + /* Continuable mm_teardown() */ + unsigned long mm_teardown_offset; + /* Continuable domain_relinquish_resources() */ + struct list_head relmem_list; }; #define INT_ENABLE_OFFSET(v) \ (sizeof(vcpu_info_t) * (v)->vcpu_id + \ @@ -180,6 +192,11 @@ struct arch_vcpu { int starting_rid; /* first RID assigned to domain */ int ending_rid; /* one beyond highest RID assigned to domain */ + /* Bitset for debug register use. */ + unsigned int dbg_used; + u64 dbr[IA64_NUM_DBG_REGS]; + u64 ibr[IA64_NUM_DBG_REGS]; + struct thread_struct _thread; // this must be last thash_cb_t vtlb; @@ -188,9 +205,10 @@ struct arch_vcpu { char irq_new_condition; // vpsr.i/vtpr change, check for pending VHPI char hypercall_continuation; + fpswa_ret_t fpswa_ret; /* save return values of FPSWA emulation */ + //for phycial emulation int mode_flags; - fpswa_ret_t fpswa_ret; /* save return values of FPSWA emulation */ struct timer hlt_timer; struct arch_vmx_struct arch_vmx; /* Virtual Machine Extensions */ @@ -216,6 +234,9 @@ int do_perfmon_op(unsigned long cmd, XEN_GUEST_HANDLE(void) arg1, unsigned long arg2); +void +ia64_lazy_load_fpu(struct vcpu *vcpu); + #endif /* __ASM_DOMAIN_H__ */ /* diff -Naurp xen/include/asm-ia64/dom_fw.h xen-redhat/include/asm-ia64/dom_fw.h --- xen/include/asm-ia64/dom_fw.h +++ xen-redhat/include/asm-ia64/dom_fw.h @@ -7,10 +7,13 @@ #include <linux/efi.h> +#define __IA64_XEN_HYPERCALL_DEFAULT 0x1000 +#define __IA64_XEN_HYPERCALL_DEFAULT_STR "0x1000" + /* Portion of guest physical memory space reserved for PAL/SAL/EFI/ACPI data and code. */ #define FW_BASE_PADDR 0x0000UL -#define FW_END_PADDR 0x3000UL +#define FW_END_PADDR 0x8000UL /* This is used to determined the portion of a domain's metaphysical memory space reserved for the hypercall patch table. */ @@ -30,8 +33,8 @@ #define FW_ACPI_END_PADDR 0x2000UL /* Base and end guest physical address of EFI and SAL (non-ACPI) tables. */ -#define FW_TABLES_BASE_PADDR 0x2000UL -#define FW_TABLES_END_PADDR 0x3000UL +#define FW_TABLES_BASE_PADDR 0x4000UL +#define FW_TABLES_END_PADDR 0x8000UL /* Hypercalls number have a low part and a high part. @@ -157,13 +160,21 @@ /* * This is a hypercall number for FPSWA. - * FPSWA hypercall uses 2 bundles for a pseudo-entry-point and a hypercall-patch. + * FPSWA hypercall uses one bundle for a pseudo-entry-point + * and 14 bundles for a hypercall-patch. + * + * 0x500 was used before. 
But that implementation is broken. + * To keep the hypercall ABI, 0x500 is obsoleted and 0x501 is allocated for + * the fpswa hypercall. */ #define FW_HYPERCALL_FPSWA_ENTRY_INDEX 0x90UL #define FW_HYPERCALL_FPSWA_PATCH_INDEX 0x91UL #define FW_HYPERCALL_FPSWA_ENTRY_PADDR FW_HYPERCALL_PADDR(FW_HYPERCALL_FPSWA_ENTRY_INDEX) #define FW_HYPERCALL_FPSWA_PATCH_PADDR FW_HYPERCALL_PADDR(FW_HYPERCALL_FPSWA_PATCH_INDEX) -#define FW_HYPERCALL_FPSWA 0x500UL +#define FW_HYPERCALL_FPSWA_BASE 0x500UL +#define FW_HYPERCALL_FPSWA_BROKEN 0x500UL +#define FW_HYPERCALL_FPSWA 0x501UL +#define FW_HYPERCALL_FPSWA_STR "0x501" /* Set the shared_info base virtual address. */ #define FW_HYPERCALL_SET_SHARED_INFO_VA 0x600UL diff -Naurp xen/include/asm-ia64/grant_table.h xen-redhat/include/asm-ia64/grant_table.h --- xen/include/asm-ia64/grant_table.h +++ xen-redhat/include/asm-ia64/grant_table.h @@ -12,7 +12,7 @@ int create_grant_host_mapping(unsigned l int destroy_grant_host_mapping(unsigned long gpaddr, unsigned long mfn, unsigned int flags); // for grant transfer -void guest_physmap_add_page(struct domain *d, unsigned long gpfn, unsigned long mfn); +int guest_physmap_add_page(struct domain *d, unsigned long gpfn, unsigned long mfn, int order); /* XXX * somewhere appropriate diff -Naurp xen/include/asm-ia64/hvm/vacpi.h xen-redhat/include/asm-ia64/hvm/vacpi.h --- xen/include/asm-ia64/hvm/vacpi.h +++ xen-redhat/include/asm-ia64/hvm/vacpi.h @@ -0,0 +1,55 @@ +/* + * vacpi.h: Virtual ACPI definitions + * + * Copyright (c) 2007, FUJITSU LIMITED + * Kouya Shimura <kouya at jp fujitsu com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + */ + +#ifndef __ASM_IA64_HVM_VACPI_H__ +#define __ASM_IA64_HVM_VACPI_H__ + +#include <public/hvm/ioreq.h> + +#define ACPI_PM1A_EVT_BLK_ADDRESS 0x0000000000001f40 +#define ACPI_PM1A_CNT_BLK_ADDRESS (ACPI_PM1A_EVT_BLK_ADDRESS + 0x04) +#define ACPI_PM_TMR_BLK_ADDRESS (ACPI_PM1A_EVT_BLK_ADDRESS + 0x08) + +#define IS_ACPI_ADDR(X) ((unsigned long)((X)-ACPI_PM1A_EVT_BLK_ADDRESS)<12) + +#define FREQUENCE_PMTIMER 3579545UL /* Timer should run at 3.579545 MHz */ + +struct vacpi_regs { + union { + struct { + uint32_t pm1a_sts:16; + uint32_t pm1a_en:16; + }; + uint32_t evt_blk; + }; + uint32_t tmr_val; +}; + +struct vacpi { + struct vacpi_regs regs; + s_time_t last_gtime; + struct timer timer; +}; + +int vacpi_intercept(ioreq_t * p, u64 * val); +void vacpi_init(struct domain *d); +void vacpi_relinquish_resources(struct domain *d); + +#endif /* __ASM_IA64_HVM_VACPI_H__ */ diff -Naurp xen/include/asm-ia64/linux/asm/sn/pcidev.h xen-redhat/include/asm-ia64/linux/asm/sn/pcidev.h --- xen/include/asm-ia64/linux/asm/sn/pcidev.h +++ xen-redhat/include/asm-ia64/linux/asm/sn/pcidev.h @@ -1,83 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details.
- * - * Copyright (C) 1992 - 1997, 2000-2005 Silicon Graphics, Inc. All rights reserved. - */ -#ifndef _ASM_IA64_SN_PCI_PCIDEV_H -#define _ASM_IA64_SN_PCI_PCIDEV_H - -#include <linux/pci.h> - -/* - * In ia64, pci_dev->sysdata must be a *pci_controller. To provide access to - * the pcidev_info structs for all devices under a controller, we extend the - * definition of pci_controller, via sn_pci_controller, to include a list - * of pcidev_info. - */ -struct sn_pci_controller { - struct pci_controller pci_controller; - struct list_head pcidev_info; -}; - -#define SN_PCI_CONTROLLER(dev) ((struct sn_pci_controller *) dev->sysdata) - -#define SN_PCIDEV_INFO(dev) sn_pcidev_info_get(dev) - -#define SN_PCIBUS_BUSSOFT_INFO(pci_bus) \ - (struct pcibus_info *)((struct pcibus_bussoft *)(PCI_CONTROLLER((pci_bus))->platform_data)) -/* - * Given a pci_bus, return the sn pcibus_bussoft struct. Note that - * this only works for root busses, not for busses represented by PPB's. - */ - -#define SN_PCIBUS_BUSSOFT(pci_bus) \ - ((struct pcibus_bussoft *)(PCI_CONTROLLER((pci_bus))->platform_data)) - -#define SN_PCIBUS_BUSSOFT_INFO(pci_bus) \ - (struct pcibus_info *)((struct pcibus_bussoft *)(PCI_CONTROLLER((pci_bus))->platform_data)) -/* - * Given a struct pci_dev, return the sn pcibus_bussoft struct. Note - * that this is not equivalent to SN_PCIBUS_BUSSOFT(pci_dev->bus) due - * due to possible PPB's in the path. - */ - -#define SN_PCIDEV_BUSSOFT(pci_dev) \ - (SN_PCIDEV_INFO(pci_dev)->pdi_host_pcidev_info->pdi_pcibus_info) - -#define SN_PCIDEV_BUSPROVIDER(pci_dev) \ - (SN_PCIDEV_INFO(pci_dev)->pdi_provider) - -#define PCIIO_BUS_NONE 255 /* bus 255 reserved */ -#define PCIIO_SLOT_NONE 255 -#define PCIIO_FUNC_NONE 255 -#define PCIIO_VENDOR_ID_NONE (-1) - -struct pcidev_info { - u64 pdi_pio_mapped_addr[7]; /* 6 BARs PLUS 1 ROM */ - u64 pdi_slot_host_handle; /* Bus and devfn Host pci_dev */ - - struct pcibus_bussoft *pdi_pcibus_info; /* Kernel common bus soft */ - struct pcidev_info *pdi_host_pcidev_info; /* Kernel Host pci_dev */ - struct pci_dev *pdi_linux_pcidev; /* Kernel pci_dev */ - - struct sn_irq_info *pdi_sn_irq_info; - struct sn_pcibus_provider *pdi_provider; /* sn pci ops */ - struct pci_dev *host_pci_dev; /* host bus link */ - struct list_head pdi_list; /* List of pcidev_info */ -}; - -extern void sn_irq_fixup(struct pci_dev *pci_dev, - struct sn_irq_info *sn_irq_info); -extern void sn_irq_unfixup(struct pci_dev *pci_dev); -extern struct pcidev_info * sn_pcidev_info_get(struct pci_dev *); -extern void sn_pci_controller_fixup(int segment, int busnum, - struct pci_bus *bus); -extern void sn_bus_store_sysdata(struct pci_dev *dev); -extern void sn_bus_free_sysdata(void); -extern void sn_generate_path(struct pci_bus *pci_bus, char *address); -extern void sn_pci_fixup_slot(struct pci_dev *dev); -extern void sn_pci_unfixup_slot(struct pci_dev *dev); -extern void sn_irq_lh_init(void); -#endif /* _ASM_IA64_SN_PCI_PCIDEV_H */ diff -Naurp xen/include/asm-ia64/linux/asm/sn/README.origin xen-redhat/include/asm-ia64/linux/asm/sn/README.origin --- xen/include/asm-ia64/linux/asm/sn/README.origin +++ xen-redhat/include/asm-ia64/linux/asm/sn/README.origin @@ -10,7 +10,6 @@ l1.h -> linux/include/asm-ia64/sn/l1.h leds.h -> linux/include/asm-ia64/sn/leds.h module.h -> linux/include/asm-ia64/sn/module.h pcibus_provider_defs.h -> linux/include/asm-ia64/sn/pcibus_provider_defs.h -pcidev.h -> linux/include/asm-ia64/sn/pcidev.h pda.h -> linux/include/asm-ia64/sn/pda.h pic.h -> linux/include/asm-ia64/sn/pic.h shub_mmr.h -> 
linux/include/asm-ia64/sn/shub_mmr.h diff -Naurp xen/include/asm-ia64/linux/pci_regs.h xen-redhat/include/asm-ia64/linux/pci_regs.h --- xen/include/asm-ia64/linux/pci_regs.h +++ xen-redhat/include/asm-ia64/linux/pci_regs.h @@ -229,7 +229,7 @@ #define PCI_PM_CAP_PME_D3cold 0x8000 /* PME# from D3 (cold) */ #define PCI_PM_CTRL 4 /* PM control and status register */ #define PCI_PM_CTRL_STATE_MASK 0x0003 /* Current power state (D0 to D3) */ -#define PCI_PM_CTRL_NO_SOFT_RESET 0x0004 /* No reset for D3hot->D0 */ +#define PCI_PM_CTRL_NO_SOFT_RESET 0x0008 /* No reset for D3hot->D0 */ #define PCI_PM_CTRL_PME_ENABLE 0x0100 /* PME pin enable */ #define PCI_PM_CTRL_DATA_SEL_MASK 0x1e00 /* Data select (??) */ #define PCI_PM_CTRL_DATA_SCALE_MASK 0x6000 /* Data scale (??) */ diff -Naurp xen/include/asm-ia64/linux-xen/asm/pal.h xen-redhat/include/asm-ia64/linux-xen/asm/pal.h --- xen/include/asm-ia64/linux-xen/asm/pal.h +++ xen-redhat/include/asm-ia64/linux-xen/asm/pal.h @@ -20,6 +20,8 @@ * 00/05/24 eranian Updated to latest PAL spec, fix structures bugs, added * 00/05/25 eranian Support for stack calls, and static physical calls * 00/06/18 eranian Support for stacked physical calls + * 06/10/26 rja Support for Intel Itanium Architecture Software Developer's + * Manual Rev 2.2 (Jan 2006) */ /* @@ -30,7 +32,7 @@ #define PAL_CACHE_FLUSH 1 /* flush i/d cache */ #define PAL_CACHE_INFO 2 /* get detailed i/d cache info */ #define PAL_CACHE_INIT 3 /* initialize i/d cache */ -#define PAL_CACHE_SUMMARY 4 /* get summary of cache heirarchy */ +#define PAL_CACHE_SUMMARY 4 /* get summary of cache hierarchy */ #define PAL_MEM_ATTRIB 5 /* list supported memory attributes */ #define PAL_PTCE_INFO 6 /* purge TLB info */ #define PAL_VM_INFO 7 /* return supported virtual memory features */ @@ -68,6 +70,9 @@ #define PAL_SHUTDOWN 40 /* enter processor shutdown state */ #define PAL_PREFETCH_VISIBILITY 41 /* Make Processor Prefetches Visible */ #define PAL_LOGICAL_TO_PHYSICAL 42 /* returns information on logical to physical processor mapping */ +#define PAL_CACHE_SHARED_INFO 43 /* returns information on caches shared by logical processor */ +#define PAL_GET_HW_POLICY 48 /* Get current hardware resource sharing policy */ +#define PAL_SET_HW_POLICY 49 /* Set current hardware resource sharing policy */ #define PAL_COPY_PAL 256 /* relocate PAL procedures and PAL PMI */ #define PAL_HALT_INFO 257 /* return the low power capabilities of processor */ @@ -75,6 +80,14 @@ #define PAL_CACHE_READ 259 /* read tag & data of cacheline for diagnostic testing */ #define PAL_CACHE_WRITE 260 /* write tag & data of cacheline for diagnostic testing */ #define PAL_VM_TR_READ 261 /* read contents of translation register */ +#define PAL_GET_PSTATE 262 /* get the current P-state */ +#define PAL_SET_PSTATE 263 /* set the P-state */ +#define PAL_BRAND_INFO 274 /* Processor branding information */ + +#define PAL_GET_PSTATE_TYPE_LASTSET 0 +#define PAL_GET_PSTATE_TYPE_AVGANDRESET 1 +#define PAL_GET_PSTATE_TYPE_AVGNORESET 2 +#define PAL_GET_PSTATE_TYPE_INSTANT 3 #ifndef __ASSEMBLY__ @@ -98,15 +111,16 @@ typedef s64 pal_status_t; * cache without sideeffects * and "restrict" was 1 */ +#define PAL_STATUS_REQUIRES_MEMORY (-9) /* Call requires PAL memory buffer */ -/* Processor cache level in the heirarchy */ +/* Processor cache level in the hierarchy */ typedef u64 pal_cache_level_t; #define PAL_CACHE_LEVEL_L0 0 /* L0 */ #define PAL_CACHE_LEVEL_L1 1 /* L1 */ #define PAL_CACHE_LEVEL_L2 2 /* L2 */ -/* Processor cache type at a particular level in the heirarchy */ +/* 
Processor cache type at a particular level in the hierarchy */ typedef u64 pal_cache_type_t; #define PAL_CACHE_TYPE_INSTRUCTION 1 /* Instruction cache */ @@ -131,7 +145,7 @@ typedef u64 pal_cache_line_state_t; #define PAL_CACHE_LINE_STATE_MODIFIED 3 /* Modified */ typedef struct pal_freq_ratio { - u64 den : 32, num : 32; /* numerator & denominator */ + u32 den, num; /* numerator & denominator */ } itc_ratio, proc_ratio; typedef union pal_cache_config_info_1_s { @@ -152,10 +166,10 @@ typedef union pal_cache_config_info_1_s typedef union pal_cache_config_info_2_s { struct { - u64 cache_size : 32, /*cache size in bytes*/ + u32 cache_size; /*cache size in bytes*/ - alias_boundary : 8, /* 39-32 aliased addr + u32 alias_boundary : 8, /* 39-32 aliased addr * separation for max * performance. */ @@ -261,14 +275,14 @@ typedef struct pal_cache_protection_info #define PAL_CACHE_PROT_METHOD_ECC 3 /* ECC protection */ -/* Processor cache line identification in the heirarchy */ +/* Processor cache line identification in the hierarchy */ typedef union pal_cache_line_id_u { u64 pclid_data; struct { u64 cache_type : 8, /* 7-0 cache type */ level : 8, /* 15-8 level of the * cache in the - * heirarchy. + * hierarchy. */ way : 8, /* 23-16 way in the set */ @@ -281,7 +295,7 @@ typedef union pal_cache_line_id_u { u64 cache_type : 8, /* 7-0 cache type */ level : 8, /* 15-8 level of the * cache in the - * heirarchy. + * hierarchy. */ way : 8, /* 23-16 way in the set */ @@ -360,6 +374,7 @@ typedef u64 pal_mc_info_index_t; * dependent */ +#define PAL_TLB_CHECK_OP_PURGE 8 typedef struct pal_process_state_info_s { u64 reserved1 : 2, @@ -455,7 +470,9 @@ typedef struct pal_process_state_info_s * by the processor */ - reserved2 : 11, + se : 1, /* Shared error. MCA in a + shared structure */ + reserved2 : 10, cc : 1, /* Cache check */ tc : 1, /* TLB check */ bc : 1, /* Bus check */ @@ -486,10 +503,12 @@ typedef struct pal_cache_check_info_s { * error occurred */ wiv : 1, /* Way field valid */ - reserved2 : 10, + reserved2 : 1, + dp : 1, /* Data poisoned on MBE */ + reserved3 : 8, index : 20, /* Cache line index */ - reserved3 : 2, + reserved4 : 2, is : 1, /* instruction set (1 == ia32) */ iv : 1, /* instruction set field valid */ @@ -556,7 +575,7 @@ typedef struct pal_bus_check_info_s { type : 8, /* Bus xaction type*/ sev : 5, /* Bus error severity*/ hier : 2, /* Bus hierarchy level */ - reserved1 : 1, + dp : 1, /* Data poisoned on MBE */ bsi : 8, /* Bus error status * info */ @@ -763,7 +782,7 @@ struct ia64_pal_retval { * (generally 0) MUST be passed. Reserved parameters are not optional * parameters. 
*/ -extern struct ia64_pal_retval ia64_pal_call_static (u64, u64, u64, u64, u64); +extern struct ia64_pal_retval ia64_pal_call_static (u64, u64, u64, u64); extern struct ia64_pal_retval ia64_pal_call_stacked (u64, u64, u64, u64); extern struct ia64_pal_retval ia64_pal_call_phys_static (u64, u64, u64, u64); extern struct ia64_pal_retval ia64_pal_call_phys_stacked (u64, u64, u64, u64); @@ -773,14 +792,7 @@ extern void ia64_load_scratch_fpregs (st #define PAL_CALL(iprv,a0,a1,a2,a3) do { \ struct ia64_fpreg fr[6]; \ ia64_save_scratch_fpregs(fr); \ - iprv = ia64_pal_call_static(a0, a1, a2, a3, 0); \ - ia64_load_scratch_fpregs(fr); \ -} while (0) - -#define PAL_CALL_IC_OFF(iprv,a0,a1,a2,a3) do { \ - struct ia64_fpreg fr[6]; \ - ia64_save_scratch_fpregs(fr); \ - iprv = ia64_pal_call_static(a0, a1, a2, a3, 1); \ + iprv = ia64_pal_call_static(a0, a1, a2, a3); \ ia64_load_scratch_fpregs(fr); \ } while (0) @@ -840,7 +852,9 @@ typedef union pal_bus_features_u { u64 pbf_req_bus_parking : 1; u64 pbf_bus_lock_mask : 1; u64 pbf_enable_half_xfer_rate : 1; - u64 pbf_reserved2 : 22; + u64 pbf_reserved2 : 20; + u64 pbf_enable_shared_line_replace : 1; + u64 pbf_enable_exclusive_line_replace : 1; u64 pbf_disable_xaction_queueing : 1; u64 pbf_disable_resp_err_check : 1; u64 pbf_disable_berr_check : 1; @@ -928,11 +942,7 @@ static inline s64 ia64_pal_cache_flush (u64 cache_type, u64 invalidate, u64 *progress, u64 *vector) { struct ia64_pal_retval iprv; -#ifdef XEN /* fix a bug in Linux... PAL has changed */ PAL_CALL(iprv, PAL_CACHE_FLUSH, cache_type, invalidate, *progress); -#else - PAL_CALL_IC_OFF(iprv, PAL_CACHE_FLUSH, cache_type, invalidate, *progress); -#endif if (vector) *vector = iprv.v0; *progress = iprv.v1; @@ -967,11 +977,12 @@ static inline s64 ia64_pal_cache_read (pal_cache_line_id_u_t line_id, u64 physical_addr) { struct ia64_pal_retval iprv; - PAL_CALL(iprv, PAL_CACHE_READ, line_id.pclid_data, physical_addr, 0); + PAL_CALL_PHYS_STK(iprv, PAL_CACHE_READ, line_id.pclid_data, + physical_addr, 0); return iprv.status; } -/* Return summary information about the heirarchy of caches controlled by the processor */ +/* Return summary information about the hierarchy of caches controlled by the processor */ static inline s64 ia64_pal_cache_summary (u64 *cache_levels, u64 *unique_caches) { @@ -989,7 +1000,8 @@ static inline s64 ia64_pal_cache_write (pal_cache_line_id_u_t line_id, u64 physical_addr, u64 data) { struct ia64_pal_retval iprv; - PAL_CALL(iprv, PAL_CACHE_WRITE, line_id.pclid_data, physical_addr, data); + PAL_CALL_PHYS_STK(iprv, PAL_CACHE_WRITE, line_id.pclid_data, + physical_addr, data); return iprv.status; } @@ -1085,6 +1097,24 @@ ia64_pal_freq_ratios (struct pal_freq_ra return iprv.status; } +/* + * Get the current hardware resource sharing policy of the processor + */ +static inline s64 +ia64_pal_get_hw_policy (u64 proc_num, u64 *cur_policy, u64 *num_impacted, + u64 *la) +{ + struct ia64_pal_retval iprv; + PAL_CALL(iprv, PAL_GET_HW_POLICY, proc_num, 0, 0); + if (cur_policy) + *cur_policy = iprv.v0; + if (num_impacted) + *num_impacted = iprv.v1; + if (la) + *la = iprv.v2; + return iprv.status; +} + /* Make the processor enter HALT or one of the implementation dependent low * power states where prefetching and execution are suspended and cache and * TLB coherency is not maintained. 
@@ -1118,6 +1148,34 @@ ia64_pal_halt_info (pal_power_mgmt_info_ return iprv.status; } +/* Get the current P-state information */ +static inline s64 +ia64_pal_get_pstate (u64 *pstate_index, unsigned long type) +{ + struct ia64_pal_retval iprv; + PAL_CALL_STK(iprv, PAL_GET_PSTATE, type, 0, 0); + *pstate_index = iprv.v0; + return iprv.status; +} + +/* Set the P-state */ +static inline s64 +ia64_pal_set_pstate (u64 pstate_index) +{ + struct ia64_pal_retval iprv; + PAL_CALL_STK(iprv, PAL_SET_PSTATE, pstate_index, 0, 0); + return iprv.status; +} + +/* Processor branding information*/ +static inline s64 +ia64_pal_get_brand_info (char *brand_info) +{ + struct ia64_pal_retval iprv; + PAL_CALL_STK(iprv, PAL_BRAND_INFO, 0, (u64)brand_info, 0); + return iprv.status; +} + /* Cause the processor to enter LIGHT HALT state, where prefetching and execution are * suspended, but cache and TLB coherency is maintained. */ @@ -1381,6 +1439,17 @@ ia64_pal_rse_info (u64 *num_phys_stacked return iprv.status; } +/* + * Set the current hardware resource sharing policy of the processor + */ +static inline s64 +ia64_pal_set_hw_policy (u64 policy) +{ + struct ia64_pal_retval iprv; + PAL_CALL(iprv, PAL_SET_HW_POLICY, policy, 0, 0); + return iprv.status; +} + /* Cause the processor to enter SHUTDOWN state, where prefetching and execution are * suspended, but cause cache and TLB coherency to be maintained. * This is usually called in IA-32 mode. @@ -1418,7 +1487,12 @@ typedef union pal_version_u { } pal_version_u_t; -/* Return PAL version information */ +/* + * Return PAL version information. While the documentation states that + * PAL_VERSION can be called in either physical or virtual mode, some + * implementations only allow physical calls. We don't call it very often, + * so the overhead isn't worth eliminating. + */ static inline s64 ia64_pal_version (pal_version_u_t *pal_min_version, pal_version_u_t *pal_cur_version) { @@ -1499,12 +1573,15 @@ typedef union pal_vm_info_1_u { } pal_vm_info_1_s; } pal_vm_info_1_u_t; +#define PAL_MAX_PURGES 0xFFFF /* all ones is means unlimited */ + typedef union pal_vm_info_2_u { u64 pvi2_val; struct { u64 impl_va_msb : 8, rid_size : 8, - reserved : 48; + max_purges : 16, + reserved : 32; } pal_vm_info_2_s; } pal_vm_info_2_u_t; @@ -1626,14 +1703,40 @@ ia64_pal_logical_to_phys(u64 proc_number if (iprv.status == PAL_STATUS_SUCCESS) { - if (proc_number == 0) - mapping->overview.overview_data = iprv.v0; + mapping->overview.overview_data = iprv.v0; mapping->ppli1.ppli1_data = iprv.v1; mapping->ppli2.ppli2_data = iprv.v2; } return iprv.status; } + +typedef struct pal_cache_shared_info_s +{ + u64 num_shared; + pal_proc_n_log_info1_t ppli1; + pal_proc_n_log_info2_t ppli2; +} pal_cache_shared_info_t; + +/* Get information on logical to physical processor mappings. 
*/ +static inline s64 +ia64_pal_cache_shared_info(u64 level, + u64 type, + u64 proc_number, + pal_cache_shared_info_t *info) +{ + struct ia64_pal_retval iprv; + + PAL_CALL(iprv, PAL_CACHE_SHARED_INFO, level, type, proc_number); + + if (iprv.status == PAL_STATUS_SUCCESS) { + info->num_shared = iprv.v0; + info->ppli1.ppli1_data = iprv.v1; + info->ppli2.ppli2_data = iprv.v2; + } + + return iprv.status; +} #ifdef XEN #include <asm/vmx_pal.h> #endif diff -Naurp xen/include/asm-ia64/linux-xen/asm/processor.h xen-redhat/include/asm-ia64/linux-xen/asm/processor.h --- xen/include/asm-ia64/linux-xen/asm/processor.h +++ xen-redhat/include/asm-ia64/linux-xen/asm/processor.h @@ -292,11 +292,14 @@ struct thread_struct { #else # define INIT_THREAD_PM #endif +#ifndef XEN __u64 dbr[IA64_NUM_DBG_REGS]; __u64 ibr[IA64_NUM_DBG_REGS]; +#endif struct ia64_fpreg fph[96]; /* saved/loaded on demand */ }; +#ifndef XEN #define INIT_THREAD { \ .flags = 0, \ .on_ustack = 0, \ @@ -333,6 +336,7 @@ struct thread_struct { regs->r1 = 0; regs->r9 = 0; regs->r11 = 0; regs->r13 = 0; regs->r15 = 0; \ } \ } while (0) +#endif /* Forward declarations, a strange C thing... */ struct mm_struct; diff -Naurp xen/include/asm-ia64/linux-xen/asm/ptrace.h xen-redhat/include/asm-ia64/linux-xen/asm/ptrace.h --- xen/include/asm-ia64/linux-xen/asm/ptrace.h +++ xen-redhat/include/asm-ia64/linux-xen/asm/ptrace.h @@ -265,6 +265,10 @@ struct switch_stack { /* given a pointer to a task_struct, return the user's pt_regs */ # define ia64_task_regs(t) (((struct pt_regs *) ((char *) (t) + IA64_STK_OFFSET)) - 1) # define ia64_psr(regs) ((struct ia64_psr *) &(regs)->cr_ipsr) +#ifdef XEN +# define guest_kernel_mode(regs) (ia64_psr(regs)->cpl == 2) +# define vmx_guest_kernel_mode(regs) (ia64_psr(regs)->cpl == 0) +#endif # define user_mode(regs) (((struct ia64_psr *) &(regs)->cr_ipsr)->cpl != 0) # define user_stack(task,regs) ((long) regs - (long) task == IA64_STK_OFFSET - sizeof(*regs)) # define fsys_mode(task,regs) \ diff -Naurp xen/include/asm-ia64/linux-xen/asm/README.origin xen-redhat/include/asm-ia64/linux-xen/asm/README.origin --- xen/include/asm-ia64/linux-xen/asm/README.origin +++ xen-redhat/include/asm-ia64/linux-xen/asm/README.origin @@ -17,7 +17,6 @@ mca_asm.h -> linux/include/asm-ia64/mca meminit.h -> linux/include/asm-ia64/meminit.h numa.h -> linux/include/asm-ia64/numa.h page.h -> linux/include/asm-ia64/page.h -pal.h -> linux/include/asm-ia64/pal.h percpu.h -> linux/include/asm-ia64/percpu.h pgalloc.h -> linux/include/asm-ia64/pgalloc.h pgtable.h -> linux/include/asm-ia64/pgtable.h @@ -42,3 +41,6 @@ machvec_dig.h -> linux/include/asm-ia64 machvec_sn2.h -> linux/include/asm-ia64/machvec_sn2.h machvec_hpzx1.h -> linux/include/asm-ia64/machvec_hpzx1.h machvec_pci.h -> linux/include/asm-ia64/pci.h + +# The files below are from Linux-2.6.21 +pal.h -> linux/include/asm-ia64/pal.h diff -Naurp xen/include/asm-ia64/linux-xen/asm/sn/pcibr_provider.h xen-redhat/include/asm-ia64/linux-xen/asm/sn/pcibr_provider.h --- xen/include/asm-ia64/linux-xen/asm/sn/pcibr_provider.h +++ xen-redhat/include/asm-ia64/linux-xen/asm/sn/pcibr_provider.h @@ -10,7 +10,7 @@ #ifdef XEN #include <linux/spinlock.h> -#include <linux/pci.h> +#include <linux/linux-pci.h> #endif #include <asm/sn/intr.h> #include <asm/sn/pcibus_provider_defs.h> diff -Naurp xen/include/asm-ia64/linux-xen/asm/sn/pcidev.h xen-redhat/include/asm-ia64/linux-xen/asm/sn/pcidev.h --- xen/include/asm-ia64/linux-xen/asm/sn/pcidev.h +++ xen-redhat/include/asm-ia64/linux-xen/asm/sn/pcidev.h @@ -0,0 +1,87 @@ 
+/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 1992 - 1997, 2000-2005 Silicon Graphics, Inc. All rights reserved. + */ +#ifndef _ASM_IA64_SN_PCI_PCIDEV_H +#define _ASM_IA64_SN_PCI_PCIDEV_H + +#ifdef XEN +#include <linux/linux-pci.h> +#else +#include <linux/pci.h> +#endif + +/* + * In ia64, pci_dev->sysdata must be a *pci_controller. To provide access to + * the pcidev_info structs for all devices under a controller, we extend the + * definition of pci_controller, via sn_pci_controller, to include a list + * of pcidev_info. + */ +struct sn_pci_controller { + struct pci_controller pci_controller; + struct list_head pcidev_info; +}; + +#define SN_PCI_CONTROLLER(dev) ((struct sn_pci_controller *) dev->sysdata) + +#define SN_PCIDEV_INFO(dev) sn_pcidev_info_get(dev) + +#define SN_PCIBUS_BUSSOFT_INFO(pci_bus) \ + (struct pcibus_info *)((struct pcibus_bussoft *)(PCI_CONTROLLER((pci_bus))->platform_data)) +/* + * Given a pci_bus, return the sn pcibus_bussoft struct. Note that + * this only works for root busses, not for busses represented by PPB's. + */ + +#define SN_PCIBUS_BUSSOFT(pci_bus) \ + ((struct pcibus_bussoft *)(PCI_CONTROLLER((pci_bus))->platform_data)) + +#define SN_PCIBUS_BUSSOFT_INFO(pci_bus) \ + (struct pcibus_info *)((struct pcibus_bussoft *)(PCI_CONTROLLER((pci_bus))->platform_data)) +/* + * Given a struct pci_dev, return the sn pcibus_bussoft struct. Note + * that this is not equivalent to SN_PCIBUS_BUSSOFT(pci_dev->bus) due + * due to possible PPB's in the path. + */ + +#define SN_PCIDEV_BUSSOFT(pci_dev) \ + (SN_PCIDEV_INFO(pci_dev)->pdi_host_pcidev_info->pdi_pcibus_info) + +#define SN_PCIDEV_BUSPROVIDER(pci_dev) \ + (SN_PCIDEV_INFO(pci_dev)->pdi_provider) + +#define PCIIO_BUS_NONE 255 /* bus 255 reserved */ +#define PCIIO_SLOT_NONE 255 +#define PCIIO_FUNC_NONE 255 +#define PCIIO_VENDOR_ID_NONE (-1) + +struct pcidev_info { + u64 pdi_pio_mapped_addr[7]; /* 6 BARs PLUS 1 ROM */ + u64 pdi_slot_host_handle; /* Bus and devfn Host pci_dev */ + + struct pcibus_bussoft *pdi_pcibus_info; /* Kernel common bus soft */ + struct pcidev_info *pdi_host_pcidev_info; /* Kernel Host pci_dev */ + struct pci_dev *pdi_linux_pcidev; /* Kernel pci_dev */ + + struct sn_irq_info *pdi_sn_irq_info; + struct sn_pcibus_provider *pdi_provider; /* sn pci ops */ + struct pci_dev *host_pci_dev; /* host bus link */ + struct list_head pdi_list; /* List of pcidev_info */ +}; + +extern void sn_irq_fixup(struct pci_dev *pci_dev, + struct sn_irq_info *sn_irq_info); +extern void sn_irq_unfixup(struct pci_dev *pci_dev); +extern struct pcidev_info * sn_pcidev_info_get(struct pci_dev *); +extern void sn_pci_controller_fixup(int segment, int busnum, + struct pci_bus *bus); +extern void sn_bus_store_sysdata(struct pci_dev *dev); +extern void sn_bus_free_sysdata(void); +extern void sn_generate_path(struct pci_bus *pci_bus, char *address); +extern void sn_pci_fixup_slot(struct pci_dev *dev); +extern void sn_pci_unfixup_slot(struct pci_dev *dev); +extern void sn_irq_lh_init(void); +#endif /* _ASM_IA64_SN_PCI_PCIDEV_H */ diff -Naurp xen/include/asm-ia64/linux-xen/asm/sn/README.origin xen-redhat/include/asm-ia64/linux-xen/asm/sn/README.origin --- xen/include/asm-ia64/linux-xen/asm/sn/README.origin +++ xen-redhat/include/asm-ia64/linux-xen/asm/sn/README.origin @@ -12,5 +12,6 @@ intr.h -> linux/include/asm-ia64/sn/in io.h -> linux/include/asm-ia64/sn/io.h nodepda.h -> 
linux/include/asm-ia64/sn/nodepda.h pcibr_provider.h -> linux/include/asm-ia64/sn/pcibr_provider.h +pcidev.h -> linux/include/asm-ia64/sn/pcidev.h rw_mmr.h -> linux/include/asm-ia64/sn/rw_mmr.h types.h -> linux/include/asm-ia64/sn/types.h diff -Naurp xen/include/asm-ia64/linux-xen/linux/linux-pci.h xen-redhat/include/asm-ia64/linux-xen/linux/linux-pci.h --- xen/include/asm-ia64/linux-xen/linux/linux-pci.h +++ xen-redhat/include/asm-ia64/linux-xen/linux/linux-pci.h @@ -0,0 +1,820 @@ +/* + * pci.h + * + * PCI defines and function prototypes + * Copyright 1994, Drew Eckhardt + * Copyright 1997--1999 Martin Mares <mj@ucw.cz> + * + * For more information, please consult the following manuals (look at + * http://www.pcisig.com/ for how to get them): + * + * PCI BIOS Specification + * PCI Local Bus Specification + * PCI to PCI Bridge Specification + * PCI System Design Guide + */ + +#ifndef LINUX_PCI_H +#define LINUX_PCI_H + +/* Include the pci register defines */ +#include <linux/pci_regs.h> + +/* Include the ID list */ +#include <linux/pci_ids.h> +#ifdef XEN +#include <asm/processor.h> +#endif + +/* + * The PCI interface treats multi-function devices as independent + * devices. The slot/function address of each device is encoded + * in a single byte as follows: + * + * 7:3 = slot + * 2:0 = function + */ +#define PCI_DEVFN(slot,func) ((((slot) & 0x1f) << 3) | ((func) & 0x07)) +#define PCI_SLOT(devfn) (((devfn) >> 3) & 0x1f) +#define PCI_FUNC(devfn) ((devfn) & 0x07) + +/* Ioctls for /proc/bus/pci/X/Y nodes. */ +#define PCIIOC_BASE ('P' << 24 | 'C' << 16 | 'I' << 8) +#define PCIIOC_CONTROLLER (PCIIOC_BASE | 0x00) /* Get controller for PCI device. */ +#define PCIIOC_MMAP_IS_IO (PCIIOC_BASE | 0x01) /* Set mmap state to I/O space. */ +#define PCIIOC_MMAP_IS_MEM (PCIIOC_BASE | 0x02) /* Set mmap state to MEM space. */ +#define PCIIOC_WRITE_COMBINE (PCIIOC_BASE | 0x03) /* Enable/disable write-combining. */ + +#ifdef __KERNEL__ + +#include <linux/mod_devicetable.h> + +#include <linux/types.h> +#include <linux/ioport.h> +#include <linux/list.h> +#include <linux/compiler.h> +#include <linux/errno.h> +#include <linux/device.h> + +/* File state for mmap()s on /proc/bus/pci/X/Y */ +enum pci_mmap_state { + pci_mmap_io, + pci_mmap_mem +}; + +/* This defines the direction arg to the DMA mapping routines. */ +#define PCI_DMA_BIDIRECTIONAL 0 +#define PCI_DMA_TODEVICE 1 +#define PCI_DMA_FROMDEVICE 2 +#define PCI_DMA_NONE 3 + +#define DEVICE_COUNT_COMPATIBLE 4 +#define DEVICE_COUNT_RESOURCE 12 + +typedef int __bitwise pci_power_t; + +#define PCI_D0 ((pci_power_t __force) 0) +#define PCI_D1 ((pci_power_t __force) 1) +#define PCI_D2 ((pci_power_t __force) 2) +#define PCI_D3hot ((pci_power_t __force) 3) +#define PCI_D3cold ((pci_power_t __force) 4) +#define PCI_UNKNOWN ((pci_power_t __force) 5) +#define PCI_POWER_ERROR ((pci_power_t __force) -1) + +/** The pci_channel state describes connectivity between the CPU and + * the pci device. If some PCI bus between here and the pci device + * has crashed or locked up, this info is reflected here. 
+ */ +typedef unsigned int __bitwise pci_channel_state_t; + +enum pci_channel_state { + /* I/O channel is in normal state */ + pci_channel_io_normal = (__force pci_channel_state_t) 1, + + /* I/O to channel is blocked */ + pci_channel_io_frozen = (__force pci_channel_state_t) 2, + + /* PCI card is dead */ + pci_channel_io_perm_failure = (__force pci_channel_state_t) 3, +}; + +typedef unsigned short __bitwise pci_bus_flags_t; +enum pci_bus_flags { + PCI_BUS_FLAGS_NO_MSI = (__force pci_bus_flags_t) 1, +}; + +struct pci_cap_saved_state { + struct hlist_node next; + char cap_nr; + u32 data[0]; +}; + +/* + * The pci_dev structure is used to describe PCI devices. + */ +struct pci_dev { + struct list_head global_list; /* node in list of all PCI devices */ + struct list_head bus_list; /* node in per-bus list */ + struct pci_bus *bus; /* bus this device is on */ + struct pci_bus *subordinate; /* bus this device bridges to */ + + void *sysdata; /* hook for sys-specific extension */ + struct proc_dir_entry *procent; /* device entry in /proc/bus/pci */ + + unsigned int devfn; /* encoded device & function index */ + unsigned short vendor; + unsigned short device; + unsigned short subsystem_vendor; + unsigned short subsystem_device; + unsigned int class; /* 3 bytes: (base,sub,prog-if) */ + u8 hdr_type; /* PCI header type (`multi' flag masked out) */ + u8 rom_base_reg; /* which config register controls the ROM */ + u8 pin; /* which interrupt pin this device uses */ + + struct pci_driver *driver; /* which driver has allocated this device */ + u64 dma_mask; /* Mask of the bits of bus address this + device implements. Normally this is + 0xffffffff. You only need to change + this if your device has broken DMA + or supports 64-bit transfers. */ + + pci_power_t current_state; /* Current operating state. In ACPI-speak, + this is D0-D3, D0 being fully functional, + and D3 being off. */ + + pci_channel_state_t error_state; /* current connectivity state */ + struct device dev; /* Generic device interface */ + + /* device is compatible with these IDs */ + unsigned short vendor_compatible[DEVICE_COUNT_COMPATIBLE]; + unsigned short device_compatible[DEVICE_COUNT_COMPATIBLE]; + + int cfg_size; /* Size of configuration space */ + + /* + * Instead of touching interrupt line and base address registers + * directly, use the values stored here. They might be different! + */ + unsigned int irq; + struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */ + + /* These fields are used by common fixups */ + unsigned int transparent:1; /* Transparent PCI bridge */ + unsigned int multifunction:1;/* Part of multi-function device */ + /* keep track of device state */ + unsigned int is_enabled:1; /* pci_enable_device has been called */ + unsigned int is_busmaster:1; /* device is busmaster */ + unsigned int no_msi:1; /* device may not use msi */ + unsigned int no_d1d2:1; /* only allow d0 or d3 */ + unsigned int block_ucfg_access:1; /* userspace config space access is blocked */ + unsigned int broken_parity_status:1; /* Device generates false positive parity */ + unsigned int msi_enabled:1; + unsigned int msix_enabled:1; + + u32 saved_config_space[16]; /* config space saved at suspend time */ + struct hlist_head saved_cap_space; + struct bin_attribute *rom_attr; /* attribute descriptor for sysfs ROM entry */ + int rom_attr_enabled; /* has display of the rom attribute been enabled? 
*/ + struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE]; /* sysfs file for resources */ +}; + +#define pci_dev_g(n) list_entry(n, struct pci_dev, global_list) +#define pci_dev_b(n) list_entry(n, struct pci_dev, bus_list) +#define to_pci_dev(n) container_of(n, struct pci_dev, dev) +#define for_each_pci_dev(d) while ((d = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, d)) != NULL) + +static inline struct pci_cap_saved_state *pci_find_saved_cap( + struct pci_dev *pci_dev,char cap) +{ + struct pci_cap_saved_state *tmp; + struct hlist_node *pos; + + hlist_for_each_entry(tmp, pos, &pci_dev->saved_cap_space, next) { + if (tmp->cap_nr == cap) + return tmp; + } + return NULL; +} + +static inline void pci_add_saved_cap(struct pci_dev *pci_dev, + struct pci_cap_saved_state *new_cap) +{ + hlist_add_head(&new_cap->next, &pci_dev->saved_cap_space); +} + +static inline void pci_remove_saved_cap(struct pci_cap_saved_state *cap) +{ + hlist_del(&cap->next); +} + +/* + * For PCI devices, the region numbers are assigned this way: + * + * 0-5 standard PCI regions + * 6 expansion ROM + * 7-10 bridges: address space assigned to buses behind the bridge + */ + +#define PCI_ROM_RESOURCE 6 +#define PCI_BRIDGE_RESOURCES 7 +#define PCI_NUM_RESOURCES 11 + +#ifndef PCI_BUS_NUM_RESOURCES +#define PCI_BUS_NUM_RESOURCES 8 +#endif + +#define PCI_REGION_FLAG_MASK 0x0fU /* These bits of resource flags tell us the PCI region flags */ + +struct pci_bus { + struct list_head node; /* node in list of buses */ + struct pci_bus *parent; /* parent bus this bridge is on */ + struct list_head children; /* list of child buses */ + struct list_head devices; /* list of devices on this bus */ + struct pci_dev *self; /* bridge device as seen by parent */ + struct resource *resource[PCI_BUS_NUM_RESOURCES]; + /* address space routed to this bus */ + + struct pci_ops *ops; /* configuration access functions */ + void *sysdata; /* hook for sys-specific extension */ + struct proc_dir_entry *procdir; /* directory entry in /proc/bus/pci */ + + unsigned char number; /* bus number */ + unsigned char primary; /* number of primary bridge */ + unsigned char secondary; /* number of secondary bridge */ + unsigned char subordinate; /* max number of subordinate buses */ + + char name[48]; + + unsigned short bridge_ctl; /* manage NO_ISA/FBB/et al behaviors */ + pci_bus_flags_t bus_flags; /* Inherited by child busses */ + struct device *bridge; + struct class_device class_dev; + struct bin_attribute *legacy_io; /* legacy I/O for this bus */ + struct bin_attribute *legacy_mem; /* legacy mem */ +}; + +#define pci_bus_b(n) list_entry(n, struct pci_bus, node) +#define to_pci_bus(n) container_of(n, struct pci_bus, class_dev) + +/* + * Error values that may be returned by PCI functions. 
+ */ +#define PCIBIOS_SUCCESSFUL 0x00 +#define PCIBIOS_FUNC_NOT_SUPPORTED 0x81 +#define PCIBIOS_BAD_VENDOR_ID 0x83 +#define PCIBIOS_DEVICE_NOT_FOUND 0x86 +#define PCIBIOS_BAD_REGISTER_NUMBER 0x87 +#define PCIBIOS_SET_FAILED 0x88 +#define PCIBIOS_BUFFER_TOO_SMALL 0x89 + +/* Low-level architecture-dependent routines */ + +struct pci_ops { + int (*read)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val); + int (*write)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val); +}; + +struct pci_raw_ops { + int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn, + int reg, int len, u32 *val); + int (*write)(unsigned int domain, unsigned int bus, unsigned int devfn, + int reg, int len, u32 val); +}; + +extern struct pci_raw_ops *raw_pci_ops; + +struct pci_bus_region { + unsigned long start; + unsigned long end; +}; + +struct pci_dynids { + spinlock_t lock; /* protects list, index */ + struct list_head list; /* for IDs added at runtime */ + unsigned int use_driver_data:1; /* pci_driver->driver_data is used */ +}; + +/* ---------------------------------------------------------------- */ +/** PCI Error Recovery System (PCI-ERS). If a PCI device driver provides + * a set fof callbacks in struct pci_error_handlers, then that device driver + * will be notified of PCI bus errors, and will be driven to recovery + * when an error occurs. + */ + +typedef unsigned int __bitwise pci_ers_result_t; + +enum pci_ers_result { + /* no result/none/not supported in device driver */ + PCI_ERS_RESULT_NONE = (__force pci_ers_result_t) 1, + + /* Device driver can recover without slot reset */ + PCI_ERS_RESULT_CAN_RECOVER = (__force pci_ers_result_t) 2, + + /* Device driver wants slot to be reset. */ + PCI_ERS_RESULT_NEED_RESET = (__force pci_ers_result_t) 3, + + /* Device has completely failed, is unrecoverable */ + PCI_ERS_RESULT_DISCONNECT = (__force pci_ers_result_t) 4, + + /* Device driver is fully recovered and operational */ + PCI_ERS_RESULT_RECOVERED = (__force pci_ers_result_t) 5, +}; + +/* PCI bus error event callbacks */ +struct pci_error_handlers +{ + /* PCI bus error detected on this device */ + pci_ers_result_t (*error_detected)(struct pci_dev *dev, + enum pci_channel_state error); + + /* MMIO has been re-enabled, but not DMA */ + pci_ers_result_t (*mmio_enabled)(struct pci_dev *dev); + + /* PCI Express link has been reset */ + pci_ers_result_t (*link_reset)(struct pci_dev *dev); + + /* PCI slot has been reset */ + pci_ers_result_t (*slot_reset)(struct pci_dev *dev); + + /* Device driver may resume normal operations */ + void (*resume)(struct pci_dev *dev); +}; + +/* ---------------------------------------------------------------- */ + +struct module; +struct pci_driver { + struct list_head node; + char *name; + const struct pci_device_id *id_table; /* must be non-NULL for probe to be called */ + int (*probe) (struct pci_dev *dev, const struct pci_device_id *id); /* New device inserted */ + void (*remove) (struct pci_dev *dev); /* Device removed (NULL if not a hot-plug capable driver) */ + int (*suspend) (struct pci_dev *dev, pm_message_t state); /* Device suspended */ + int (*suspend_late) (struct pci_dev *dev, pm_message_t state); + int (*resume_early) (struct pci_dev *dev); + int (*resume) (struct pci_dev *dev); /* Device woken up */ + int (*enable_wake) (struct pci_dev *dev, pci_power_t state, int enable); /* Enable wake event */ + void (*shutdown) (struct pci_dev *dev); + + struct pci_error_handlers *err_handler; + struct device_driver driver; + struct 
pci_dynids dynids; + + int multithread_probe; +}; + +#define to_pci_driver(drv) container_of(drv,struct pci_driver, driver) + +/** + * PCI_DEVICE - macro used to describe a specific pci device + * @vend: the 16 bit PCI Vendor ID + * @dev: the 16 bit PCI Device ID + * + * This macro is used to create a struct pci_device_id that matches a + * specific device. The subvendor and subdevice fields will be set to + * PCI_ANY_ID. + */ +#define PCI_DEVICE(vend,dev) \ + .vendor = (vend), .device = (dev), \ + .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID + +/** + * PCI_DEVICE_CLASS - macro used to describe a specific pci device class + * @dev_class: the class, subclass, prog-if triple for this device + * @dev_class_mask: the class mask for this device + * + * This macro is used to create a struct pci_device_id that matches a + * specific PCI class. The vendor, device, subvendor, and subdevice + * fields will be set to PCI_ANY_ID. + */ +#define PCI_DEVICE_CLASS(dev_class,dev_class_mask) \ + .class = (dev_class), .class_mask = (dev_class_mask), \ + .vendor = PCI_ANY_ID, .device = PCI_ANY_ID, \ + .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID + +/* + * pci_module_init is obsolete, this stays here till we fix up all usages of it + * in the tree. + */ +#define pci_module_init pci_register_driver + +/* these external functions are only available when PCI support is enabled */ +#ifdef CONFIG_PCI + +extern struct bus_type pci_bus_type; + +/* Do NOT directly access these two variables, unless you are arch specific pci + * code, or pci core code. */ +extern struct list_head pci_root_buses; /* list of all known PCI buses */ +extern struct list_head pci_devices; /* list of all devices */ + +void pcibios_fixup_bus(struct pci_bus *); +int __must_check pcibios_enable_device(struct pci_dev *, int mask); +char *pcibios_setup (char *str); + +/* Used only when drivers/pci/setup.c is used */ +void pcibios_align_resource(void *, struct resource *, resource_size_t, + resource_size_t); +void pcibios_update_irq(struct pci_dev *, int irq); + +/* Generic PCI functions used internally */ + +extern struct pci_bus *pci_find_bus(int domain, int busnr); +void pci_bus_add_devices(struct pci_bus *bus); +struct pci_bus *pci_scan_bus_parented(struct device *parent, int bus, struct pci_ops *ops, void *sysdata); +static inline struct pci_bus *pci_scan_bus(int bus, struct pci_ops *ops, void *sysdata) +{ + struct pci_bus *root_bus; + root_bus = pci_scan_bus_parented(NULL, bus, ops, sysdata); + if (root_bus) + pci_bus_add_devices(root_bus); + return root_bus; +} +struct pci_bus *pci_create_bus(struct device *parent, int bus, struct pci_ops *ops, void *sysdata); +struct pci_bus * pci_add_new_bus(struct pci_bus *parent, struct pci_dev *dev, int busnr); +int pci_scan_slot(struct pci_bus *bus, int devfn); +struct pci_dev * pci_scan_single_device(struct pci_bus *bus, int devfn); +void pci_device_add(struct pci_dev *dev, struct pci_bus *bus); +unsigned int pci_scan_child_bus(struct pci_bus *bus); +int __must_check pci_bus_add_device(struct pci_dev *dev); +void pci_read_bridge_bases(struct pci_bus *child); +struct resource *pci_find_parent_resource(const struct pci_dev *dev, struct resource *res); +int pci_get_interrupt_pin(struct pci_dev *dev, struct pci_dev **bridge); +extern struct pci_dev *pci_dev_get(struct pci_dev *dev); +extern void pci_dev_put(struct pci_dev *dev); +extern void pci_remove_bus(struct pci_bus *b); +extern void pci_remove_bus_device(struct pci_dev *dev); +extern void pci_stop_bus_device(struct pci_dev *dev); +void 
pci_setup_cardbus(struct pci_bus *bus); +extern void pci_sort_breadthfirst(void); + +/* Generic PCI functions exported to card drivers */ + +struct pci_dev *pci_find_device (unsigned int vendor, unsigned int device, const struct pci_dev *from); +struct pci_dev *pci_find_device_reverse (unsigned int vendor, unsigned int device, const struct pci_dev *from); +struct pci_dev *pci_find_slot (unsigned int bus, unsigned int devfn); +int pci_find_capability (struct pci_dev *dev, int cap); +int pci_find_next_capability (struct pci_dev *dev, u8 pos, int cap); +int pci_find_ext_capability (struct pci_dev *dev, int cap); +struct pci_bus *pci_find_next_bus(const struct pci_bus *from); + +struct pci_dev *pci_get_device(unsigned int vendor, unsigned int device, + struct pci_dev *from); +struct pci_dev *pci_get_device_reverse(unsigned int vendor, unsigned int device, + struct pci_dev *from); + +struct pci_dev *pci_get_subsys (unsigned int vendor, unsigned int device, + unsigned int ss_vendor, unsigned int ss_device, + struct pci_dev *from); +struct pci_dev *pci_get_slot (struct pci_bus *bus, unsigned int devfn); +struct pci_dev *pci_get_bus_and_slot (unsigned int bus, unsigned int devfn); +struct pci_dev *pci_get_class (unsigned int class, struct pci_dev *from); +int pci_dev_present(const struct pci_device_id *ids); + +int pci_bus_read_config_byte (struct pci_bus *bus, unsigned int devfn, int where, u8 *val); +int pci_bus_read_config_word (struct pci_bus *bus, unsigned int devfn, int where, u16 *val); +int pci_bus_read_config_dword (struct pci_bus *bus, unsigned int devfn, int where, u32 *val); +int pci_bus_write_config_byte (struct pci_bus *bus, unsigned int devfn, int where, u8 val); +int pci_bus_write_config_word (struct pci_bus *bus, unsigned int devfn, int where, u16 val); +int pci_bus_write_config_dword (struct pci_bus *bus, unsigned int devfn, int where, u32 val); + +static inline int pci_read_config_byte(struct pci_dev *dev, int where, u8 *val) +{ + return pci_bus_read_config_byte (dev->bus, dev->devfn, where, val); +} +static inline int pci_read_config_word(struct pci_dev *dev, int where, u16 *val) +{ + return pci_bus_read_config_word (dev->bus, dev->devfn, where, val); +} +static inline int pci_read_config_dword(struct pci_dev *dev, int where, u32 *val) +{ + return pci_bus_read_config_dword (dev->bus, dev->devfn, where, val); +} +static inline int pci_write_config_byte(struct pci_dev *dev, int where, u8 val) +{ + return pci_bus_write_config_byte (dev->bus, dev->devfn, where, val); +} +static inline int pci_write_config_word(struct pci_dev *dev, int where, u16 val) +{ + return pci_bus_write_config_word (dev->bus, dev->devfn, where, val); +} +static inline int pci_write_config_dword(struct pci_dev *dev, int where, u32 val) +{ + return pci_bus_write_config_dword (dev->bus, dev->devfn, where, val); +} + +int __must_check pci_enable_device(struct pci_dev *dev); +int __must_check pci_enable_device_bars(struct pci_dev *dev, int mask); +void pci_disable_device(struct pci_dev *dev); +void pci_set_master(struct pci_dev *dev); +#define HAVE_PCI_SET_MWI +int __must_check pci_set_mwi(struct pci_dev *dev); +void pci_clear_mwi(struct pci_dev *dev); +void pci_intx(struct pci_dev *dev, int enable); +int pci_set_dma_mask(struct pci_dev *dev, u64 mask); +int pci_set_consistent_dma_mask(struct pci_dev *dev, u64 mask); +void pci_update_resource(struct pci_dev *dev, struct resource *res, int resno); +int __must_check pci_assign_resource(struct pci_dev *dev, int i); +int __must_check pci_assign_resource_fixed(struct 
pci_dev *dev, int i); +void pci_restore_bars(struct pci_dev *dev); + +/* ROM control related routines */ +void __iomem __must_check *pci_map_rom(struct pci_dev *pdev, size_t *size); +void __iomem __must_check *pci_map_rom_copy(struct pci_dev *pdev, size_t *size); +void pci_unmap_rom(struct pci_dev *pdev, void __iomem *rom); +void pci_remove_rom(struct pci_dev *pdev); + +/* Power management related routines */ +int pci_save_state(struct pci_dev *dev); +int pci_restore_state(struct pci_dev *dev); +int pci_set_power_state(struct pci_dev *dev, pci_power_t state); +pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state); +int pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable); + +/* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */ +void pci_bus_assign_resources(struct pci_bus *bus); +void pci_bus_size_bridges(struct pci_bus *bus); +int pci_claim_resource(struct pci_dev *, int); +void pci_assign_unassigned_resources(void); +void pdev_enable_device(struct pci_dev *); +void pdev_sort_resources(struct pci_dev *, struct resource_list *); +void pci_fixup_irqs(u8 (*)(struct pci_dev *, u8 *), + int (*)(struct pci_dev *, u8, u8)); +#define HAVE_PCI_REQ_REGIONS 2 +int __must_check pci_request_regions(struct pci_dev *, const char *); +void pci_release_regions(struct pci_dev *); +int __must_check pci_request_region(struct pci_dev *, int, const char *); +void pci_release_region(struct pci_dev *, int); + +/* drivers/pci/bus.c */ +int __must_check pci_bus_alloc_resource(struct pci_bus *bus, + struct resource *res, resource_size_t size, + resource_size_t align, resource_size_t min, + unsigned int type_mask, + void (*alignf)(void *, struct resource *, + resource_size_t, resource_size_t), + void *alignf_data); +void pci_enable_bridges(struct pci_bus *bus); + +/* Proper probing supporting hot-pluggable devices */ +int __must_check __pci_register_driver(struct pci_driver *, struct module *); +static inline int __must_check pci_register_driver(struct pci_driver *driver) +{ + return __pci_register_driver(driver, THIS_MODULE); +} + +void pci_unregister_driver(struct pci_driver *); +void pci_remove_behind_bridge(struct pci_dev *); +struct pci_driver *pci_dev_driver(const struct pci_dev *); +const struct pci_device_id *pci_match_device(struct pci_driver *drv, struct pci_dev *dev); +const struct pci_device_id *pci_match_id(const struct pci_device_id *ids, struct pci_dev *dev); +int pci_scan_bridge(struct pci_bus *bus, struct pci_dev * dev, int max, int pass); + +void pci_walk_bus(struct pci_bus *top, void (*cb)(struct pci_dev *, void *), + void *userdata); +int pci_cfg_space_size(struct pci_dev *dev); +unsigned char pci_bus_max_busnr(struct pci_bus* bus); + +/* kmem_cache style wrapper around pci_alloc_consistent() */ + +#include <linux/dmapool.h> + +#define pci_pool dma_pool +#define pci_pool_create(name, pdev, size, align, allocation) \ + dma_pool_create(name, &pdev->dev, size, align, allocation) +#define pci_pool_destroy(pool) dma_pool_destroy(pool) +#define pci_pool_alloc(pool, flags, handle) dma_pool_alloc(pool, flags, handle) +#define pci_pool_free(pool, vaddr, addr) dma_pool_free(pool, vaddr, addr) + +enum pci_dma_burst_strategy { + PCI_DMA_BURST_INFINITY, /* make bursts as large as possible, + strategy_parameter is N/A */ + PCI_DMA_BURST_BOUNDARY, /* disconnect at every strategy_parameter + byte boundaries */ + PCI_DMA_BURST_MULTIPLE, /* disconnect at some multiple of + strategy_parameter byte boundaries */ +}; + +#if defined(CONFIG_ISA) || 
defined(CONFIG_EISA) +extern struct pci_dev *isa_bridge; +#endif + +struct msix_entry { + u16 vector; /* kernel uses to write allocated vector */ + u16 entry; /* driver uses to specify entry, OS writes */ +}; + + +#ifndef CONFIG_PCI_MSI +static inline void pci_scan_msi_device(struct pci_dev *dev) {} +static inline int pci_enable_msi(struct pci_dev *dev) {return -1;} +static inline void pci_disable_msi(struct pci_dev *dev) {} +static inline int pci_enable_msix(struct pci_dev* dev, + struct msix_entry *entries, int nvec) {return -1;} +static inline void pci_disable_msix(struct pci_dev *dev) {} +static inline void msi_remove_pci_irq_vectors(struct pci_dev *dev) {} +#else +extern void pci_scan_msi_device(struct pci_dev *dev); +extern int pci_enable_msi(struct pci_dev *dev); +extern void pci_disable_msi(struct pci_dev *dev); +extern int pci_enable_msix(struct pci_dev* dev, + struct msix_entry *entries, int nvec); +extern void pci_disable_msix(struct pci_dev *dev); +extern void msi_remove_pci_irq_vectors(struct pci_dev *dev); +#endif + +#ifdef CONFIG_HT_IRQ +/* The functions a driver should call */ +int ht_create_irq(struct pci_dev *dev, int idx); +void ht_destroy_irq(unsigned int irq); +#endif /* CONFIG_HT_IRQ */ + +extern void pci_block_user_cfg_access(struct pci_dev *dev); +extern void pci_unblock_user_cfg_access(struct pci_dev *dev); + +/* + * PCI domain support. Sometimes called PCI segment (eg by ACPI), + * a PCI domain is defined to be a set of PCI busses which share + * configuration space. + */ +#ifndef CONFIG_PCI_DOMAINS +static inline int pci_domain_nr(struct pci_bus *bus) { return 0; } +static inline int pci_proc_domain(struct pci_bus *bus) +{ + return 0; +} +#endif + +#else /* CONFIG_PCI is not enabled */ + +/* + * If the system does not have PCI, clearly these return errors. Define + * these as simple inline functions to avoid hair in drivers. 
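Taken together, struct pci_driver, the PCI_DEVICE() id macro, the config/BAR accessors and the MSI wrappers above combine as in the following editorial sketch of a driver of this vintage. It is not part of the patch: the "foo" names and the 0x1234:0x5678 IDs are hypothetical, and kzalloc/ioremap come from <linux/slab.h> and <asm/io.h>.

struct foo_priv {
	void __iomem *regs;
	int have_msi;
};

static struct pci_device_id foo_ids[] = {
	{ PCI_DEVICE(0x1234, 0x5678) },
	{ 0, }
};

static int foo_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct foo_priv *priv;
	int rc;

	rc = pci_enable_device(pdev);
	if (rc)
		return rc;
	rc = pci_request_regions(pdev, "foo");
	if (rc)
		goto out_disable;

	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
	if (!priv) {
		rc = -ENOMEM;
		goto out_regions;
	}
	priv->regs = ioremap(pci_resource_start(pdev, 0),
			     pci_resource_len(pdev, 0));
	if (!priv->regs) {
		rc = -ENOMEM;
		goto out_free;
	}

	/* The !CONFIG_PCI_MSI stub above returns -1, so this quietly
	 * falls back to the pin-based interrupt left in pdev->irq. */
	priv->have_msi = (pci_enable_msi(pdev) == 0);

	pci_set_master(pdev);
	pci_set_drvdata(pdev, priv);
	return 0;

out_free:
	kfree(priv);
out_regions:
	pci_release_regions(pdev);
out_disable:
	pci_disable_device(pdev);
	return rc;
}

static struct pci_driver foo_driver = {
	.name     = "foo",
	.id_table = foo_ids,
	.probe    = foo_probe,
};

A module_init() hook would then call pci_register_driver(&foo_driver); the remove path would undo the probe steps in reverse (iounmap, kfree, pci_release_regions, pci_disable_device).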
+ */ + +#define _PCI_NOP(o,s,t) \ + static inline int pci_##o##_config_##s (struct pci_dev *dev, int where, t val) \ + { return PCIBIOS_FUNC_NOT_SUPPORTED; } +#define _PCI_NOP_ALL(o,x) _PCI_NOP(o,byte,u8 x) \ + _PCI_NOP(o,word,u16 x) \ + _PCI_NOP(o,dword,u32 x) +_PCI_NOP_ALL(read, *) +_PCI_NOP_ALL(write,) + +static inline struct pci_dev *pci_find_device(unsigned int vendor, unsigned int device, const struct pci_dev *from) +{ return NULL; } + +static inline struct pci_dev *pci_find_slot(unsigned int bus, unsigned int devfn) +{ return NULL; } + +static inline struct pci_dev *pci_get_device(unsigned int vendor, + unsigned int device, struct pci_dev *from) +{ return NULL; } + +static inline struct pci_dev *pci_get_device_reverse(unsigned int vendor, + unsigned int device, struct pci_dev *from) +{ return NULL; } + +static inline struct pci_dev *pci_get_subsys (unsigned int vendor, unsigned int device, +unsigned int ss_vendor, unsigned int ss_device, struct pci_dev *from) +{ return NULL; } + +static inline struct pci_dev *pci_get_class(unsigned int class, struct pci_dev *from) +{ return NULL; } + +#define pci_dev_present(ids) (0) +#define pci_dev_put(dev) do { } while (0) + +static inline void pci_set_master(struct pci_dev *dev) { } +static inline int pci_enable_device(struct pci_dev *dev) { return -EIO; } +static inline void pci_disable_device(struct pci_dev *dev) { } +static inline int pci_set_dma_mask(struct pci_dev *dev, u64 mask) { return -EIO; } +static inline int pci_assign_resource(struct pci_dev *dev, int i) { return -EBUSY;} +static inline int __pci_register_driver(struct pci_driver *drv, struct module *owner) { return 0;} +static inline int pci_register_driver(struct pci_driver *drv) { return 0;} +static inline void pci_unregister_driver(struct pci_driver *drv) { } +static inline int pci_find_capability (struct pci_dev *dev, int cap) {return 0; } +static inline int pci_find_next_capability (struct pci_dev *dev, u8 post, int cap) { return 0; } +static inline int pci_find_ext_capability (struct pci_dev *dev, int cap) {return 0; } +static inline const struct pci_device_id *pci_match_device(const struct pci_device_id *ids, const struct pci_dev *dev) { return NULL; } + +/* Power management related routines */ +static inline int pci_save_state(struct pci_dev *dev) { return 0; } +static inline int pci_restore_state(struct pci_dev *dev) { return 0; } +static inline int pci_set_power_state(struct pci_dev *dev, pci_power_t state) { return 0; } +static inline pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state) { return PCI_D0; } +static inline int pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable) { return 0; } + +#define isa_bridge ((struct pci_dev *)NULL) + +#define pci_dma_burst_advice(pdev, strat, strategy_parameter) do { } while (0) + +static inline void pci_block_user_cfg_access(struct pci_dev *dev) { } +static inline void pci_unblock_user_cfg_access(struct pci_dev *dev) { } + +#endif /* CONFIG_PCI */ + +/* Include architecture-dependent settings and functions */ + +#include <asm/pci.h> + +/* these helpers provide future and backwards compatibility + * for accessing popular PCI BAR info */ +#define pci_resource_start(dev,bar) ((dev)->resource[(bar)].start) +#define pci_resource_end(dev,bar) ((dev)->resource[(bar)].end) +#define pci_resource_flags(dev,bar) ((dev)->resource[(bar)].flags) +#define pci_resource_len(dev,bar) \ + ((pci_resource_start((dev),(bar)) == 0 && \ + pci_resource_end((dev),(bar)) == \ + pci_resource_start((dev),(bar))) ? 
0 : \ + \ + (pci_resource_end((dev),(bar)) - \ + pci_resource_start((dev),(bar)) + 1)) + +/* Similar to the helpers above, these manipulate per-pci_dev + * driver-specific data. They are really just a wrapper around + * the generic device structure functions of these calls. + */ +static inline void *pci_get_drvdata (struct pci_dev *pdev) +{ + return dev_get_drvdata(&pdev->dev); +} + +static inline void pci_set_drvdata (struct pci_dev *pdev, void *data) +{ + dev_set_drvdata(&pdev->dev, data); +} + +/* If you want to know what to call your pci_dev, ask this function. + * Again, it's a wrapper around the generic device. + */ +static inline char *pci_name(struct pci_dev *pdev) +{ + return pdev->dev.bus_id; +} + + +/* Some archs don't want to expose struct resource to userland as-is + * in sysfs and /proc + */ +#ifndef HAVE_ARCH_PCI_RESOURCE_TO_USER +static inline void pci_resource_to_user(const struct pci_dev *dev, int bar, + const struct resource *rsrc, resource_size_t *start, + resource_size_t *end) +{ + *start = rsrc->start; + *end = rsrc->end; +} +#endif /* HAVE_ARCH_PCI_RESOURCE_TO_USER */ + + +/* + * The world is not perfect and supplies us with broken PCI devices. + * For at least a part of these bugs we need a work-around, so both + * generic (drivers/pci/quirks.c) and per-architecture code can define + * fixup hooks to be called for particular buggy devices. + */ + +struct pci_fixup { + u16 vendor, device; /* You can use PCI_ANY_ID here of course */ + void (*hook)(struct pci_dev *dev); +}; + +enum pci_fixup_pass { + pci_fixup_early, /* Before probing BARs */ + pci_fixup_header, /* After reading configuration header */ + pci_fixup_final, /* Final phase of device fixups */ + pci_fixup_enable, /* pci_enable_device() time */ +}; + +/* Anonymous variables would be nice... 
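The fixup machinery described above is driven by the DECLARE_PCI_FIXUP_*() macros defined immediately below; a brief editorial sketch of how a quirk is wired up (not part of the patch; the 0x1234:0x5678 device is hypothetical):

static void quirk_foo_no_msi(struct pci_dev *dev)
{
	/* Hypothetical device whose MSI implementation is broken: the
	 * no_msi flag declared in struct pci_dev above keeps the core
	 * from ever enabling MSI on it. */
	dev->no_msi = 1;
}
DECLARE_PCI_FIXUP_FINAL(0x1234, 0x5678, quirk_foo_no_msi);

The hook runs once for each matching device during the pci_fixup_final pass.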
*/ +#define DECLARE_PCI_FIXUP_SECTION(section, name, vendor, device, hook) \ + static const struct pci_fixup __pci_fixup_##name __attribute_used__ \ + __attribute__((__section__(#section))) = { vendor, device, hook }; +#define DECLARE_PCI_FIXUP_EARLY(vendor, device, hook) \ + DECLARE_PCI_FIXUP_SECTION(.pci_fixup_early, \ + vendor##device##hook, vendor, device, hook) +#define DECLARE_PCI_FIXUP_HEADER(vendor, device, hook) \ + DECLARE_PCI_FIXUP_SECTION(.pci_fixup_header, \ + vendor##device##hook, vendor, device, hook) +#define DECLARE_PCI_FIXUP_FINAL(vendor, device, hook) \ + DECLARE_PCI_FIXUP_SECTION(.pci_fixup_final, \ + vendor##device##hook, vendor, device, hook) +#define DECLARE_PCI_FIXUP_ENABLE(vendor, device, hook) \ + DECLARE_PCI_FIXUP_SECTION(.pci_fixup_enable, \ + vendor##device##hook, vendor, device, hook) + + +void pci_fixup_device(enum pci_fixup_pass pass, struct pci_dev *dev); + +extern int pci_pci_problems; +#define PCIPCI_FAIL 1 /* No PCI PCI DMA */ +#define PCIPCI_TRITON 2 +#define PCIPCI_NATOMA 4 +#define PCIPCI_VIAETBF 8 +#define PCIPCI_VSFX 16 +#define PCIPCI_ALIMAGIK 32 /* Need low latency setting */ +#define PCIAGP_FAIL 64 /* No PCI to AGP DMA */ + +#endif /* __KERNEL__ */ +#endif /* LINUX_PCI_H */ diff -Naurp xen/include/asm-ia64/linux-xen/linux/pci.h xen-redhat/include/asm-ia64/linux-xen/linux/pci.h --- xen/include/asm-ia64/linux-xen/linux/pci.h +++ xen-redhat/include/asm-ia64/linux-xen/linux/pci.h @@ -1,820 +0,0 @@ -/* - * pci.h - * - * PCI defines and function prototypes - * Copyright 1994, Drew Eckhardt - * Copyright 1997--1999 Martin Mares <mj@ucw.cz> - * - * For more information, please consult the following manuals (look at - * http://www.pcisig.com/ for how to get them): - * - * PCI BIOS Specification - * PCI Local Bus Specification - * PCI to PCI Bridge Specification - * PCI System Design Guide - */ - -#ifndef LINUX_PCI_H -#define LINUX_PCI_H - -/* Include the pci register defines */ -#include <linux/pci_regs.h> - -/* Include the ID list */ -#include <linux/pci_ids.h> -#ifdef XEN -#include <asm/processor.h> -#endif - -/* - * The PCI interface treats multi-function devices as independent - * devices. The slot/function address of each device is encoded - * in a single byte as follows: - * - * 7:3 = slot - * 2:0 = function - */ -#define PCI_DEVFN(slot,func) ((((slot) & 0x1f) << 3) | ((func) & 0x07)) -#define PCI_SLOT(devfn) (((devfn) >> 3) & 0x1f) -#define PCI_FUNC(devfn) ((devfn) & 0x07) - -/* Ioctls for /proc/bus/pci/X/Y nodes. */ -#define PCIIOC_BASE ('P' << 24 | 'C' << 16 | 'I' << 8) -#define PCIIOC_CONTROLLER (PCIIOC_BASE | 0x00) /* Get controller for PCI device. */ -#define PCIIOC_MMAP_IS_IO (PCIIOC_BASE | 0x01) /* Set mmap state to I/O space. */ -#define PCIIOC_MMAP_IS_MEM (PCIIOC_BASE | 0x02) /* Set mmap state to MEM space. */ -#define PCIIOC_WRITE_COMBINE (PCIIOC_BASE | 0x03) /* Enable/disable write-combining. */ - -#ifdef __KERNEL__ - -#include <linux/mod_devicetable.h> - -#include <linux/types.h> -#include <linux/ioport.h> -#include <linux/list.h> -#include <linux/compiler.h> -#include <linux/errno.h> -#include <linux/device.h> - -/* File state for mmap()s on /proc/bus/pci/X/Y */ -enum pci_mmap_state { - pci_mmap_io, - pci_mmap_mem -}; - -/* This defines the direction arg to the DMA mapping routines. 
*/ -#define PCI_DMA_BIDIRECTIONAL 0 -#define PCI_DMA_TODEVICE 1 -#define PCI_DMA_FROMDEVICE 2 -#define PCI_DMA_NONE 3 - -#define DEVICE_COUNT_COMPATIBLE 4 -#define DEVICE_COUNT_RESOURCE 12 - -typedef int __bitwise pci_power_t; - -#define PCI_D0 ((pci_power_t __force) 0) -#define PCI_D1 ((pci_power_t __force) 1) -#define PCI_D2 ((pci_power_t __force) 2) -#define PCI_D3hot ((pci_power_t __force) 3) -#define PCI_D3cold ((pci_power_t __force) 4) -#define PCI_UNKNOWN ((pci_power_t __force) 5) -#define PCI_POWER_ERROR ((pci_power_t __force) -1) - -/** The pci_channel state describes connectivity between the CPU and - * the pci device. If some PCI bus between here and the pci device - * has crashed or locked up, this info is reflected here. - */ -typedef unsigned int __bitwise pci_channel_state_t; - -enum pci_channel_state { - /* I/O channel is in normal state */ - pci_channel_io_normal = (__force pci_channel_state_t) 1, - - /* I/O to channel is blocked */ - pci_channel_io_frozen = (__force pci_channel_state_t) 2, - - /* PCI card is dead */ - pci_channel_io_perm_failure = (__force pci_channel_state_t) 3, -}; - -typedef unsigned short __bitwise pci_bus_flags_t; -enum pci_bus_flags { - PCI_BUS_FLAGS_NO_MSI = (__force pci_bus_flags_t) 1, -}; - -struct pci_cap_saved_state { - struct hlist_node next; - char cap_nr; - u32 data[0]; -}; - -/* - * The pci_dev structure is used to describe PCI devices. - */ -struct pci_dev { - struct list_head global_list; /* node in list of all PCI devices */ - struct list_head bus_list; /* node in per-bus list */ - struct pci_bus *bus; /* bus this device is on */ - struct pci_bus *subordinate; /* bus this device bridges to */ - - void *sysdata; /* hook for sys-specific extension */ - struct proc_dir_entry *procent; /* device entry in /proc/bus/pci */ - - unsigned int devfn; /* encoded device & function index */ - unsigned short vendor; - unsigned short device; - unsigned short subsystem_vendor; - unsigned short subsystem_device; - unsigned int class; /* 3 bytes: (base,sub,prog-if) */ - u8 hdr_type; /* PCI header type (`multi' flag masked out) */ - u8 rom_base_reg; /* which config register controls the ROM */ - u8 pin; /* which interrupt pin this device uses */ - - struct pci_driver *driver; /* which driver has allocated this device */ - u64 dma_mask; /* Mask of the bits of bus address this - device implements. Normally this is - 0xffffffff. You only need to change - this if your device has broken DMA - or supports 64-bit transfers. */ - - pci_power_t current_state; /* Current operating state. In ACPI-speak, - this is D0-D3, D0 being fully functional, - and D3 being off. */ - - pci_channel_state_t error_state; /* current connectivity state */ - struct device dev; /* Generic device interface */ - - /* device is compatible with these IDs */ - unsigned short vendor_compatible[DEVICE_COUNT_COMPATIBLE]; - unsigned short device_compatible[DEVICE_COUNT_COMPATIBLE]; - - int cfg_size; /* Size of configuration space */ - - /* - * Instead of touching interrupt line and base address registers - * directly, use the values stored here. They might be different! 
- */ - unsigned int irq; - struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */ - - /* These fields are used by common fixups */ - unsigned int transparent:1; /* Transparent PCI bridge */ - unsigned int multifunction:1;/* Part of multi-function device */ - /* keep track of device state */ - unsigned int is_enabled:1; /* pci_enable_device has been called */ - unsigned int is_busmaster:1; /* device is busmaster */ - unsigned int no_msi:1; /* device may not use msi */ - unsigned int no_d1d2:1; /* only allow d0 or d3 */ - unsigned int block_ucfg_access:1; /* userspace config space access is blocked */ - unsigned int broken_parity_status:1; /* Device generates false positive parity */ - unsigned int msi_enabled:1; - unsigned int msix_enabled:1; - - u32 saved_config_space[16]; /* config space saved at suspend time */ - struct hlist_head saved_cap_space; - struct bin_attribute *rom_attr; /* attribute descriptor for sysfs ROM entry */ - int rom_attr_enabled; /* has display of the rom attribute been enabled? */ - struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE]; /* sysfs file for resources */ -}; - -#define pci_dev_g(n) list_entry(n, struct pci_dev, global_list) -#define pci_dev_b(n) list_entry(n, struct pci_dev, bus_list) -#define to_pci_dev(n) container_of(n, struct pci_dev, dev) -#define for_each_pci_dev(d) while ((d = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, d)) != NULL) - -static inline struct pci_cap_saved_state *pci_find_saved_cap( - struct pci_dev *pci_dev,char cap) -{ - struct pci_cap_saved_state *tmp; - struct hlist_node *pos; - - hlist_for_each_entry(tmp, pos, &pci_dev->saved_cap_space, next) { - if (tmp->cap_nr == cap) - return tmp; - } - return NULL; -} - -static inline void pci_add_saved_cap(struct pci_dev *pci_dev, - struct pci_cap_saved_state *new_cap) -{ - hlist_add_head(&new_cap->next, &pci_dev->saved_cap_space); -} - -static inline void pci_remove_saved_cap(struct pci_cap_saved_state *cap) -{ - hlist_del(&cap->next); -} - -/* - * For PCI devices, the region numbers are assigned this way: - * - * 0-5 standard PCI regions - * 6 expansion ROM - * 7-10 bridges: address space assigned to buses behind the bridge - */ - -#define PCI_ROM_RESOURCE 6 -#define PCI_BRIDGE_RESOURCES 7 -#define PCI_NUM_RESOURCES 11 - -#ifndef PCI_BUS_NUM_RESOURCES -#define PCI_BUS_NUM_RESOURCES 8 -#endif - -#define PCI_REGION_FLAG_MASK 0x0fU /* These bits of resource flags tell us the PCI region flags */ - -struct pci_bus { - struct list_head node; /* node in list of buses */ - struct pci_bus *parent; /* parent bus this bridge is on */ - struct list_head children; /* list of child buses */ - struct list_head devices; /* list of devices on this bus */ - struct pci_dev *self; /* bridge device as seen by parent */ - struct resource *resource[PCI_BUS_NUM_RESOURCES]; - /* address space routed to this bus */ - - struct pci_ops *ops; /* configuration access functions */ - void *sysdata; /* hook for sys-specific extension */ - struct proc_dir_entry *procdir; /* directory entry in /proc/bus/pci */ - - unsigned char number; /* bus number */ - unsigned char primary; /* number of primary bridge */ - unsigned char secondary; /* number of secondary bridge */ - unsigned char subordinate; /* max number of subordinate buses */ - - char name[48]; - - unsigned short bridge_ctl; /* manage NO_ISA/FBB/et al behaviors */ - pci_bus_flags_t bus_flags; /* Inherited by child busses */ - struct device *bridge; - struct class_device class_dev; - struct bin_attribute *legacy_io; /* legacy I/O 
for this bus */ - struct bin_attribute *legacy_mem; /* legacy mem */ -}; - -#define pci_bus_b(n) list_entry(n, struct pci_bus, node) -#define to_pci_bus(n) container_of(n, struct pci_bus, class_dev) - -/* - * Error values that may be returned by PCI functions. - */ -#define PCIBIOS_SUCCESSFUL 0x00 -#define PCIBIOS_FUNC_NOT_SUPPORTED 0x81 -#define PCIBIOS_BAD_VENDOR_ID 0x83 -#define PCIBIOS_DEVICE_NOT_FOUND 0x86 -#define PCIBIOS_BAD_REGISTER_NUMBER 0x87 -#define PCIBIOS_SET_FAILED 0x88 -#define PCIBIOS_BUFFER_TOO_SMALL 0x89 - -/* Low-level architecture-dependent routines */ - -struct pci_ops { - int (*read)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val); - int (*write)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val); -}; - -struct pci_raw_ops { - int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn, - int reg, int len, u32 *val); - int (*write)(unsigned int domain, unsigned int bus, unsigned int devfn, - int reg, int len, u32 val); -}; - -extern struct pci_raw_ops *raw_pci_ops; - -struct pci_bus_region { - unsigned long start; - unsigned long end; -}; - -struct pci_dynids { - spinlock_t lock; /* protects list, index */ - struct list_head list; /* for IDs added at runtime */ - unsigned int use_driver_data:1; /* pci_driver->driver_data is used */ -}; - -/* ---------------------------------------------------------------- */ -/** PCI Error Recovery System (PCI-ERS). If a PCI device driver provides - * a set fof callbacks in struct pci_error_handlers, then that device driver - * will be notified of PCI bus errors, and will be driven to recovery - * when an error occurs. - */ - -typedef unsigned int __bitwise pci_ers_result_t; - -enum pci_ers_result { - /* no result/none/not supported in device driver */ - PCI_ERS_RESULT_NONE = (__force pci_ers_result_t) 1, - - /* Device driver can recover without slot reset */ - PCI_ERS_RESULT_CAN_RECOVER = (__force pci_ers_result_t) 2, - - /* Device driver wants slot to be reset. 
*/ - PCI_ERS_RESULT_NEED_RESET = (__force pci_ers_result_t) 3, - - /* Device has completely failed, is unrecoverable */ - PCI_ERS_RESULT_DISCONNECT = (__force pci_ers_result_t) 4, - - /* Device driver is fully recovered and operational */ - PCI_ERS_RESULT_RECOVERED = (__force pci_ers_result_t) 5, -}; - -/* PCI bus error event callbacks */ -struct pci_error_handlers -{ - /* PCI bus error detected on this device */ - pci_ers_result_t (*error_detected)(struct pci_dev *dev, - enum pci_channel_state error); - - /* MMIO has been re-enabled, but not DMA */ - pci_ers_result_t (*mmio_enabled)(struct pci_dev *dev); - - /* PCI Express link has been reset */ - pci_ers_result_t (*link_reset)(struct pci_dev *dev); - - /* PCI slot has been reset */ - pci_ers_result_t (*slot_reset)(struct pci_dev *dev); - - /* Device driver may resume normal operations */ - void (*resume)(struct pci_dev *dev); -}; - -/* ---------------------------------------------------------------- */ - -struct module; -struct pci_driver { - struct list_head node; - char *name; - const struct pci_device_id *id_table; /* must be non-NULL for probe to be called */ - int (*probe) (struct pci_dev *dev, const struct pci_device_id *id); /* New device inserted */ - void (*remove) (struct pci_dev *dev); /* Device removed (NULL if not a hot-plug capable driver) */ - int (*suspend) (struct pci_dev *dev, pm_message_t state); /* Device suspended */ - int (*suspend_late) (struct pci_dev *dev, pm_message_t state); - int (*resume_early) (struct pci_dev *dev); - int (*resume) (struct pci_dev *dev); /* Device woken up */ - int (*enable_wake) (struct pci_dev *dev, pci_power_t state, int enable); /* Enable wake event */ - void (*shutdown) (struct pci_dev *dev); - - struct pci_error_handlers *err_handler; - struct device_driver driver; - struct pci_dynids dynids; - - int multithread_probe; -}; - -#define to_pci_driver(drv) container_of(drv,struct pci_driver, driver) - -/** - * PCI_DEVICE - macro used to describe a specific pci device - * @vend: the 16 bit PCI Vendor ID - * @dev: the 16 bit PCI Device ID - * - * This macro is used to create a struct pci_device_id that matches a - * specific device. The subvendor and subdevice fields will be set to - * PCI_ANY_ID. - */ -#define PCI_DEVICE(vend,dev) \ - .vendor = (vend), .device = (dev), \ - .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID - -/** - * PCI_DEVICE_CLASS - macro used to describe a specific pci device class - * @dev_class: the class, subclass, prog-if triple for this device - * @dev_class_mask: the class mask for this device - * - * This macro is used to create a struct pci_device_id that matches a - * specific PCI class. The vendor, device, subvendor, and subdevice - * fields will be set to PCI_ANY_ID. - */ -#define PCI_DEVICE_CLASS(dev_class,dev_class_mask) \ - .class = (dev_class), .class_mask = (dev_class_mask), \ - .vendor = PCI_ANY_ID, .device = PCI_ANY_ID, \ - .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID - -/* - * pci_module_init is obsolete, this stays here till we fix up all usages of it - * in the tree. - */ -#define pci_module_init pci_register_driver - -/* these external functions are only available when PCI support is enabled */ -#ifdef CONFIG_PCI - -extern struct bus_type pci_bus_type; - -/* Do NOT directly access these two variables, unless you are arch specific pci - * code, or pci core code. 
*/ -extern struct list_head pci_root_buses; /* list of all known PCI buses */ -extern struct list_head pci_devices; /* list of all devices */ - -void pcibios_fixup_bus(struct pci_bus *); -int __must_check pcibios_enable_device(struct pci_dev *, int mask); -char *pcibios_setup (char *str); - -/* Used only when drivers/pci/setup.c is used */ -void pcibios_align_resource(void *, struct resource *, resource_size_t, - resource_size_t); -void pcibios_update_irq(struct pci_dev *, int irq); - -/* Generic PCI functions used internally */ - -extern struct pci_bus *pci_find_bus(int domain, int busnr); -void pci_bus_add_devices(struct pci_bus *bus); -struct pci_bus *pci_scan_bus_parented(struct device *parent, int bus, struct pci_ops *ops, void *sysdata); -static inline struct pci_bus *pci_scan_bus(int bus, struct pci_ops *ops, void *sysdata) -{ - struct pci_bus *root_bus; - root_bus = pci_scan_bus_parented(NULL, bus, ops, sysdata); - if (root_bus) - pci_bus_add_devices(root_bus); - return root_bus; -} -struct pci_bus *pci_create_bus(struct device *parent, int bus, struct pci_ops *ops, void *sysdata); -struct pci_bus * pci_add_new_bus(struct pci_bus *parent, struct pci_dev *dev, int busnr); -int pci_scan_slot(struct pci_bus *bus, int devfn); -struct pci_dev * pci_scan_single_device(struct pci_bus *bus, int devfn); -void pci_device_add(struct pci_dev *dev, struct pci_bus *bus); -unsigned int pci_scan_child_bus(struct pci_bus *bus); -int __must_check pci_bus_add_device(struct pci_dev *dev); -void pci_read_bridge_bases(struct pci_bus *child); -struct resource *pci_find_parent_resource(const struct pci_dev *dev, struct resource *res); -int pci_get_interrupt_pin(struct pci_dev *dev, struct pci_dev **bridge); -extern struct pci_dev *pci_dev_get(struct pci_dev *dev); -extern void pci_dev_put(struct pci_dev *dev); -extern void pci_remove_bus(struct pci_bus *b); -extern void pci_remove_bus_device(struct pci_dev *dev); -extern void pci_stop_bus_device(struct pci_dev *dev); -void pci_setup_cardbus(struct pci_bus *bus); -extern void pci_sort_breadthfirst(void); - -/* Generic PCI functions exported to card drivers */ - -struct pci_dev *pci_find_device (unsigned int vendor, unsigned int device, const struct pci_dev *from); -struct pci_dev *pci_find_device_reverse (unsigned int vendor, unsigned int device, const struct pci_dev *from); -struct pci_dev *pci_find_slot (unsigned int bus, unsigned int devfn); -int pci_find_capability (struct pci_dev *dev, int cap); -int pci_find_next_capability (struct pci_dev *dev, u8 pos, int cap); -int pci_find_ext_capability (struct pci_dev *dev, int cap); -struct pci_bus *pci_find_next_bus(const struct pci_bus *from); - -struct pci_dev *pci_get_device(unsigned int vendor, unsigned int device, - struct pci_dev *from); -struct pci_dev *pci_get_device_reverse(unsigned int vendor, unsigned int device, - struct pci_dev *from); - -struct pci_dev *pci_get_subsys (unsigned int vendor, unsigned int device, - unsigned int ss_vendor, unsigned int ss_device, - struct pci_dev *from); -struct pci_dev *pci_get_slot (struct pci_bus *bus, unsigned int devfn); -struct pci_dev *pci_get_bus_and_slot (unsigned int bus, unsigned int devfn); -struct pci_dev *pci_get_class (unsigned int class, struct pci_dev *from); -int pci_dev_present(const struct pci_device_id *ids); - -int pci_bus_read_config_byte (struct pci_bus *bus, unsigned int devfn, int where, u8 *val); -int pci_bus_read_config_word (struct pci_bus *bus, unsigned int devfn, int where, u16 *val); -int pci_bus_read_config_dword (struct pci_bus 
*bus, unsigned int devfn, int where, u32 *val); -int pci_bus_write_config_byte (struct pci_bus *bus, unsigned int devfn, int where, u8 val); -int pci_bus_write_config_word (struct pci_bus *bus, unsigned int devfn, int where, u16 val); -int pci_bus_write_config_dword (struct pci_bus *bus, unsigned int devfn, int where, u32 val); - -static inline int pci_read_config_byte(struct pci_dev *dev, int where, u8 *val) -{ - return pci_bus_read_config_byte (dev->bus, dev->devfn, where, val); -} -static inline int pci_read_config_word(struct pci_dev *dev, int where, u16 *val) -{ - return pci_bus_read_config_word (dev->bus, dev->devfn, where, val); -} -static inline int pci_read_config_dword(struct pci_dev *dev, int where, u32 *val) -{ - return pci_bus_read_config_dword (dev->bus, dev->devfn, where, val); -} -static inline int pci_write_config_byte(struct pci_dev *dev, int where, u8 val) -{ - return pci_bus_write_config_byte (dev->bus, dev->devfn, where, val); -} -static inline int pci_write_config_word(struct pci_dev *dev, int where, u16 val) -{ - return pci_bus_write_config_word (dev->bus, dev->devfn, where, val); -} -static inline int pci_write_config_dword(struct pci_dev *dev, int where, u32 val) -{ - return pci_bus_write_config_dword (dev->bus, dev->devfn, where, val); -} - -int __must_check pci_enable_device(struct pci_dev *dev); -int __must_check pci_enable_device_bars(struct pci_dev *dev, int mask); -void pci_disable_device(struct pci_dev *dev); -void pci_set_master(struct pci_dev *dev); -#define HAVE_PCI_SET_MWI -int __must_check pci_set_mwi(struct pci_dev *dev); -void pci_clear_mwi(struct pci_dev *dev); -void pci_intx(struct pci_dev *dev, int enable); -int pci_set_dma_mask(struct pci_dev *dev, u64 mask); -int pci_set_consistent_dma_mask(struct pci_dev *dev, u64 mask); -void pci_update_resource(struct pci_dev *dev, struct resource *res, int resno); -int __must_check pci_assign_resource(struct pci_dev *dev, int i); -int __must_check pci_assign_resource_fixed(struct pci_dev *dev, int i); -void pci_restore_bars(struct pci_dev *dev); - -/* ROM control related routines */ -void __iomem __must_check *pci_map_rom(struct pci_dev *pdev, size_t *size); -void __iomem __must_check *pci_map_rom_copy(struct pci_dev *pdev, size_t *size); -void pci_unmap_rom(struct pci_dev *pdev, void __iomem *rom); -void pci_remove_rom(struct pci_dev *pdev); - -/* Power management related routines */ -int pci_save_state(struct pci_dev *dev); -int pci_restore_state(struct pci_dev *dev); -int pci_set_power_state(struct pci_dev *dev, pci_power_t state); -pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state); -int pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable); - -/* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */ -void pci_bus_assign_resources(struct pci_bus *bus); -void pci_bus_size_bridges(struct pci_bus *bus); -int pci_claim_resource(struct pci_dev *, int); -void pci_assign_unassigned_resources(void); -void pdev_enable_device(struct pci_dev *); -void pdev_sort_resources(struct pci_dev *, struct resource_list *); -void pci_fixup_irqs(u8 (*)(struct pci_dev *, u8 *), - int (*)(struct pci_dev *, u8, u8)); -#define HAVE_PCI_REQ_REGIONS 2 -int __must_check pci_request_regions(struct pci_dev *, const char *); -void pci_release_regions(struct pci_dev *); -int __must_check pci_request_region(struct pci_dev *, int, const char *); -void pci_release_region(struct pci_dev *, int); - -/* drivers/pci/bus.c */ -int __must_check pci_bus_alloc_resource(struct pci_bus *bus, - struct 
resource *res, resource_size_t size, - resource_size_t align, resource_size_t min, - unsigned int type_mask, - void (*alignf)(void *, struct resource *, - resource_size_t, resource_size_t), - void *alignf_data); -void pci_enable_bridges(struct pci_bus *bus); - -/* Proper probing supporting hot-pluggable devices */ -int __must_check __pci_register_driver(struct pci_driver *, struct module *); -static inline int __must_check pci_register_driver(struct pci_driver *driver) -{ - return __pci_register_driver(driver, THIS_MODULE); -} - -void pci_unregister_driver(struct pci_driver *); -void pci_remove_behind_bridge(struct pci_dev *); -struct pci_driver *pci_dev_driver(const struct pci_dev *); -const struct pci_device_id *pci_match_device(struct pci_driver *drv, struct pci_dev *dev); -const struct pci_device_id *pci_match_id(const struct pci_device_id *ids, struct pci_dev *dev); -int pci_scan_bridge(struct pci_bus *bus, struct pci_dev * dev, int max, int pass); - -void pci_walk_bus(struct pci_bus *top, void (*cb)(struct pci_dev *, void *), - void *userdata); -int pci_cfg_space_size(struct pci_dev *dev); -unsigned char pci_bus_max_busnr(struct pci_bus* bus); - -/* kmem_cache style wrapper around pci_alloc_consistent() */ - -#include <linux/dmapool.h> - -#define pci_pool dma_pool -#define pci_pool_create(name, pdev, size, align, allocation) \ - dma_pool_create(name, &pdev->dev, size, align, allocation) -#define pci_pool_destroy(pool) dma_pool_destroy(pool) -#define pci_pool_alloc(pool, flags, handle) dma_pool_alloc(pool, flags, handle) -#define pci_pool_free(pool, vaddr, addr) dma_pool_free(pool, vaddr, addr) - -enum pci_dma_burst_strategy { - PCI_DMA_BURST_INFINITY, /* make bursts as large as possible, - strategy_parameter is N/A */ - PCI_DMA_BURST_BOUNDARY, /* disconnect at every strategy_parameter - byte boundaries */ - PCI_DMA_BURST_MULTIPLE, /* disconnect at some multiple of - strategy_parameter byte boundaries */ -}; - -#if defined(CONFIG_ISA) || defined(CONFIG_EISA) -extern struct pci_dev *isa_bridge; -#endif - -struct msix_entry { - u16 vector; /* kernel uses to write allocated vector */ - u16 entry; /* driver uses to specify entry, OS writes */ -}; - - -#ifndef CONFIG_PCI_MSI -static inline void pci_scan_msi_device(struct pci_dev *dev) {} -static inline int pci_enable_msi(struct pci_dev *dev) {return -1;} -static inline void pci_disable_msi(struct pci_dev *dev) {} -static inline int pci_enable_msix(struct pci_dev* dev, - struct msix_entry *entries, int nvec) {return -1;} -static inline void pci_disable_msix(struct pci_dev *dev) {} -static inline void msi_remove_pci_irq_vectors(struct pci_dev *dev) {} -#else -extern void pci_scan_msi_device(struct pci_dev *dev); -extern int pci_enable_msi(struct pci_dev *dev); -extern void pci_disable_msi(struct pci_dev *dev); -extern int pci_enable_msix(struct pci_dev* dev, - struct msix_entry *entries, int nvec); -extern void pci_disable_msix(struct pci_dev *dev); -extern void msi_remove_pci_irq_vectors(struct pci_dev *dev); -#endif - -#ifdef CONFIG_HT_IRQ -/* The functions a driver should call */ -int ht_create_irq(struct pci_dev *dev, int idx); -void ht_destroy_irq(unsigned int irq); -#endif /* CONFIG_HT_IRQ */ - -extern void pci_block_user_cfg_access(struct pci_dev *dev); -extern void pci_unblock_user_cfg_access(struct pci_dev *dev); - -/* - * PCI domain support. Sometimes called PCI segment (eg by ACPI), - * a PCI domain is defined to be a set of PCI busses which share - * configuration space. 
- */ -#ifndef CONFIG_PCI_DOMAINS -static inline int pci_domain_nr(struct pci_bus *bus) { return 0; } -static inline int pci_proc_domain(struct pci_bus *bus) -{ - return 0; -} -#endif - -#else /* CONFIG_PCI is not enabled */ - -/* - * If the system does not have PCI, clearly these return errors. Define - * these as simple inline functions to avoid hair in drivers. - */ - -#define _PCI_NOP(o,s,t) \ - static inline int pci_##o##_config_##s (struct pci_dev *dev, int where, t val) \ - { return PCIBIOS_FUNC_NOT_SUPPORTED; } -#define _PCI_NOP_ALL(o,x) _PCI_NOP(o,byte,u8 x) \ - _PCI_NOP(o,word,u16 x) \ - _PCI_NOP(o,dword,u32 x) -_PCI_NOP_ALL(read, *) -_PCI_NOP_ALL(write,) - -static inline struct pci_dev *pci_find_device(unsigned int vendor, unsigned int device, const struct pci_dev *from) -{ return NULL; } - -static inline struct pci_dev *pci_find_slot(unsigned int bus, unsigned int devfn) -{ return NULL; } - -static inline struct pci_dev *pci_get_device(unsigned int vendor, - unsigned int device, struct pci_dev *from) -{ return NULL; } - -static inline struct pci_dev *pci_get_device_reverse(unsigned int vendor, - unsigned int device, struct pci_dev *from) -{ return NULL; } - -static inline struct pci_dev *pci_get_subsys (unsigned int vendor, unsigned int device, -unsigned int ss_vendor, unsigned int ss_device, struct pci_dev *from) -{ return NULL; } - -static inline struct pci_dev *pci_get_class(unsigned int class, struct pci_dev *from) -{ return NULL; } - -#define pci_dev_present(ids) (0) -#define pci_dev_put(dev) do { } while (0) - -static inline void pci_set_master(struct pci_dev *dev) { } -static inline int pci_enable_device(struct pci_dev *dev) { return -EIO; } -static inline void pci_disable_device(struct pci_dev *dev) { } -static inline int pci_set_dma_mask(struct pci_dev *dev, u64 mask) { return -EIO; } -static inline int pci_assign_resource(struct pci_dev *dev, int i) { return -EBUSY;} -static inline int __pci_register_driver(struct pci_driver *drv, struct module *owner) { return 0;} -static inline int pci_register_driver(struct pci_driver *drv) { return 0;} -static inline void pci_unregister_driver(struct pci_driver *drv) { } -static inline int pci_find_capability (struct pci_dev *dev, int cap) {return 0; } -static inline int pci_find_next_capability (struct pci_dev *dev, u8 post, int cap) { return 0; } -static inline int pci_find_ext_capability (struct pci_dev *dev, int cap) {return 0; } -static inline const struct pci_device_id *pci_match_device(const struct pci_device_id *ids, const struct pci_dev *dev) { return NULL; } - -/* Power management related routines */ -static inline int pci_save_state(struct pci_dev *dev) { return 0; } -static inline int pci_restore_state(struct pci_dev *dev) { return 0; } -static inline int pci_set_power_state(struct pci_dev *dev, pci_power_t state) { return 0; } -static inline pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state) { return PCI_D0; } -static inline int pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable) { return 0; } - -#define isa_bridge ((struct pci_dev *)NULL) - -#define pci_dma_burst_advice(pdev, strat, strategy_parameter) do { } while (0) - -static inline void pci_block_user_cfg_access(struct pci_dev *dev) { } -static inline void pci_unblock_user_cfg_access(struct pci_dev *dev) { } - -#endif /* CONFIG_PCI */ - -/* Include architecture-dependent settings and functions */ - -#include <asm/pci.h> - -/* these helpers provide future and backwards compatibility - * for accessing popular PCI BAR info */ 
-#define pci_resource_start(dev,bar) ((dev)->resource[(bar)].start) -#define pci_resource_end(dev,bar) ((dev)->resource[(bar)].end) -#define pci_resource_flags(dev,bar) ((dev)->resource[(bar)].flags) -#define pci_resource_len(dev,bar) \ - ((pci_resource_start((dev),(bar)) == 0 && \ - pci_resource_end((dev),(bar)) == \ - pci_resource_start((dev),(bar))) ? 0 : \ - \ - (pci_resource_end((dev),(bar)) - \ - pci_resource_start((dev),(bar)) + 1)) - -/* Similar to the helpers above, these manipulate per-pci_dev - * driver-specific data. They are really just a wrapper around - * the generic device structure functions of these calls. - */ -static inline void *pci_get_drvdata (struct pci_dev *pdev) -{ - return dev_get_drvdata(&pdev->dev); -} - -static inline void pci_set_drvdata (struct pci_dev *pdev, void *data) -{ - dev_set_drvdata(&pdev->dev, data); -} - -/* If you want to know what to call your pci_dev, ask this function. - * Again, it's a wrapper around the generic device. - */ -static inline char *pci_name(struct pci_dev *pdev) -{ - return pdev->dev.bus_id; -} - - -/* Some archs don't want to expose struct resource to userland as-is - * in sysfs and /proc - */ -#ifndef HAVE_ARCH_PCI_RESOURCE_TO_USER -static inline void pci_resource_to_user(const struct pci_dev *dev, int bar, - const struct resource *rsrc, resource_size_t *start, - resource_size_t *end) -{ - *start = rsrc->start; - *end = rsrc->end; -} -#endif /* HAVE_ARCH_PCI_RESOURCE_TO_USER */ - - -/* - * The world is not perfect and supplies us with broken PCI devices. - * For at least a part of these bugs we need a work-around, so both - * generic (drivers/pci/quirks.c) and per-architecture code can define - * fixup hooks to be called for particular buggy devices. - */ - -struct pci_fixup { - u16 vendor, device; /* You can use PCI_ANY_ID here of course */ - void (*hook)(struct pci_dev *dev); -}; - -enum pci_fixup_pass { - pci_fixup_early, /* Before probing BARs */ - pci_fixup_header, /* After reading configuration header */ - pci_fixup_final, /* Final phase of device fixups */ - pci_fixup_enable, /* pci_enable_device() time */ -}; - -/* Anonymous variables would be nice... 
*/ -#define DECLARE_PCI_FIXUP_SECTION(section, name, vendor, device, hook) \ - static const struct pci_fixup __pci_fixup_##name __attribute_used__ \ - __attribute__((__section__(#section))) = { vendor, device, hook }; -#define DECLARE_PCI_FIXUP_EARLY(vendor, device, hook) \ - DECLARE_PCI_FIXUP_SECTION(.pci_fixup_early, \ - vendor##device##hook, vendor, device, hook) -#define DECLARE_PCI_FIXUP_HEADER(vendor, device, hook) \ - DECLARE_PCI_FIXUP_SECTION(.pci_fixup_header, \ - vendor##device##hook, vendor, device, hook) -#define DECLARE_PCI_FIXUP_FINAL(vendor, device, hook) \ - DECLARE_PCI_FIXUP_SECTION(.pci_fixup_final, \ - vendor##device##hook, vendor, device, hook) -#define DECLARE_PCI_FIXUP_ENABLE(vendor, device, hook) \ - DECLARE_PCI_FIXUP_SECTION(.pci_fixup_enable, \ - vendor##device##hook, vendor, device, hook) - - -void pci_fixup_device(enum pci_fixup_pass pass, struct pci_dev *dev); - -extern int pci_pci_problems; -#define PCIPCI_FAIL 1 /* No PCI PCI DMA */ -#define PCIPCI_TRITON 2 -#define PCIPCI_NATOMA 4 -#define PCIPCI_VIAETBF 8 -#define PCIPCI_VSFX 16 -#define PCIPCI_ALIMAGIK 32 /* Need low latency setting */ -#define PCIAGP_FAIL 64 /* No PCI to AGP DMA */ - -#endif /* __KERNEL__ */ -#endif /* LINUX_PCI_H */ diff -Naurp xen/include/asm-ia64/mm.h xen-redhat/include/asm-ia64/mm.h --- xen/include/asm-ia64/mm.h +++ xen-redhat/include/asm-ia64/mm.h @@ -417,7 +417,7 @@ extern unsigned long totalram_pages; extern int nr_swap_pages; extern void alloc_dom_xen_and_dom_io(void); -extern void mm_teardown(struct domain* d); +extern int mm_teardown(struct domain* d); extern void mm_final_teardown(struct domain* d); extern struct page_info * assign_new_domain_page(struct domain *d, unsigned long mpaddr); extern void assign_new_domain0_page(struct domain *d, unsigned long mpaddr); @@ -508,4 +508,6 @@ int steal_page( #define domain_get_maximum_gpfn(d) (-ENOSYS) +extern struct domain *dom_xen, *dom_io; /* for vmcoreinfo */ + #endif /* __ASM_IA64_MM_H__ */ diff -Naurp xen/include/asm-ia64/regionreg.h xen-redhat/include/asm-ia64/regionreg.h --- xen/include/asm-ia64/regionreg.h +++ xen-redhat/include/asm-ia64/regionreg.h @@ -76,7 +76,8 @@ extern int deallocate_rid_range(struct d struct vcpu; extern void init_all_rr(struct vcpu *v); -extern int set_metaphysical_rr0(void); +extern void set_virtual_rr0(void); +extern void set_metaphysical_rr0(void); extern void load_region_regs(struct vcpu *v); diff -Naurp xen/include/asm-ia64/shadow.h xen-redhat/include/asm-ia64/shadow.h --- xen/include/asm-ia64/shadow.h +++ xen-redhat/include/asm-ia64/shadow.h @@ -40,8 +40,8 @@ * Utilities to change relationship of gpfn->mfn for designated domain, * which is required by gnttab transfer, balloon, device model and etc. 
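Aside from the PCI header removal, the asm-ia64/mm.h hunk above changes mm_teardown() from void to int. A hedged editorial sketch of what a caller now looks like (the retry/continuation convention is an assumption, not something stated by this patch):

static int example_relinquish_resources(struct domain *d)
{
	int rc = mm_teardown(d);	/* now reports a status */

	if (rc)
		return rc;	/* e.g. back off and be retried later */
	/* ... release the remaining per-domain resources ... */
	return 0;
}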
*/ -void guest_physmap_add_page(struct domain *d, unsigned long gpfn, unsigned long mfn); -void guest_physmap_remove_page(struct domain *d, unsigned long gpfn, unsigned long mfn); +int guest_physmap_add_page(struct domain *d, unsigned long gpfn, unsigned long mfn, int order); +void guest_physmap_remove_page(struct domain *d, unsigned long gpfn, unsigned long mfn, int order); static inline int shadow_mode_enabled(struct domain *d) diff -Naurp xen/include/asm-ia64/vmmu.h xen-redhat/include/asm-ia64/vmmu.h --- xen/include/asm-ia64/vmmu.h +++ xen-redhat/include/asm-ia64/vmmu.h @@ -24,12 +24,8 @@ #define XEN_TLBthash_H #define MAX_CCN_DEPTH (15) // collision chain depth -#define VCPU_VTLB_SHIFT (20) // 1M for VTLB -#define VCPU_VTLB_SIZE (1UL<<VCPU_VTLB_SHIFT) -#define VCPU_VTLB_ORDER (VCPU_VTLB_SHIFT - PAGE_SHIFT) -#define VCPU_VHPT_SHIFT (24) // 16M for VTLB -#define VCPU_VHPT_SIZE (1UL<<VCPU_VHPT_SHIFT) -#define VCPU_VHPT_ORDER (VCPU_VHPT_SHIFT - PAGE_SHIFT) +#define DEFAULT_VTLB_SZ (14) // 16K hash + 16K c-chain for VTLB +#define DEFAULT_VHPT_SZ (23) // 8M hash + 8M c-chain for VHPT #define VTLB(v,_x) (v->arch.vtlb._x) #define VHPT(v,_x) (v->arch.vhpt._x) #ifndef __ASSEMBLY__ @@ -195,15 +191,17 @@ typedef struct thash_cb { u64 hash_sz; // size of above data. void *cch_buf; // base address of collision chain. u64 cch_sz; // size of above data. + u64 cch_free_idx; // index of free entry. thash_data_t *cch_freelist; - thash_data_t *cch_rec_head; // cch recycle header PTA pta; } thash_cb_t; /* - * Initialize internal control data before service. + * Allocate and initialize internal control data before service. */ -extern void thash_init(thash_cb_t *hcb, u64 sz); +extern int thash_alloc(thash_cb_t *hcb, u64 sz, char *what); + +extern void thash_free(thash_cb_t *hcb); /* * Insert an entry to hash table. 
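The asm-ia64/shadow.h hunk above widens guest_physmap_add_page()/guest_physmap_remove_page() with an order argument and gives the add path an int return. A minimal editorial sketch of a caller updated for the new prototypes (order 0, i.e. a single page; the exact error values are whatever the implementation reports):

static int example_add_one_page(struct domain *d, unsigned long gpfn,
				unsigned long mfn)
{
	/* order 0 == one page; the result must now be checked. */
	return guest_physmap_add_page(d, gpfn, mfn, 0);
}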
@@ -279,6 +277,7 @@ extern void thash_purge_and_insert(struc * */ extern void thash_purge_all(struct vcpu *v); +extern void vmx_vcpu_flush_vtlb_all(struct vcpu *v); /* * Lookup the hash table and its collision chain to find an entry diff -Naurp xen/include/asm-ia64/vmx.h xen-redhat/include/asm-ia64/vmx.h --- xen/include/asm-ia64/vmx.h +++ xen-redhat/include/asm-ia64/vmx.h @@ -22,6 +22,8 @@ #ifndef _ASM_IA64_VT_H #define _ASM_IA64_VT_H +#include <asm/ia64_int.h> + #include <public/hvm/ioreq.h> #define vmx_user_mode(regs) (((struct ia64_psr *)&(regs)->cr_ipsr)->vm == 1) @@ -36,7 +38,7 @@ extern void vmx_load_state(struct vcpu * extern void vmx_setup_platform(struct domain *d); extern void vmx_do_launch(struct vcpu *v); extern void vmx_io_assist(struct vcpu *v); -extern int ia64_hypercall (struct pt_regs *regs); +extern IA64FAULT ia64_hypercall(struct pt_regs *regs); extern void vmx_save_state(struct vcpu *v); extern void vmx_load_state(struct vcpu *v); extern void show_registers(struct pt_regs *regs); @@ -50,12 +52,15 @@ extern void set_ifa_itir_iha (struct vcp extern void inject_guest_interruption(struct vcpu *vcpu, u64 vec); extern void set_illegal_op_isr (struct vcpu *vcpu); extern void illegal_op (struct vcpu *vcpu); +extern void set_rsv_reg_field_isr (struct vcpu *vcpu); +extern void rsv_reg_field (struct vcpu *vcpu); extern void vmx_relinquish_guest_resources(struct domain *d); extern void vmx_relinquish_vcpu_resources(struct vcpu *v); extern void vmx_die_if_kernel(char *str, struct pt_regs *regs, long err); extern void vmx_send_assist_req(struct vcpu *v); extern void deliver_pal_init(struct vcpu *vcpu); extern void vmx_pend_pal_init(struct domain *d); +extern void vmx_lazy_load_fpu(struct vcpu *vcpu); static inline vcpu_iodata_t *get_vio(struct domain *d, unsigned long cpu) { diff -Naurp xen/include/asm-ia64/vmx_pal_vsa.h xen-redhat/include/asm-ia64/vmx_pal_vsa.h --- xen/include/asm-ia64/vmx_pal_vsa.h +++ xen-redhat/include/asm-ia64/vmx_pal_vsa.h @@ -28,6 +28,14 @@ #ifndef __ASSEMBLY__ extern u64 ia64_call_vsa(u64 proc, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7); + +/* entry points in assembly code for calling vps services */ + +extern char vmx_vps_sync_read; +extern char vmx_vps_sync_write; +extern char vmx_vps_resume_normal; +extern char vmx_vps_resume_handler; + extern u64 __vsa_base; #endif /* __ASSEMBLY__ */ @@ -38,6 +46,8 @@ extern u64 __vsa_base; #define PAL_VPS_SET_PENDING_INTERRUPT 0x1000 #define PAL_VPS_THASH 0x1400 #define PAL_VPS_TTAG 0x1800 +#define PAL_VPS_RESTORE 0x1c00 +#define PAL_VPS_SAVE 0x2000 #endif /* _PAL_VSA_H_ */ diff -Naurp xen/include/asm-ia64/vmx_phy_mode.h xen-redhat/include/asm-ia64/vmx_phy_mode.h --- xen/include/asm-ia64/vmx_phy_mode.h +++ xen-redhat/include/asm-ia64/vmx_phy_mode.h @@ -120,4 +120,6 @@ extern void physical_tlb_miss(VCPU *vcpu #define GUEST_VIRT 1 /* Guest in virtual mode */ #define GUEST_PHYS 2 /* Guest in physical mode, requiring emulation */ +#define PAL_INIT_ENTRY 0x80000000ffffffa0 + #endif /* _PHY_MODE_H_ */ diff -Naurp xen/include/asm-ia64/vmx_platform.h xen-redhat/include/asm-ia64/vmx_platform.h --- xen/include/asm-ia64/vmx_platform.h +++ xen-redhat/include/asm-ia64/vmx_platform.h @@ -22,6 +22,7 @@ #include <public/xen.h> #include <public/hvm/params.h> #include <asm/viosapic.h> +#include <asm/hvm/vacpi.h> struct mmio_list; typedef struct virtual_platform_def { unsigned long buffered_io_va; @@ -33,6 +34,7 @@ typedef struct virtual_platform_def { struct mmio_list *mmio; /* One IOSAPIC now... 
*/ struct viosapic viosapic; + struct vacpi vacpi; } vir_plat_t; static inline int __fls(uint32_t word) diff -Naurp xen/include/asm-ia64/vmx_vcpu.h xen-redhat/include/asm-ia64/vmx_vcpu.h --- xen/include/asm-ia64/vmx_vcpu.h +++ xen-redhat/include/asm-ia64/vmx_vcpu.h @@ -331,34 +331,22 @@ static inline IA64FAULT vmx_vcpu_get_cpu static inline IA64FAULT vmx_vcpu_set_dbr(VCPU * vcpu, u64 reg, u64 val) { - // TODO: unimplemented DBRs return a reserved register fault - // TODO: Should set Logical CPU state, not just physical - ia64_set_dbr(reg, val); - return IA64_NO_FAULT; + return vcpu_set_dbr(vcpu, reg, val); } static inline IA64FAULT vmx_vcpu_set_ibr(VCPU * vcpu, u64 reg, u64 val) { - // TODO: unimplemented IBRs return a reserved register fault - // TODO: Should set Logical CPU state, not just physical - ia64_set_ibr(reg, val); - return IA64_NO_FAULT; + return vcpu_set_ibr(vcpu, reg, val); } static inline IA64FAULT vmx_vcpu_get_dbr(VCPU * vcpu, u64 reg, u64 * pval) { - // TODO: unimplemented DBRs return a reserved register fault - u64 val = ia64_get_dbr(reg); - *pval = val; - return IA64_NO_FAULT; + return vcpu_get_dbr(vcpu, reg, pval); } static inline IA64FAULT vmx_vcpu_get_ibr(VCPU * vcpu, u64 reg, u64 * pval) { - // TODO: unimplemented IBRs return a reserved register fault - u64 val = ia64_get_ibr(reg); - *pval = val; - return IA64_NO_FAULT; + return vcpu_get_ibr(vcpu, reg, pval); } /************************************************************************** diff -Naurp xen/include/asm-ia64/xenprocessor.h xen-redhat/include/asm-ia64/xenprocessor.h --- xen/include/asm-ia64/xenprocessor.h +++ xen-redhat/include/asm-ia64/xenprocessor.h @@ -237,6 +237,10 @@ typedef union { u64 itir; } ia64_itir_t; -#define dump_execution_state() printk("FIXME: implement ia64 dump_execution_state()\n"); +#define dump_execution_state() \ + do { \ + printk("FIXME: implement ia64 dump_execution_state()\n"); \ + dump_stack(); \ + } while (0) #endif // _ASM_IA64_XENPROCESSOR_H diff -Naurp xen/include/asm-x86/acpi.h xen-redhat/include/asm-x86/acpi.h --- xen/include/asm-x86/acpi.h +++ xen-redhat/include/asm-x86/acpi.h @@ -178,4 +178,6 @@ extern void acpi_reserve_bootmem(void); extern u8 x86_acpiid_to_apicid[]; #define MAX_LOCAL_APIC 256 +extern int acpi_dmar_init(void); + #endif /*_ASM_ACPI_H*/ diff -Naurp xen/include/asm-x86/amd-iommu.h xen-redhat/include/asm-x86/amd-iommu.h --- xen/include/asm-x86/amd-iommu.h +++ xen-redhat/include/asm-x86/amd-iommu.h @@ -0,0 +1,104 @@ +/* + * Copyright (C) 2007 Advanced Micro Devices, Inc. + * Author: Leo Duran <leo.duran@amd.com> + * Author: Wei Wang <wei.wang2@amd.com> - adapted to xen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _ASM_X86_64_AMD_IOMMU_H +#define _ASM_X86_64_AMD_IOMMU_H + +#include <xen/init.h> +#include <xen/types.h> +#include <xen/list.h> +#include <xen/spinlock.h> +#include <asm/hvm/svm/amd-iommu-defs.h> + +#define iommu_found() (!list_empty(&amd_iommu_head)) + +extern struct list_head amd_iommu_head; + +extern int __init amd_iov_detect(void); + +struct table_struct { + void *buffer; + unsigned long entries; + unsigned long alloc_size; +}; + +struct amd_iommu { + struct list_head list; + spinlock_t lock; /* protect iommu */ + + u16 bdf; + u8 cap_offset; + u8 revision; + u8 unit_id; + u8 msi_number; + + u8 pte_not_present_cached; + u8 ht_tunnel_support; + u8 iotlb_support; + + u8 isochronous; + u8 coherent; + u8 res_pass_pw; + u8 pass_pw; + u8 ht_tunnel_enable; + + void *mmio_base; + unsigned long mmio_base_phys; + + struct table_struct dev_table; + struct table_struct cmd_buffer; + u32 cmd_buffer_tail; + struct table_struct event_log; + u32 event_log_head; + + int exclusion_enable; + int exclusion_allow_all; + uint64_t exclusion_base; + uint64_t exclusion_limit; + + int msi_cap; + int maskbit; + + int enabled; + int vector; +}; + +struct ivrs_mappings { + u16 dte_requestor_id; + u8 dte_sys_mgt_enable; + u8 dte_allow_exclusion; + u8 unity_map_enable; + u8 write_permission; + u8 read_permission; + unsigned long addr_range_start; + unsigned long addr_range_length; + struct amd_iommu *iommu; + + /* per device interrupt remapping table */ + void *intremap_table; + spinlock_t intremap_lock; + + /* interrupt remapping settings */ + u8 dte_lint1_pass; + u8 dte_lint0_pass; + u8 dte_nmi_pass; + u8 dte_ext_int_pass; + u8 dte_init_pass; +}; +#endif /* _ASM_X86_64_AMD_IOMMU_H */ diff -Naurp xen/include/asm-x86/apic.h xen-redhat/include/asm-x86/apic.h --- xen/include/asm-x86/apic.h +++ xen-redhat/include/asm-x86/apic.h @@ -2,9 +2,7 @@ #define __ASM_APIC_H #include <xen/config.h> -#include <asm/fixmap.h> #include <asm/apicdef.h> -#include <asm/processor.h> #include <asm/system.h> #define Dprintk(x...) 
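Editor's note (not part of the patch): the new asm-x86/amd-iommu.h introduced above hangs every discovered IOMMU unit off the global amd_iommu_head list, guards each unit with its own spinlock, and tests for presence with iommu_found(). A minimal sketch of walking that list follows; the function name example_log_iommus() and the fields it prints are illustrative assumptions (the companion amd-iommu-proto.h added later in this patch wraps the same iteration in a for_each_amd_iommu() macro).

    /* Illustrative sketch -- not part of the patch. */
    static void example_log_iommus(void)
    {
        struct amd_iommu *iommu;

        if ( !iommu_found() )            /* !list_empty(&amd_iommu_head) */
            return;

        list_for_each_entry ( iommu, &amd_iommu_head, list )
        {
            spin_lock(&iommu->lock);     /* per-unit lock declared above */
            printk("AMD-Vi: bdf %04x, cap offset 0x%02x, %sabled\n",
                   iommu->bdf, iommu->cap_offset,
                   iommu->enabled ? "en" : "dis");
            spin_unlock(&iommu->lock);
        }
    }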
@@ -51,11 +49,7 @@ static __inline u32 apic_read(unsigned l return *((volatile u32 *)(APIC_BASE+reg)); } -static __inline__ void apic_wait_icr_idle(void) -{ - while ( apic_read( APIC_ICR ) & APIC_ICR_BUSY ) - cpu_relax(); -} +void apic_wait_icr_idle(void); int get_physical_broadcast(void); diff -Naurp xen/include/asm-x86/config.h xen-redhat/include/asm-x86/config.h --- xen/include/asm-x86/config.h +++ xen-redhat/include/asm-x86/config.h @@ -382,4 +382,6 @@ extern unsigned long xen_phys_start, xen #define ELFSIZE 32 #endif +#define ARCH_CRASH_SAVE_VMCOREINFO + #endif /* __X86_CONFIG_H__ */ diff -Naurp xen/include/asm-x86/cpufeature.h xen-redhat/include/asm-x86/cpufeature.h --- xen/include/asm-x86/cpufeature.h +++ xen-redhat/include/asm-x86/cpufeature.h @@ -31,7 +31,7 @@ #define X86_FEATURE_PSE36 (0*32+17) /* 36-bit PSEs */ #define X86_FEATURE_PN (0*32+18) /* Processor serial number */ #define X86_FEATURE_CLFLSH (0*32+19) /* Supports the CLFLUSH instruction */ -#define X86_FEATURE_DTES (0*32+21) /* Debug Trace Store */ +#define X86_FEATURE_DS (0*32+21) /* Debug Store */ #define X86_FEATURE_ACPI (0*32+22) /* ACPI via MSR */ #define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */ #define X86_FEATURE_FXSR (0*32+24) /* FXSAVE and FXRSTOR instructions (fast save and restore */ @@ -42,6 +42,7 @@ #define X86_FEATURE_HT (0*32+28) /* Hyper-Threading */ #define X86_FEATURE_ACC (0*32+29) /* Automatic clock control */ #define X86_FEATURE_IA64 (0*32+30) /* IA-64 processor */ +#define X86_FEATURE_PBE (0*32+31) /* Pending Break Enable */ /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ /* Don't duplicate feature flags which are redundant with Intel! */ @@ -49,6 +50,8 @@ #define X86_FEATURE_MP (1*32+19) /* MP Capable. */ #define X86_FEATURE_NX (1*32+20) /* Execute Disable */ #define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ +#define X86_FEATURE_FFXSR (1*32+25) /* FFXSR instruction optimizations */ +#define X86_FEATURE_PAGE1GB (1*32+26) /* 1Gb large page support */ #define X86_FEATURE_RDTSCP (1*32+27) /* RDTSCP */ #define X86_FEATURE_LM (1*32+29) /* Long Mode (x86-64) */ #define X86_FEATURE_3DNOWEXT (1*32+30) /* AMD 3DNow! 
extensions */ @@ -71,29 +74,65 @@ #define X86_FEATURE_P3 (3*32+ 6) /* P3 */ #define X86_FEATURE_P4 (3*32+ 7) /* P4 */ #define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */ +#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* Streaming SIMD Extensions-3 */ +#define X86_FEATURE_DTES64 (4*32+ 2) /* 64-bit Debug Store */ #define X86_FEATURE_MWAIT (4*32+ 3) /* Monitor/Mwait support */ #define X86_FEATURE_DSCPL (4*32+ 4) /* CPL Qualified Debug Store */ #define X86_FEATURE_VMXE (4*32+ 5) /* Virtual Machine Extensions */ +#define X86_FEATURE_SMXE (4*32+ 6) /* Safer Mode Extensions */ #define X86_FEATURE_EST (4*32+ 7) /* Enhanced SpeedStep */ #define X86_FEATURE_TM2 (4*32+ 8) /* Thermal Monitor 2 */ #define X86_FEATURE_CID (4*32+10) /* Context ID */ +#define X86_FEATURE_FMA (4*32+12) /* Fused multiply-add */ #define X86_FEATURE_CX16 (4*32+13) /* CMPXCHG16B */ #define X86_FEATURE_XTPR (4*32+14) /* Send Task Priority Messages */ +#define X86_FEATURE_PDCM (4*32+15) /* Perf/Debug Capability MSR */ +#define X86_FEATURE_DCA (4*32+18) /* Direct Cache Access */ +#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */ +#define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */ +#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ +#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ +#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running under some hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ #define X86_FEATURE_XSTORE (5*32+ 2) /* on-CPU RNG present (xstore insn) */ #define X86_FEATURE_XSTORE_EN (5*32+ 3) /* on-CPU RNG enabled */ #define X86_FEATURE_XCRYPT (5*32+ 6) /* on-CPU crypto (xcrypt insn) */ #define X86_FEATURE_XCRYPT_EN (5*32+ 7) /* on-CPU crypto enabled */ +#define X86_FEATURE_ACE2 (5*32+ 8) /* Advanced Cryptography Engine v2 */ +#define X86_FEATURE_ACE2_EN (5*32+ 9) /* ACE v2 enabled */ +#define X86_FEATURE_PHE (5*32+ 10) /* PadLock Hash Engine */ +#define X86_FEATURE_PHE_EN (5*32+ 11) /* PHE enabled */ +#define X86_FEATURE_PMM (5*32+ 12) /* PadLock Montgomery Multiplier */ +#define X86_FEATURE_PMM_EN (5*32+ 13) /* PMM enabled */ + /* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ #define X86_FEATURE_LAHF_LM (6*32+ 0) /* LAHF/SAHF in long mode */ #define X86_FEATURE_CMP_LEGACY (6*32+ 1) /* If yes HyperThreading not valid */ -#define X86_FEATURE_SVME (6*32+ 2) /* Secure Virtual Machine */ -#define X86_FEATURE_FFXSR (6*32+25) /* FFXSR instruction optimizations */ +#define X86_FEATURE_SVM (6*32+ 2) /* Secure virtual machine */ +#define X86_FEATURE_EXTAPIC (6*32+ 3) /* Extended APIC space */ +#define X86_FEATURE_CR8_LEGACY (6*32+ 4) /* CR8 in 32-bit mode */ +#define X86_FEATURE_ABM (6*32+ 5) /* Advanced bit manipulation */ +#define X86_FEATURE_SSE4A (6*32+ 6) /* SSE-4A */ +#define X86_FEATURE_MISALIGNSSE (6*32+ 7) /* Misaligned SSE mode */ +#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */ +#define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */ +#define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */ +#define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */ +#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */ +#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ +#define X86_FEATURE_LWP (6*32+15) /* Light Weight 
Profiling */ +#define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */ +#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ +#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */ +#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */ +#define X86_FEATURE_PERFCTR_CORE (6*32+ 23) /* core perf counter extensions */ +#define X86_FEATURE_PERFCTR_NB (6*32+ 24) /* NB perf counter extensions */ #define cpu_has(c, bit) test_bit(bit, (c)->x86_capability) #define boot_cpu_has(bit) test_bit(bit, boot_cpu_data.x86_capability) @@ -105,6 +144,7 @@ #define cpu_has_tsc boot_cpu_has(X86_FEATURE_TSC) #define cpu_has_pae boot_cpu_has(X86_FEATURE_PAE) #define cpu_has_pge boot_cpu_has(X86_FEATURE_PGE) +#define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT) #define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC) #define cpu_has_sep boot_cpu_has(X86_FEATURE_SEP) #define cpu_has_mtrr boot_cpu_has(X86_FEATURE_MTRR) @@ -121,6 +161,8 @@ #define cpu_has_cyrix_arr boot_cpu_has(X86_FEATURE_CYRIX_ARR) #define cpu_has_centaur_mcr boot_cpu_has(X86_FEATURE_CENTAUR_MCR) #define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLSH) +#define cpu_has_page1gb 0 +#define cpu_has_efer (boot_cpu_data.x86_capability[1] & 0x20100800) #else /* __x86_64__ */ #define cpu_has_vme 0 #define cpu_has_de 1 @@ -128,6 +170,7 @@ #define cpu_has_tsc 1 #define cpu_has_pae 1 #define cpu_has_pge 1 +#define cpu_has_pat 1 #define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC) #define cpu_has_sep 0 #define cpu_has_mtrr 1 @@ -144,8 +187,12 @@ #define cpu_has_cyrix_arr 0 #define cpu_has_centaur_mcr 0 #define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLSH) +#define cpu_has_page1gb boot_cpu_has(X86_FEATURE_PAGE1GB) +#define cpu_has_efer 1 #endif +#define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON) + #endif /* __ASM_I386_CPUFEATURE_H */ /* diff -Naurp xen/include/asm-x86/debugger.h xen-redhat/include/asm-x86/debugger.h --- xen/include/asm-x86/debugger.h +++ xen-redhat/include/asm-x86/debugger.h @@ -68,6 +68,8 @@ static inline int debugger_trap_entry( if ( guest_kernel_mode(v, regs) && v->domain->debugger_attached && ((vector == TRAP_int3) || (vector == TRAP_debug)) ) { + if ( vector != TRAP_debug ) /* domain pause is good enough */ + current->arch.gdbsx_vcpu_event = vector; domain_pause_for_debugger(); return 1; } diff -Naurp xen/include/asm-x86/desc.h xen-redhat/include/asm-x86/desc.h --- xen/include/asm-x86/desc.h +++ xen-redhat/include/asm-x86/desc.h @@ -34,11 +34,9 @@ #define FLAT_COMPAT_USER_CS FLAT_COMPAT_RING3_CS #define FLAT_COMPAT_USER_SS FLAT_COMPAT_RING3_SS -#define __FIRST_TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8) -#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY + 2) - -#define __TSS(n) (((n)<<2) + __FIRST_TSS_ENTRY) -#define __LDT(n) (((n)<<2) + __FIRST_LDT_ENTRY) +#define TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8) +#define LDT_ENTRY (TSS_ENTRY + 2) +#define PER_CPU_GDT_ENTRY (LDT_ENTRY + 2) #elif defined(__i386__) @@ -49,19 +47,17 @@ #define FLAT_COMPAT_USER_DS FLAT_USER_DS #define FLAT_COMPAT_USER_SS FLAT_USER_SS -#define __DOUBLEFAULT_TSS_ENTRY FIRST_RESERVED_GDT_ENTRY - -#define __FIRST_TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8) -#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY + 1) +#define DOUBLEFAULT_TSS_ENTRY FIRST_RESERVED_GDT_ENTRY -#define __TSS(n) (((n)<<1) + __FIRST_TSS_ENTRY) -#define __LDT(n) (((n)<<1) + __FIRST_LDT_ENTRY) +#define TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8) +#define LDT_ENTRY (TSS_ENTRY + 1) +#define PER_CPU_GDT_ENTRY (LDT_ENTRY + 1) #endif #ifndef __ASSEMBLY__ -#define 
load_TR(n) __asm__ __volatile__ ("ltr %%ax" : : "a" (__TSS(n)<<3) ) +#define load_TR(n) __asm__ __volatile__ ("ltr %%ax" : : "a" (TSS_ENTRY<<3) ) #if defined(__x86_64__) #define GUEST_KERNEL_RPL(d) (is_pv_32bit_domain(d) ? 1 : 3) @@ -194,20 +190,25 @@ __asm__ __volatile__ ("movw %w3,0(%2)\n\ "rorl $16,%%eax" \ : "=m"(*(n)) : "a" (addr), "r"(n), "ir"(limit), "i"(type|0x80)) +DECLARE_PER_CPU(struct tss_struct *, doublefault_tss); + #endif -extern struct desc_struct gdt_table[]; +struct desc_ptr { + unsigned short limit; + unsigned long base; +} __attribute__((__packed__)) ; + +extern struct desc_struct boot_cpu_gdt_table[]; +DECLARE_PER_CPU(struct desc_struct *, gdt_table); #ifdef CONFIG_COMPAT -extern struct desc_struct compat_gdt_table[]; +extern struct desc_struct boot_cpu_compat_gdt_table[]; +DECLARE_PER_CPU(struct desc_struct *, compat_gdt_table); #else -# define compat_gdt_table gdt_table +# define boot_cpu_compat_gdt_table boot_cpu_gdt_table +# define per_cpu__compat_gdt_table per_cpu__gdt_table #endif -struct Xgt_desc_struct { - unsigned short size; - unsigned long address __attribute__((packed)); -}; - extern void set_intr_gate(unsigned int irq, void * addr); extern void set_system_gate(unsigned int n, void *addr); extern void set_task_gate(unsigned int n, unsigned int sel); diff -Naurp xen/include/asm-x86/domain.h xen-redhat/include/asm-x86/domain.h --- xen/include/asm-x86/domain.h +++ xen-redhat/include/asm-x86/domain.h @@ -16,7 +16,6 @@ #define is_pv_32on64_domain(d) (0) #endif #define is_pv_32on64_vcpu(v) (is_pv_32on64_domain((v)->domain)) -#define IS_COMPAT(d) (is_pv_32on64_domain(d)) struct trap_bounce { uint32_t error_code; @@ -139,6 +138,13 @@ struct p2m_domain { struct page_info * (*alloc_page )(struct domain *d); void (*free_page )(struct domain *d, struct page_info *pg); + int (*set_entry )(struct domain *d, unsigned long gfn, + mfn_t mfn, int order, u32 l1e_flags); + mfn_t (*get_entry )(struct domain *d, unsigned long gfn); + mfn_t (*get_entry_fast)(unsigned long gfn); + + void (*change_entry_type_global)(struct domain *d, + u32 l1e_flags); /* Highest guest frame that's ever been mapped in the p2m */ unsigned long max_mapped_pfn; @@ -212,15 +218,22 @@ struct arch_domain /* I/O-port admin-specified access capabilities. */ struct rangeset *ioport_caps; + uint32_t pci_cf8; struct hvm_domain hvm_domain; + /* pass-throughed device list */ + struct list_head pdev_list; + struct paging_domain paging; struct p2m_domain p2m ; /* Shadow translated domain: P2M mapping */ pagetable_t phys_table; + int vector_pirq[NR_VECTORS]; + int pirq_vector[NR_IRQS]; + /* Pseudophysical e820 map (XENMEM_memory_map). */ struct e820entry e820[3]; unsigned int nr_e820; @@ -232,6 +245,17 @@ struct arch_domain bool_t is_32bit_pv; /* Is shared-info page in 32-bit format? */ bool_t has_32bit_shinfo; + + /* Continuable domain_relinquish_resources(). */ + enum { + RELMEM_not_started, + RELMEM_xen, + RELMEM_l4, + RELMEM_l3, + RELMEM_l2, + RELMEM_done, + } relmem; + struct list_head relmem_list; } __cacheline_aligned; #ifdef CONFIG_X86_PAE @@ -268,6 +292,9 @@ struct arch_vcpu void (*ctxt_switch_from) (struct vcpu *); void (*ctxt_switch_to) (struct vcpu *); + /* Record information required to continue execution after migration */ + void *continue_info; + /* Bounce information for propagating an exception to guest OS. */ struct trap_bounce trap_bounce; @@ -309,12 +336,27 @@ struct arch_vcpu /* Guest-specified relocation of vcpu_info. 
*/ unsigned long vcpu_info_mfn; + + uint32_t gdbsx_vcpu_event; } __cacheline_aligned; -/* shorthands to improve code legibility */ +/* Shorthands to improve code legibility. */ #define hvm_vmx hvm_vcpu.u.vmx #define hvm_svm hvm_vcpu.u.svm +/* Continue the current hypercall via func(data) on specified cpu. */ +int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data); + +/* Clean up CR4 bits that are not under guest control. */ + unsigned long pv_guest_cr4_fixup(unsigned long guest_cr4); + +/* Convert between guest-visible and real CR4 values. */ +#define pv_guest_cr4_to_real_cr4(c) \ + ((c) | (mmu_cr4_features & (X86_CR4_PGE | X86_CR4_PSE))) +#define real_cr4_to_pv_guest_cr4(c) \ + ((c) & ~(X86_CR4_PGE | X86_CR4_PSE)) + + #endif /* __ASM_DOMAIN_H__ */ /* diff -Naurp xen/include/asm-x86/e820.h xen-redhat/include/asm-x86/e820.h --- xen/include/asm-x86/e820.h +++ xen-redhat/include/asm-x86/e820.h @@ -22,6 +22,7 @@ struct e820map { struct e820entry map[E820MAX]; }; +extern int reserve_e820_ram(struct e820map *e820, uint64_t s, uint64_t e); extern unsigned long init_e820(const char *, struct e820entry *, int *); extern struct e820map e820; diff -Naurp xen/include/asm-x86/event.h xen-redhat/include/asm-x86/event.h --- xen/include/asm-x86/event.h +++ xen-redhat/include/asm-x86/event.h @@ -10,7 +10,6 @@ #define __ASM_EVENT_H__ #include <xen/shared.h> -#include <asm/hvm/irq.h> /* cpu_has_pending_irq() */ static inline void vcpu_kick(struct vcpu *v) { @@ -31,7 +30,12 @@ static inline void vcpu_kick(struct vcpu static inline void vcpu_mark_events_pending(struct vcpu *v) { - if ( !test_and_set_bit(0, &vcpu_info(v, evtchn_upcall_pending)) ) + if ( test_and_set_bit(0, &vcpu_info(v, evtchn_upcall_pending)) ) + return; + + if ( is_hvm_vcpu(v) ) + hvm_assert_evtchn_irq(v); + else vcpu_kick(v); } diff -Naurp xen/include/asm-x86/fixmap.h xen-redhat/include/asm-x86/fixmap.h --- xen/include/asm-x86/fixmap.h +++ xen-redhat/include/asm-x86/fixmap.h @@ -17,6 +17,9 @@ #include <asm/acpi.h> #include <asm/page.h> #include <xen/kexec.h> +#include <xen/iommu.h> +#include <asm/msi.h> +#include <asm/amd-iommu.h> /* * Here we define all the compile-time 'special' virtual @@ -40,6 +43,12 @@ enum fixed_addresses { FIX_KEXEC_BASE_0, FIX_KEXEC_BASE_END = FIX_KEXEC_BASE_0 \ + ((KEXEC_XEN_NO_PAGES >> 1) * KEXEC_IMAGE_NR) - 1, + FIX_IOMMU_REGS_BASE_0, + FIX_IOMMU_REGS_END = FIX_IOMMU_REGS_BASE_0 + MAX_IOMMUS-1, + FIX_IOMMU_MMIO_BASE_0, + FIX_IOMMU_MMIO_END = FIX_IOMMU_MMIO_BASE_0 + IOMMU_PAGES-1, + FIX_MSIX_IO_RESERV_BASE, + FIX_MSIX_IO_RESERV_END = FIX_MSIX_IO_RESERV_BASE + FIX_MSIX_MAX_PAGES -1, __end_of_fixed_addresses }; diff -Naurp xen/include/asm-x86/flushtlb.h xen-redhat/include/asm-x86/flushtlb.h --- xen/include/asm-x86/flushtlb.h +++ xen-redhat/include/asm-x86/flushtlb.h @@ -74,12 +74,17 @@ extern void write_cr3(unsigned long cr3) /* Flush guest mappings from the TLB and implicitly tick the tlbflush clock. 
*/ extern void local_flush_tlb(void); +#ifdef USER_MAPPINGS_ARE_GLOBAL +#define local_flush_tlb_pge() local_flush_tlb() +#else #define local_flush_tlb_pge() \ do { \ - __pge_off(); \ + unsigned long cr4 = read_cr4(); \ + write_cr4(cr4 & ~X86_CR4_PGE); \ local_flush_tlb(); \ - __pge_on(); \ + write_cr4(cr4); \ } while ( 0 ) +#endif #define local_flush_tlb_one(__addr) \ __asm__ __volatile__("invlpg %0": :"m" (*(char *) (__addr))) diff -Naurp xen/include/asm-x86/guest_access.h xen-redhat/include/asm-x86/guest_access.h --- xen/include/asm-x86/guest_access.h +++ xen-redhat/include/asm-x86/guest_access.h @@ -17,7 +17,8 @@ /* Offset the given guest handle into the array it refers to. */ #define guest_handle_add_offset(hnd, nr) ((hnd).p += (nr)) - +#define guest_handle_subtract_offset(hnd, nr) ((hnd).p -= (nr)) + /* Cast a guest handle to the specified type of handle. */ #define guest_handle_cast(hnd, type) ({ \ type *_x = (hnd).p; \ diff -Naurp xen/include/asm-x86/hvm/domain.h xen-redhat/include/asm-x86/hvm/domain.h --- xen/include/asm-x86/hvm/domain.h +++ xen-redhat/include/asm-x86/hvm/domain.h @@ -27,6 +27,7 @@ #include <asm/hvm/io.h> #include <public/hvm/params.h> #include <public/hvm/save.h> +#include <xen/hvm/iommu.h> struct hvm_ioreq_page { spinlock_t lock; @@ -45,6 +46,7 @@ struct hvm_domain { spinlock_t vapic_access_lock; int physmap_changed_for_vlapic_access : 1; struct page_info *apic_access_page; + unsigned long vmx_apic_access_mfn; struct hvm_io_handler io_handler; @@ -54,12 +56,28 @@ struct hvm_domain { struct hvm_hw_vpic vpic[2]; /* 0=master; 1=slave */ struct hvm_hw_vioapic vioapic; + /* VCPU which is current target for 8259 interrupts. */ + struct vcpu *i8259_target; + /* hvm_print_line() logging. */ char pbuf[80]; int pbuf_idx; spinlock_t pbuf_lock; uint64_t params[HVM_NR_PARAMS]; + + /* Pass-through */ + struct hvm_iommu hvm_iommu; + + /* hypervisor intercepted msix table */ + struct list_head msixtbl_list; + spinlock_t msixtbl_list_lock; + +#if CONFIG_PAGING_LEVELS == 3 + bool_t amd_npt_4gb_warning; +#endif + + unsigned long vmx_vpid_base; }; #endif /* __ASM_X86_HVM_DOMAIN_H__ */ diff -Naurp xen/include/asm-x86/hvm/hvm.h xen-redhat/include/asm-x86/hvm/hvm.h --- xen/include/asm-x86/hvm/hvm.h +++ xen-redhat/include/asm-x86/hvm/hvm.h @@ -55,6 +55,14 @@ typedef struct segment_register { u64 base; } __attribute__ ((packed)) segment_register_t; +/* Interrupt acknowledgement sources. */ +enum hvm_intack { + hvm_intack_none, + hvm_intack_pic, + hvm_intack_lapic, + hvm_intack_nmi +}; + /* * The hardware virtual machine (HVM) interface abstracts away from the * x86/x86_64 CPU virtualization assist specifics. Currently this interface @@ -63,14 +71,22 @@ typedef struct segment_register { struct hvm_function_table { char *name; + /* Support Hardware-Assisted Paging? */ + int hap_supported; + + /* Support 1GB host page table? 
*/ + int hap_1gb_pgtb; + /* * Disable HVM functionality */ void (*disable)(void); /* - * Initialise/destroy HVM VCPU resources + * Initialise/destroy HVM domain/vcpu resources */ + int (*domain_initialise)(struct domain *d); + void (*domain_destroy)(struct domain *d); int (*vcpu_initialise)(struct vcpu *v); void (*vcpu_destroy)(struct vcpu *v); @@ -104,12 +120,14 @@ struct hvm_function_table { int (*long_mode_enabled)(struct vcpu *v); int (*pae_enabled)(struct vcpu *v); int (*nx_enabled)(struct vcpu *v); - int (*interrupts_enabled)(struct vcpu *v); + int (*interrupts_enabled)(struct vcpu *v, enum hvm_intack); int (*guest_x86_mode)(struct vcpu *v); unsigned long (*get_guest_ctrl_reg)(struct vcpu *v, unsigned int num); unsigned long (*get_segment_base)(struct vcpu *v, enum x86_segment seg); void (*get_segment_register)(struct vcpu *v, enum x86_segment seg, struct segment_register *reg); + void (*set_segment_register)(struct vcpu *v, enum x86_segment seg, + struct segment_register *reg); /* * Re-set the value of CR3 that Xen runs on when handling VM exits @@ -122,6 +140,12 @@ struct hvm_function_table { void (*update_guest_cr3)(struct vcpu *v); /* + * Called to inform the HVM layer that the guest loaded cr3, and setup + * page tables accordingly. Operates on the current VCPU. + */ + int (*set_cr3)(unsigned long value); + + /* * Called to ensure than all guest-specific mappings in a tagged TLB * are flushed; does *not* flush Xen's TLB entries, and on * processors without a tagged TLB it will be a noop. @@ -149,7 +173,9 @@ struct hvm_function_table { void (*init_hypercall_page)(struct domain *d, void *hypercall_page); - int (*event_injection_faulted)(struct vcpu *v); + int (*event_pending)(struct vcpu *v); + + void (*update_guest_cr)(struct vcpu *v, unsigned int cr); }; extern struct hvm_function_table hvm_funcs; @@ -178,7 +204,11 @@ hvm_load_cpu_guest_regs(struct vcpu *v, hvm_funcs.load_cpu_guest_regs(v, r); } -void hvm_set_guest_time(struct vcpu *v, u64 gtime); +void hvm_set_guest_tsc(struct vcpu *v, u64 guest_tsc); +u64 hvm_get_guest_tsc(struct vcpu *v); + +void hvm_init_guest_time(struct domain *d); +void hvm_set_guest_time(struct vcpu *v, u64 guest_time); u64 hvm_get_guest_time(struct vcpu *v); static inline int @@ -197,16 +227,16 @@ hvm_long_mode_enabled(struct vcpu *v) #define hvm_long_mode_enabled(v) (v,0) #endif - static inline int +static inline int hvm_pae_enabled(struct vcpu *v) { return hvm_funcs.pae_enabled(v); } static inline int -hvm_interrupts_enabled(struct vcpu *v) +hvm_interrupts_enabled(struct vcpu *v, enum hvm_intack type) { - return hvm_funcs.interrupts_enabled(v); + return hvm_funcs.interrupts_enabled(v, type); } static inline int @@ -238,6 +268,12 @@ hvm_update_vtpr(struct vcpu *v, unsigned void hvm_update_guest_cr3(struct vcpu *v, unsigned long guest_cr3); +static inline void hvm_update_guest_cr(struct vcpu *v, unsigned int cr) +{ + if ( hvm_funcs.update_guest_cr ) + hvm_funcs.update_guest_cr(v, cr); +} + static inline void hvm_flush_guest_tlbs(void) { @@ -267,6 +303,19 @@ hvm_get_segment_register(struct vcpu *v, hvm_funcs.get_segment_register(v, seg, reg); } +static inline void +hvm_set_segment_register(struct vcpu *v, enum x86_segment seg, + struct segment_register *reg) +{ + hvm_funcs.set_segment_register(v, seg, reg); +} + +static inline int +hvm_set_cr3(unsigned long value) +{ + return hvm_funcs.set_cr3(value); +} + void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx); void hvm_stts(struct vcpu *v); @@ 
-288,9 +337,9 @@ hvm_inject_exception(unsigned int trapnr int hvm_bringup_ap(int vcpuid, int trampoline_vector); -static inline int hvm_event_injection_faulted(struct vcpu *v) +static inline int hvm_event_pending(struct vcpu *v) { - return hvm_funcs.event_injection_faulted(v); + return hvm_funcs.event_pending(v); } /* These reserved bits in lower 32 remain 0 after any load of CR0 */ @@ -315,4 +364,23 @@ static inline int hvm_event_injection_fa /* These exceptions must always be intercepted. */ #define HVM_TRAP_MASK (1U << TRAP_machine_check) +#define HVM_IDENT_PT_PAGE 0xE8000 + +enum hvm_task_switch_reason { TSW_jmp, TSW_iret, TSW_call_or_int }; +void hvm_task_switch( + uint16_t tss_sel, enum hvm_task_switch_reason taskswitch_reason, + int32_t errcode); + +enum hvm_access_type { + hvm_access_insn_fetch, hvm_access_read, hvm_access_write +}; +int hvm_virtual_to_linear_addr( + enum x86_segment seg, + struct segment_register *reg, + unsigned long offset, + unsigned int bytes, + enum hvm_access_type access_type, + unsigned int addr_size, + unsigned long *linear_addr); + #endif /* __ASM_X86_HVM_HVM_H__ */ diff -Naurp xen/include/asm-x86/hvm/io.h xen-redhat/include/asm-x86/hvm/io.h --- xen/include/asm-x86/hvm/io.h +++ xen-redhat/include/asm-x86/hvm/io.h @@ -86,14 +86,14 @@ struct hvm_io_op { #define HVM_MMIO 1 typedef int (*intercept_action_t)(ioreq_t *); -typedef unsigned long (*hvm_mmio_read_t)(struct vcpu *v, - unsigned long addr, - unsigned long length); - -typedef void (*hvm_mmio_write_t)(struct vcpu *v, +typedef int (*hvm_mmio_read_t)(struct vcpu *v, unsigned long addr, unsigned long length, - unsigned long val); + unsigned long *val); +typedef int (*hvm_mmio_write_t)(struct vcpu *v, + unsigned long addr, + unsigned long length, + unsigned long val); typedef int (*hvm_mmio_check_t)(struct vcpu *v, unsigned long addr); @@ -137,13 +137,6 @@ static inline int register_portio_handle return register_io_handler(d, addr, size, action, HVM_PORTIO); } -#if defined(__i386__) || defined(__x86_64__) -static inline int irq_masked(unsigned long eflags) -{ - return ((eflags & X86_EFLAGS_IF) == 0); -} -#endif - extern void send_pio_req(unsigned long port, unsigned long count, int size, paddr_t value, int dir, int df, int value_is_ptr); void send_timeoffset_req(unsigned long timeoff); @@ -151,6 +144,9 @@ void send_invalidate_req(void); extern void handle_mmio(unsigned long gpa); extern void hvm_interrupt_post(struct vcpu *v, int vector, int type); extern void hvm_io_assist(void); +void hvm_dpci_eoi(struct domain *d, unsigned int guest_irq, + union vioapic_redir_entry *ent); +extern void hvm_dpci_msi_eoi(struct domain *d, int vector); #endif /* __ASM_X86_HVM_IO_H__ */ diff -Naurp xen/include/asm-x86/hvm/iommu.h xen-redhat/include/asm-x86/hvm/iommu.h --- xen/include/asm-x86/hvm/iommu.h +++ xen-redhat/include/asm-x86/hvm/iommu.h @@ -0,0 +1,40 @@ +#ifndef __ASM_X86_HVM_IOMMU_H__ +#define __ASM_X86_HVM_IOMMU_H__ + +struct iommu_ops; +extern struct iommu_ops intel_iommu_ops; +extern struct iommu_ops amd_iommu_ops; +extern int intel_vtd_setup(void); +extern int amd_iov_detect(void); + +static inline struct iommu_ops *iommu_get_ops(void) +{ + switch ( boot_cpu_data.x86_vendor ) + { + case X86_VENDOR_INTEL: + return &intel_iommu_ops; + case X86_VENDOR_AMD: + return &amd_iommu_ops; + default: + BUG(); + } + + return NULL; +} + +static inline int iommu_hardware_setup(void) +{ + switch ( boot_cpu_data.x86_vendor ) + { + case X86_VENDOR_INTEL: + return intel_vtd_setup(); + case X86_VENDOR_AMD: + return 
amd_iov_detect(); + default: + BUG(); + } + + return 0; +} + +#endif /* __ASM_X86_HVM_IOMMU_H__ */ diff -Naurp xen/include/asm-x86/hvm/irq.h xen-redhat/include/asm-x86/hvm/irq.h --- xen/include/asm-x86/hvm/irq.h +++ xen-redhat/include/asm-x86/hvm/irq.h @@ -24,10 +24,12 @@ #include <xen/types.h> #include <xen/spinlock.h> +#include <asm/irq.h> +#include <asm/hvm/hvm.h> #include <asm/hvm/vpic.h> #include <asm/hvm/vioapic.h> #include <public/hvm/save.h> - +#include <xen/hvm/irq.h> struct hvm_irq { /* @@ -89,6 +91,8 @@ struct hvm_irq { /* Last VCPU that was delivered a LowestPrio interrupt. */ u8 round_robin_prev_vcpu; + + struct hvm_irq_dpci *dpci; }; #define hvm_pci_intx_gsi(dev, intx) \ @@ -112,12 +116,23 @@ void hvm_isa_irq_deassert( void hvm_set_pci_link_route(struct domain *d, u8 link, u8 isa_irq); -void hvm_set_callback_irq_level(void); +void hvm_maybe_deassert_evtchn_irq(void); +void hvm_assert_evtchn_irq(struct vcpu *v); void hvm_set_callback_via(struct domain *d, uint64_t via); -int cpu_get_interrupt(struct vcpu *v, int *type); -int cpu_has_pending_irq(struct vcpu *v); -int get_isa_irq_vector(struct vcpu *vcpu, int irq, int type); +/* Check/Acknowledge next pending interrupt. */ +enum hvm_intack hvm_vcpu_has_pending_irq(struct vcpu *v); +int hvm_vcpu_ack_pending_irq( + struct vcpu *v, enum hvm_intack type, int *vector); + +int get_isa_irq_vector(struct vcpu *vcpu, int isa_irq, enum hvm_intack src); int is_isa_irq_masked(struct vcpu *v, int isa_irq); +/* + * Currently IA64 Xen doesn't support MSI. So for x86, we define this macro + * to control the conditional compilation of some MSI-related functions. + * This macro will be removed once IA64 has MSI support. + */ +#define SUPPORT_MSI_REMAPPING 1 + #endif /* __ASM_X86_HVM_IRQ_H__ */ diff -Naurp xen/include/asm-x86/hvm/support.h xen-redhat/include/asm-x86/hvm/support.h --- xen/include/asm-x86/hvm/support.h +++ xen-redhat/include/asm-x86/hvm/support.h @@ -210,6 +210,8 @@ __initcall(__hvm_register_##_x##_save_an /* Entry points for saving and restoring HVM domain state */ size_t hvm_save_size(struct domain *d); int hvm_save(struct domain *d, hvm_domain_context_t *h); +int hvm_save_one(struct domain *d, uint16_t typecode, uint16_t instance, + XEN_GUEST_HANDLE_64(uint8_t) handle); int hvm_load(struct domain *d, hvm_domain_context_t *h); /* End of save/restore */ diff -Naurp xen/include/asm-x86/hvm/svm/amd-iommu-acpi.h xen-redhat/include/asm-x86/hvm/svm/amd-iommu-acpi.h --- xen/include/asm-x86/hvm/svm/amd-iommu-acpi.h +++ xen-redhat/include/asm-x86/hvm/svm/amd-iommu-acpi.h @@ -0,0 +1,185 @@ +/* + * Copyright (C) 2007 Advanced Micro Devices, Inc. + * Author: Leo Duran <leo.duran@amd.com> + * Author: Wei Wang <wei.wang2@amd.com> - adapted to xen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _ASM_X86_64_AMD_IOMMU_ACPI_H +#define _ASM_X86_64_AMD_IOMMU_ACPI_H + +#include <xen/acpi.h> + +/* I/O Virtualization Reporting Structure */ +#define AMD_IOMMU_ACPI_IVRS_SIG "IVRS" +#define AMD_IOMMU_ACPI_IVHD_TYPE 0x10 +#define AMD_IOMMU_ACPI_IVMD_ALL_TYPE 0x20 +#define AMD_IOMMU_ACPI_IVMD_ONE_TYPE 0x21 +#define AMD_IOMMU_ACPI_IVMD_RANGE_TYPE 0x22 +#define AMD_IOMMU_ACPI_IVMD_IOMMU_TYPE 0x23 + +/* 4-byte Device Entries */ +#define AMD_IOMMU_ACPI_IVHD_DEV_U32_PAD 0 +#define AMD_IOMMU_ACPI_IVHD_DEV_SELECT 2 +#define AMD_IOMMU_ACPI_IVHD_DEV_RANGE_START 3 +#define AMD_IOMMU_ACPI_IVHD_DEV_RANGE_END 4 + +/* 8-byte Device Entries */ +#define AMD_IOMMU_ACPI_IVHD_DEV_U64_PAD 64 +#define AMD_IOMMU_ACPI_IVHD_DEV_ALIAS_SELECT 66 +#define AMD_IOMMU_ACPI_IVHD_DEV_ALIAS_RANGE 67 +#define AMD_IOMMU_ACPI_IVHD_DEV_EXT_SELECT 70 +#define AMD_IOMMU_ACPI_IVHD_DEV_EXT_RANGE 71 +#define AMD_IOMMU_ACPI_IVHD_DEV_SPECIAL 72 + +/* IVHD IOMMU Flags */ +#define AMD_IOMMU_ACPI_COHERENT_MASK 0x20 +#define AMD_IOMMU_ACPI_COHERENT_SHIFT 5 +#define AMD_IOMMU_ACPI_IOTLB_SUP_MASK 0x10 +#define AMD_IOMMU_ACPI_IOTLB_SUP_SHIFT 4 +#define AMD_IOMMU_ACPI_ISOC_MASK 0x08 +#define AMD_IOMMU_ACPI_ISOC_SHIFT 3 +#define AMD_IOMMU_ACPI_RES_PASS_PW_MASK 0x04 +#define AMD_IOMMU_ACPI_RES_PASS_PW_SHIFT 2 +#define AMD_IOMMU_ACPI_PASS_PW_MASK 0x02 +#define AMD_IOMMU_ACPI_PASS_PW_SHIFT 1 +#define AMD_IOMMU_ACPI_HT_TUN_ENB_MASK 0x01 +#define AMD_IOMMU_ACPI_HT_TUN_ENB_SHIFT 0 + +/* IVHD Device Flags */ +#define AMD_IOMMU_ACPI_LINT1_PASS_MASK 0x80 +#define AMD_IOMMU_ACPI_LINT1_PASS_SHIFT 7 +#define AMD_IOMMU_ACPI_LINT0_PASS_MASK 0x40 +#define AMD_IOMMU_ACPI_LINT0_PASS_SHIFT 6 +#define AMD_IOMMU_ACPI_SYS_MGT_MASK 0x30 +#define AMD_IOMMU_ACPI_SYS_MGT_SHIFT 4 +#define AMD_IOMMU_ACPI_NMI_PASS_MASK 0x04 +#define AMD_IOMMU_ACPI_NMI_PASS_SHIFT 2 +#define AMD_IOMMU_ACPI_EINT_PASS_MASK 0x02 +#define AMD_IOMMU_ACPI_EINT_PASS_SHIFT 1 +#define AMD_IOMMU_ACPI_INIT_PASS_MASK 0x01 +#define AMD_IOMMU_ACPI_INIT_PASS_SHIFT 0 + +/* IVHD Device Extended Flags */ +#define AMD_IOMMU_ACPI_ATS_DISABLED_MASK 0x80000000 +#define AMD_IOMMU_ACPI_ATS_DISABLED_SHIFT 31 + +/* IVMD Device Flags */ +#define AMD_IOMMU_ACPI_EXCLUSION_RANGE_MASK 0x08 +#define AMD_IOMMU_ACPI_EXCLUSION_RANGE_SHIFT 3 +#define AMD_IOMMU_ACPI_IW_PERMISSION_MASK 0x04 +#define AMD_IOMMU_ACPI_IW_PERMISSION_SHIFT 2 +#define AMD_IOMMU_ACPI_IR_PERMISSION_MASK 0x02 +#define AMD_IOMMU_ACPI_IR_PERMISSION_SHIFT 1 +#define AMD_IOMMU_ACPI_UNITY_MAPPING_MASK 0x01 +#define AMD_IOMMU_ACPI_UNITY_MAPPING_SHIFT 0 + +#define ACPI_OEM_ID_SIZE 6 +#define ACPI_OEM_TABLE_ID_SIZE 8 + +#pragma pack(1) +struct acpi_ivrs_table_header { + struct acpi_table_header acpi_header; + u32 io_info; + u8 reserved[8]; +}; + +struct acpi_ivrs_block_header { + u8 type; + u8 flags; + u16 length; + u16 dev_id; +}; + +struct acpi_ivhd_block_header { + struct acpi_ivrs_block_header header; + u16 cap_offset; + u64 mmio_base; + u16 pci_segment; + u16 iommu_info; + u8 reserved[4]; +}; + +struct acpi_ivhd_device_header { + u8 type; + u16 dev_id; + u8 flags; +}; + +struct acpi_ivhd_device_trailer { + u8 type; + u16 dev_id; + u8 reserved; +}; + +struct acpi_ivhd_device_range { + struct acpi_ivhd_device_header header; + struct acpi_ivhd_device_trailer trailer; +}; + +struct acpi_ivhd_device_alias { + struct 
acpi_ivhd_device_header header; + u8 reserved1; + u16 dev_id; + u8 reserved2; +}; + +struct acpi_ivhd_device_alias_range { + struct acpi_ivhd_device_alias alias; + struct acpi_ivhd_device_trailer trailer; +}; + +struct acpi_ivhd_device_extended { + struct acpi_ivhd_device_header header; + u32 ext_flags; +}; + +struct acpi_ivhd_device_extended_range { + struct acpi_ivhd_device_extended extended; + struct acpi_ivhd_device_trailer trailer; +}; + +struct acpi_ivhd_device_special { + struct acpi_ivhd_device_header header; + u8 handle; + u16 dev_id; + u8 variety; +}; + +union acpi_ivhd_device { + struct acpi_ivhd_device_header header; + struct acpi_ivhd_device_range range; + struct acpi_ivhd_device_alias alias; + struct acpi_ivhd_device_alias_range alias_range; + struct acpi_ivhd_device_extended extended; + struct acpi_ivhd_device_extended_range extended_range; + struct acpi_ivhd_device_special special; +}; + +struct acpi_ivmd_block_header { + struct acpi_ivrs_block_header header; + union { + u16 last_dev_id; + u16 cap_offset; + u16 reserved1; + }; + u64 reserved2; + u64 start_addr; + u64 mem_length; +}; +#pragma pack() + +#endif /* _ASM_X86_64_AMD_IOMMU_ACPI_H */ diff -Naurp xen/include/asm-x86/hvm/svm/amd-iommu-defs.h xen-redhat/include/asm-x86/hvm/svm/amd-iommu-defs.h --- xen/include/asm-x86/hvm/svm/amd-iommu-defs.h +++ xen-redhat/include/asm-x86/hvm/svm/amd-iommu-defs.h @@ -0,0 +1,415 @@ +/* + * Copyright (C) 2007 Advanced Micro Devices, Inc. + * Author: Leo Duran <leo.duran@amd.com> + * Author: Wei Wang <wei.wang2@amd.com> - adapted to xen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _ASM_X86_64_AMD_IOMMU_DEFS_H +#define _ASM_X86_64_AMD_IOMMU_DEFS_H + +/* IOMMU Command Buffer entries: in power of 2 increments, minimum of 256 */ +#define IOMMU_CMD_BUFFER_DEFAULT_ENTRIES 512 + +/* IOMMU Event Log entries: in power of 2 increments, minimum of 256 */ +#define IOMMU_EVENT_LOG_DEFAULT_ENTRIES 512 + +#define PTE_PER_TABLE_SHIFT 9 +#define PTE_PER_TABLE_SIZE (1 << PTE_PER_TABLE_SHIFT) +#define PTE_PER_TABLE_MASK (~(PTE_PER_TABLE_SIZE - 1)) +#define PTE_PER_TABLE_ALIGN(entries) \ + (((entries) + PTE_PER_TABLE_SIZE - 1) & PTE_PER_TABLE_MASK) +#define PTE_PER_TABLE_ALLOC(entries) \ + PAGE_SIZE * (PTE_PER_TABLE_ALIGN(entries) >> PTE_PER_TABLE_SHIFT) + +#define PCI_MIN_CAP_OFFSET 0x40 +#define PCI_MAX_CAP_BLOCKS 48 +#define PCI_CAP_PTR_MASK 0xFC + +/* IOMMU Capability */ +#define PCI_CAP_ID_MASK 0x000000FF +#define PCI_CAP_ID_SHIFT 0 +#define PCI_CAP_NEXT_PTR_MASK 0x0000FF00 +#define PCI_CAP_NEXT_PTR_SHIFT 8 +#define PCI_CAP_TYPE_MASK 0x00070000 +#define PCI_CAP_TYPE_SHIFT 16 +#define PCI_CAP_REV_MASK 0x00F80000 +#define PCI_CAP_REV_SHIFT 19 +#define PCI_CAP_IOTLB_MASK 0x01000000 +#define PCI_CAP_IOTLB_SHIFT 24 +#define PCI_CAP_HT_TUNNEL_MASK 0x02000000 +#define PCI_CAP_HT_TUNNEL_SHIFT 25 +#define PCI_CAP_NP_CACHE_MASK 0x04000000 +#define PCI_CAP_NP_CACHE_SHIFT 26 +#define PCI_CAP_RESET_MASK 0x80000000 +#define PCI_CAP_RESET_SHIFT 31 + +#define PCI_CAP_TYPE_IOMMU 0x3 + +#define PCI_CAP_MMIO_BAR_LOW_OFFSET 0x04 +#define PCI_CAP_MMIO_BAR_HIGH_OFFSET 0x08 +#define PCI_CAP_MMIO_BAR_LOW_MASK 0xFFFFC000 +#define IOMMU_MMIO_REGION_LENGTH 0x4000 + +#define PCI_CAP_RANGE_OFFSET 0x0C +#define PCI_CAP_BUS_NUMBER_MASK 0x0000FF00 +#define PCI_CAP_BUS_NUMBER_SHIFT 8 +#define PCI_CAP_FIRST_DEVICE_MASK 0x00FF0000 +#define PCI_CAP_FIRST_DEVICE_SHIFT 16 +#define PCI_CAP_LAST_DEVICE_MASK 0xFF000000 +#define PCI_CAP_LAST_DEVICE_SHIFT 24 + +#define PCI_CAP_UNIT_ID_MASK 0x0000001F +#define PCI_CAP_UNIT_ID_SHIFT 0 +#define PCI_MISC_INFO_OFFSET 0x10 +#define PCI_CAP_MSI_NUMBER_MASK 0x0000001F +#define PCI_CAP_MSI_NUMBER_SHIFT 0 + +/* Device Table */ +#define IOMMU_DEV_TABLE_BASE_LOW_OFFSET 0x00 +#define IOMMU_DEV_TABLE_BASE_HIGH_OFFSET 0x04 +#define IOMMU_DEV_TABLE_BASE_LOW_MASK 0xFFFFF000 +#define IOMMU_DEV_TABLE_BASE_LOW_SHIFT 12 +#define IOMMU_DEV_TABLE_BASE_HIGH_MASK 0x000FFFFF +#define IOMMU_DEV_TABLE_BASE_HIGH_SHIFT 0 +#define IOMMU_DEV_TABLE_SIZE_MASK 0x000001FF +#define IOMMU_DEV_TABLE_SIZE_SHIFT 0 + +#define IOMMU_DEV_TABLE_ENTRIES_PER_BUS 256 +#define IOMMU_DEV_TABLE_ENTRY_SIZE 32 +#define IOMMU_DEV_TABLE_U32_PER_ENTRY (IOMMU_DEV_TABLE_ENTRY_SIZE / 4) + +#define IOMMU_DEV_TABLE_SYS_MGT_DMA_ABORTED 0x0 +#define IOMMU_DEV_TABLE_SYS_MGT_MSG_FORWARDED 0x1 +#define IOMMU_DEV_TABLE_SYS_MGT_INT_FORWARDED 0x2 +#define IOMMU_DEV_TABLE_SYS_MGT_DMA_FORWARDED 0x3 + +#define IOMMU_DEV_TABLE_IO_CONTROL_ABORTED 0x0 +#define IOMMU_DEV_TABLE_IO_CONTROL_FORWARDED 0x1 +#define IOMMU_DEV_TABLE_IO_CONTROL_TRANSLATED 0x2 + +#define IOMMU_DEV_TABLE_INT_CONTROL_ABORTED 0x0 +#define IOMMU_DEV_TABLE_INT_CONTROL_FORWARDED 0x1 +#define IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED 0x2 + +/* DeviceTable Entry[31:0] */ +#define IOMMU_DEV_TABLE_VALID_MASK 0x00000001 +#define IOMMU_DEV_TABLE_VALID_SHIFT 0 +#define IOMMU_DEV_TABLE_TRANSLATION_VALID_MASK 0x00000002 +#define 
IOMMU_DEV_TABLE_TRANSLATION_VALID_SHIFT 1 +#define IOMMU_DEV_TABLE_PAGING_MODE_MASK 0x00000E00 +#define IOMMU_DEV_TABLE_PAGING_MODE_SHIFT 9 +#define IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_MASK 0xFFFFF000 +#define IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_SHIFT 12 + +/* DeviceTable Entry[63:32] */ +#define IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_MASK 0x000FFFFF +#define IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_SHIFT 0 +#define IOMMU_DEV_TABLE_IO_READ_PERMISSION_MASK 0x20000000 +#define IOMMU_DEV_TABLE_IO_READ_PERMISSION_SHIFT 29 +#define IOMMU_DEV_TABLE_IO_WRITE_PERMISSION_MASK 0x40000000 +#define IOMMU_DEV_TABLE_IO_WRITE_PERMISSION_SHIFT 30 + +/* DeviceTable Entry[95:64] */ +#define IOMMU_DEV_TABLE_DOMAIN_ID_MASK 0x0000FFFF +#define IOMMU_DEV_TABLE_DOMAIN_ID_SHIFT 0 + +/* DeviceTable Entry[127:96] */ +#define IOMMU_DEV_TABLE_IOTLB_SUPPORT_MASK 0x00000001 +#define IOMMU_DEV_TABLE_IOTLB_SUPPORT_SHIFT 0 +#define IOMMU_DEV_TABLE_SUPRESS_LOGGED_PAGES_MASK 0x00000002 +#define IOMMU_DEV_TABLE_SUPRESS_LOGGED_PAGES_SHIFT 1 +#define IOMMU_DEV_TABLE_SUPRESS_ALL_PAGES_MASK 0x00000004 +#define IOMMU_DEV_TABLE_SUPRESS_ALL_PAGES_SHIFT 2 +#define IOMMU_DEV_TABLE_IO_CONTROL_MASK 0x00000018 +#define IOMMU_DEV_TABLE_IO_CONTROL_SHIFT 3 +#define IOMMU_DEV_TABLE_IOTLB_CACHE_HINT_MASK 0x00000020 +#define IOMMU_DEV_TABLE_IOTLB_CACHE_HINT_SHIFT 5 +#define IOMMU_DEV_TABLE_SNOOP_DISABLE_MASK 0x00000040 +#define IOMMU_DEV_TABLE_SNOOP_DISABLE_SHIFT 6 +#define IOMMU_DEV_TABLE_ALLOW_EXCLUSION_MASK 0x00000080 +#define IOMMU_DEV_TABLE_ALLOW_EXCLUSION_SHIFT 7 +#define IOMMU_DEV_TABLE_SYS_MGT_MSG_ENABLE_MASK 0x00000300 +#define IOMMU_DEV_TABLE_SYS_MGT_MSG_ENABLE_SHIFT 8 + +/* DeviceTable Entry[159:128] */ +#define IOMMU_DEV_TABLE_INT_VALID_MASK 0x00000001 +#define IOMMU_DEV_TABLE_INT_VALID_SHIFT 0 +#define IOMMU_DEV_TABLE_INT_TABLE_LENGTH_MASK 0x0000001E +#define IOMMU_DEV_TABLE_INT_TABLE_LENGTH_SHIFT 1 +#define IOMMU_DEV_TABLE_INT_TABLE_IGN_UNMAPPED_MASK 0x0000000020 +#define IOMMU_DEV_TABLE_INT_TABLE_IGN_UNMAPPED_SHIFT 5 +#define IOMMU_DEV_TABLE_INT_TABLE_PTR_LOW_MASK 0xFFFFFFC0 +#define IOMMU_DEV_TABLE_INT_TABLE_PTR_LOW_SHIFT 6 + +/* DeviceTable Entry[191:160] */ +#define IOMMU_DEV_TABLE_INT_TABLE_PTR_HIGH_MASK 0x000FFFFF +#define IOMMU_DEV_TABLE_INT_TABLE_PTR_HIGH_SHIFT 0 +#define IOMMU_DEV_TABLE_INIT_PASSTHRU_MASK 0x01000000 +#define IOMMU_DEV_TABLE_INIT_PASSTHRU_SHIFT 24 +#define IOMMU_DEV_TABLE_EINT_PASSTHRU_MASK 0x02000000 +#define IOMMU_DEV_TABLE_EINT_PASSTHRU_SHIFT 25 +#define IOMMU_DEV_TABLE_NMI_PASSTHRU_MASK 0x04000000 +#define IOMMU_DEV_TABLE_NMI_PASSTHRU_SHIFT 26 +#define IOMMU_DEV_TABLE_INT_CONTROL_MASK 0x30000000 +#define IOMMU_DEV_TABLE_INT_CONTROL_SHIFT 28 +#define IOMMU_DEV_TABLE_LINT0_ENABLE_MASK 0x40000000 +#define IOMMU_DEV_TABLE_LINT0_ENABLE_SHIFT 30 +#define IOMMU_DEV_TABLE_LINT1_ENABLE_MASK 0x80000000 +#define IOMMU_DEV_TABLE_LINT1_ENABLE_SHIFT 31 + +/* Command Buffer */ +#define IOMMU_CMD_BUFFER_BASE_LOW_OFFSET 0x08 +#define IOMMU_CMD_BUFFER_BASE_HIGH_OFFSET 0x0C +#define IOMMU_CMD_BUFFER_HEAD_OFFSET 0x2000 +#define IOMMU_CMD_BUFFER_TAIL_OFFSET 0x2008 +#define IOMMU_CMD_BUFFER_BASE_LOW_MASK 0xFFFFF000 +#define IOMMU_CMD_BUFFER_BASE_LOW_SHIFT 12 +#define IOMMU_CMD_BUFFER_BASE_HIGH_MASK 0x000FFFFF +#define IOMMU_CMD_BUFFER_BASE_HIGH_SHIFT 0 +#define IOMMU_CMD_BUFFER_LENGTH_MASK 0x0F000000 +#define IOMMU_CMD_BUFFER_LENGTH_SHIFT 24 +#define IOMMU_CMD_BUFFER_HEAD_MASK 0x0007FFF0 +#define IOMMU_CMD_BUFFER_HEAD_SHIFT 4 +#define IOMMU_CMD_BUFFER_TAIL_MASK 0x0007FFF0 +#define IOMMU_CMD_BUFFER_TAIL_SHIFT 4 + +#define 
IOMMU_CMD_BUFFER_ENTRY_SIZE 16 +#define IOMMU_CMD_BUFFER_POWER_OF2_ENTRIES_PER_PAGE 8 +#define IOMMU_CMD_BUFFER_U32_PER_ENTRY (IOMMU_CMD_BUFFER_ENTRY_SIZE / 4) + +#define IOMMU_CMD_OPCODE_MASK 0xF0000000 +#define IOMMU_CMD_OPCODE_SHIFT 28 +#define IOMMU_CMD_COMPLETION_WAIT 0x1 +#define IOMMU_CMD_INVALIDATE_DEVTAB_ENTRY 0x2 +#define IOMMU_CMD_INVALIDATE_IOMMU_PAGES 0x3 +#define IOMMU_CMD_INVALIDATE_IOTLB_PAGES 0x4 +#define IOMMU_CMD_INVALIDATE_INT_TABLE 0x5 + +/* COMPLETION_WAIT command */ +#define IOMMU_COMP_WAIT_DATA_BUFFER_SIZE 8 +#define IOMMU_COMP_WAIT_DATA_BUFFER_ALIGNMENT 8 +#define IOMMU_COMP_WAIT_S_FLAG_MASK 0x00000001 +#define IOMMU_COMP_WAIT_S_FLAG_SHIFT 0 +#define IOMMU_COMP_WAIT_I_FLAG_MASK 0x00000002 +#define IOMMU_COMP_WAIT_I_FLAG_SHIFT 1 +#define IOMMU_COMP_WAIT_F_FLAG_MASK 0x00000004 +#define IOMMU_COMP_WAIT_F_FLAG_SHIFT 2 +#define IOMMU_COMP_WAIT_ADDR_LOW_MASK 0xFFFFFFF8 +#define IOMMU_COMP_WAIT_ADDR_LOW_SHIFT 3 +#define IOMMU_COMP_WAIT_ADDR_HIGH_MASK 0x000FFFFF +#define IOMMU_COMP_WAIT_ADDR_HIGH_SHIFT 0 + +/* INVALIDATE_IOMMU_PAGES command */ +#define IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_MASK 0x0000FFFF +#define IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_SHIFT 0 +#define IOMMU_INV_IOMMU_PAGES_S_FLAG_MASK 0x00000001 +#define IOMMU_INV_IOMMU_PAGES_S_FLAG_SHIFT 0 +#define IOMMU_INV_IOMMU_PAGES_PDE_FLAG_MASK 0x00000002 +#define IOMMU_INV_IOMMU_PAGES_PDE_FLAG_SHIFT 1 +#define IOMMU_INV_IOMMU_PAGES_ADDR_LOW_MASK 0xFFFFF000 +#define IOMMU_INV_IOMMU_PAGES_ADDR_LOW_SHIFT 12 +#define IOMMU_INV_IOMMU_PAGES_ADDR_HIGH_MASK 0xFFFFFFFF +#define IOMMU_INV_IOMMU_PAGES_ADDR_HIGH_SHIFT 0 + +/* INVALIDATE_DEVTAB_ENTRY command */ +#define IOMMU_INV_DEVTAB_ENTRY_DEVICE_ID_MASK 0x0000FFFF +#define IOMMU_INV_DEVTAB_ENTRY_DEVICE_ID_SHIFT 0 + +/* INVALIDATE_INTERRUPT_TABLE command */ +#define IOMMU_INV_INT_TABLE_DEVICE_ID_MASK 0x0000FFFF +#define IOMMU_INV_INT_TABLE_DEVICE_ID_SHIFT 0 + +/* Event Log */ +#define IOMMU_EVENT_LOG_BASE_LOW_OFFSET 0x10 +#define IOMMU_EVENT_LOG_BASE_HIGH_OFFSET 0x14 +#define IOMMU_EVENT_LOG_HEAD_OFFSET 0x2010 +#define IOMMU_EVENT_LOG_TAIL_OFFSET 0x2018 +#define IOMMU_EVENT_LOG_BASE_LOW_MASK 0xFFFFF000 +#define IOMMU_EVENT_LOG_BASE_LOW_SHIFT 12 +#define IOMMU_EVENT_LOG_BASE_HIGH_MASK 0x000FFFFF +#define IOMMU_EVENT_LOG_BASE_HIGH_SHIFT 0 +#define IOMMU_EVENT_LOG_LENGTH_MASK 0x0F000000 +#define IOMMU_EVENT_LOG_LENGTH_SHIFT 24 +#define IOMMU_EVENT_LOG_HEAD_MASK 0x0007FFF0 +#define IOMMU_EVENT_LOG_HEAD_SHIFT 4 +#define IOMMU_EVENT_LOG_TAIL_MASK 0x0007FFF0 +#define IOMMU_EVENT_LOG_TAIL_SHIFT 4 + +#define IOMMU_EVENT_LOG_ENTRY_SIZE 16 +#define IOMMU_EVENT_LOG_POWER_OF2_ENTRIES_PER_PAGE 8 +#define IOMMU_EVENT_LOG_U32_PER_ENTRY (IOMMU_EVENT_LOG_ENTRY_SIZE / 4) + +#define IOMMU_EVENT_CODE_MASK 0xF0000000 +#define IOMMU_EVENT_CODE_SHIFT 28 +#define IOMMU_EVENT_ILLEGAL_DEV_TABLE_ENTRY 0x1 +#define IOMMU_EVENT_IO_PAGE_FALT 0x2 +#define IOMMU_EVENT_DEV_TABLE_HW_ERROR 0x3 +#define IOMMU_EVENT_PAGE_TABLE_HW_ERROR 0x4 +#define IOMMU_EVENT_ILLEGAL_COMMAND_ERROR 0x5 +#define IOMMU_EVENT_COMMAND_HW_ERROR 0x6 +#define IOMMU_EVENT_IOTLB_INV_TIMEOUT 0x7 +#define IOMMU_EVENT_INVALID_DEV_REQUEST 0x8 + +#define IOMMU_EVENT_DOMAIN_ID_MASK 0x0000FFFF +#define IOMMU_EVENT_DOMAIN_ID_SHIFT 0 +#define IOMMU_EVENT_DEVICE_ID_MASK 0x0000FFFF +#define IOMMU_EVENT_DEVICE_ID_SHIFT 0 + +/* Control Register */ +#define IOMMU_CONTROL_MMIO_OFFSET 0x18 +#define IOMMU_CONTROL_TRANSLATION_ENABLE_MASK 0x00000001 +#define IOMMU_CONTROL_TRANSLATION_ENABLE_SHIFT 0 +#define IOMMU_CONTROL_HT_TUNNEL_TRANSLATION_MASK 0x00000002 +#define 
IOMMU_CONTROL_HT_TUNNEL_TRANSLATION_SHIFT 1 +#define IOMMU_CONTROL_EVENT_LOG_ENABLE_MASK 0x00000004 +#define IOMMU_CONTROL_EVENT_LOG_ENABLE_SHIFT 2 +#define IOMMU_CONTROL_EVENT_LOG_INT_MASK 0x00000008 +#define IOMMU_CONTROL_EVENT_LOG_INT_SHIFT 3 +#define IOMMU_CONTROL_COMP_WAIT_INT_MASK 0x00000010 +#define IOMMU_CONTROL_COMP_WAIT_INT_SHIFT 4 +#define IOMMU_CONTROL_TRANSLATION_CHECK_DISABLE_MASK 0x00000020 +#define IOMMU_CONTROL_TRANSLATION_CHECK_DISABLE_SHIFT 5 +#define IOMMU_CONTROL_INVALIDATION_TIMEOUT_MASK 0x000000C0 +#define IOMMU_CONTROL_INVALIDATION_TIMEOUT_SHIFT 6 +#define IOMMU_CONTROL_PASS_POSTED_WRITE_MASK 0x00000100 +#define IOMMU_CONTROL_PASS_POSTED_WRITE_SHIFT 8 +#define IOMMU_CONTROL_RESP_PASS_POSTED_WRITE_MASK 0x00000200 +#define IOMMU_CONTROL_RESP_PASS_POSTED_WRITE_SHIFT 9 +#define IOMMU_CONTROL_COHERENT_MASK 0x00000400 +#define IOMMU_CONTROL_COHERENT_SHIFT 10 +#define IOMMU_CONTROL_ISOCHRONOUS_MASK 0x00000800 +#define IOMMU_CONTROL_ISOCHRONOUS_SHIFT 11 +#define IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_MASK 0x00001000 +#define IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_SHIFT 12 +#define IOMMU_CONTROL_RESTART_MASK 0x80000000 +#define IOMMU_CONTROL_RESTART_SHIFT 31 + +/* Exclusion Register */ +#define IOMMU_EXCLUSION_BASE_LOW_OFFSET 0x20 +#define IOMMU_EXCLUSION_BASE_HIGH_OFFSET 0x24 +#define IOMMU_EXCLUSION_LIMIT_LOW_OFFSET 0x28 +#define IOMMU_EXCLUSION_LIMIT_HIGH_OFFSET 0x2C +#define IOMMU_EXCLUSION_BASE_LOW_MASK 0xFFFFF000 +#define IOMMU_EXCLUSION_BASE_LOW_SHIFT 12 +#define IOMMU_EXCLUSION_BASE_HIGH_MASK 0xFFFFFFFF +#define IOMMU_EXCLUSION_BASE_HIGH_SHIFT 0 +#define IOMMU_EXCLUSION_RANGE_ENABLE_MASK 0x00000001 +#define IOMMU_EXCLUSION_RANGE_ENABLE_SHIFT 0 +#define IOMMU_EXCLUSION_ALLOW_ALL_MASK 0x00000002 +#define IOMMU_EXCLUSION_ALLOW_ALL_SHIFT 1 +#define IOMMU_EXCLUSION_LIMIT_LOW_MASK 0xFFFFF000 +#define IOMMU_EXCLUSION_LIMIT_LOW_SHIFT 12 +#define IOMMU_EXCLUSION_LIMIT_HIGH_MASK 0xFFFFFFFF +#define IOMMU_EXCLUSION_LIMIT_HIGH_SHIFT 0 + +/* Status Register*/ +#define IOMMU_STATUS_MMIO_OFFSET 0x2020 +#define IOMMU_STATUS_EVENT_OVERFLOW_MASK 0x00000001 +#define IOMMU_STATUS_EVENT_OVERFLOW_SHIFT 0 +#define IOMMU_STATUS_EVENT_LOG_INT_MASK 0x00000002 +#define IOMMU_STATUS_EVENT_LOG_INT_SHIFT 1 +#define IOMMU_STATUS_COMP_WAIT_INT_MASK 0x00000004 +#define IOMMU_STATUS_COMP_WAIT_INT_SHIFT 2 +#define IOMMU_STATUS_EVENT_LOG_RUN_MASK 0x00000008 +#define IOMMU_STATUS_EVENT_LOG_RUN_SHIFT 3 +#define IOMMU_STATUS_CMD_BUFFER_RUN_MASK 0x00000010 +#define IOMMU_STATUS_CMD_BUFFER_RUN_SHIFT 4 + +/* I/O Page Table */ +#define IOMMU_PAGE_TABLE_ENTRY_SIZE 8 +#define IOMMU_PAGE_TABLE_U32_PER_ENTRY (IOMMU_PAGE_TABLE_ENTRY_SIZE / 4) +#define IOMMU_PAGE_TABLE_ALIGNMENT 4096 + +#define IOMMU_PTE_PRESENT_MASK 0x00000001 +#define IOMMU_PTE_PRESENT_SHIFT 0 +#define IOMMU_PTE_NEXT_LEVEL_MASK 0x00000E00 +#define IOMMU_PTE_NEXT_LEVEL_SHIFT 9 +#define IOMMU_PTE_ADDR_LOW_MASK 0xFFFFF000 +#define IOMMU_PTE_ADDR_LOW_SHIFT 12 +#define IOMMU_PTE_ADDR_HIGH_MASK 0x000FFFFF +#define IOMMU_PTE_ADDR_HIGH_SHIFT 0 +#define IOMMU_PTE_U_MASK 0x08000000 +#define IOMMU_PTE_U_SHIFT 7 +#define IOMMU_PTE_FC_MASK 0x10000000 +#define IOMMU_PTE_FC_SHIFT 28 +#define IOMMU_PTE_IO_READ_PERMISSION_MASK 0x20000000 +#define IOMMU_PTE_IO_READ_PERMISSION_SHIFT 29 +#define IOMMU_PTE_IO_WRITE_PERMISSION_MASK 0x40000000 +#define IOMMU_PTE_IO_WRITE_PERMISSION_SHIFT 30 + +/* I/O Page Directory */ +#define IOMMU_PAGE_DIRECTORY_ENTRY_SIZE 8 +#define IOMMU_PAGE_DIRECTORY_ALIGNMENT 4096 +#define IOMMU_PDE_PRESENT_MASK 0x00000001 +#define 
IOMMU_PDE_PRESENT_SHIFT 0 +#define IOMMU_PDE_NEXT_LEVEL_MASK 0x00000E00 +#define IOMMU_PDE_NEXT_LEVEL_SHIFT 9 +#define IOMMU_PDE_ADDR_LOW_MASK 0xFFFFF000 +#define IOMMU_PDE_ADDR_LOW_SHIFT 12 +#define IOMMU_PDE_ADDR_HIGH_MASK 0x000FFFFF +#define IOMMU_PDE_ADDR_HIGH_SHIFT 0 +#define IOMMU_PDE_IO_READ_PERMISSION_MASK 0x20000000 +#define IOMMU_PDE_IO_READ_PERMISSION_SHIFT 29 +#define IOMMU_PDE_IO_WRITE_PERMISSION_MASK 0x40000000 +#define IOMMU_PDE_IO_WRITE_PERMISSION_SHIFT 30 + +/* Paging modes */ +#define IOMMU_PAGING_MODE_DISABLED 0x0 +#define IOMMU_PAGING_MODE_LEVEL_0 0x0 +#define IOMMU_PAGING_MODE_LEVEL_1 0x1 +#define IOMMU_PAGING_MODE_LEVEL_2 0x2 +#define IOMMU_PAGING_MODE_LEVEL_3 0x3 +#define IOMMU_PAGING_MODE_LEVEL_4 0x4 +#define IOMMU_PAGING_MODE_LEVEL_5 0x5 +#define IOMMU_PAGING_MODE_LEVEL_6 0x6 +#define IOMMU_PAGING_MODE_LEVEL_7 0x7 + +/* Flags */ +#define IOMMU_CONTROL_DISABLED 0 +#define IOMMU_CONTROL_ENABLED 1 + +#define MMIO_PAGES_PER_IOMMU (IOMMU_MMIO_REGION_LENGTH / PAGE_SIZE_4K) +#define IOMMU_PAGES (MMIO_PAGES_PER_IOMMU * MAX_AMD_IOMMUS) +#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48 +#define MAX_AMD_IOMMUS 32 +#define IOMMU_PAGE_TABLE_LEVEL_3 3 +#define IOMMU_PAGE_TABLE_LEVEL_4 4 +#define IOMMU_IO_WRITE_ENABLED 1 +#define IOMMU_IO_READ_ENABLED 1 +#define HACK_BIOS_SETTINGS 0 + +/* interrupt remapping table */ +#define INT_REMAP_INDEX_DM_MASK 0x1C00 +#define INT_REMAP_INDEX_DM_SHIFT 10 +#define INT_REMAP_INDEX_VECTOR_MASK 0x3FC +#define INT_REMAP_INDEX_VECTOR_SHIFT 2 +#define INT_REMAP_ENTRY_REMAPEN_MASK 0x00000001 +#define INT_REMAP_ENTRY_REMAPEN_SHIFT 0 +#define INT_REMAP_ENTRY_SUPIOPF_MASK 0x00000002 +#define INT_REMAP_ENTRY_SUPIOPF_SHIFT 1 +#define INT_REMAP_ENTRY_INTTYPE_MASK 0x0000001C +#define INT_REMAP_ENTRY_INTTYPE_SHIFT 2 +#define INT_REMAP_ENTRY_REQEOI_MASK 0x00000020 +#define INT_REMAP_ENTRY_REQEOI_SHIFT 5 +#define INT_REMAP_ENTRY_DM_MASK 0x00000040 +#define INT_REMAP_ENTRY_DM_SHIFT 6 +#define INT_REMAP_ENTRY_DEST_MAST 0x0000FF00 +#define INT_REMAP_ENTRY_DEST_SHIFT 8 +#define INT_REMAP_ENTRY_VECTOR_MASK 0x00FF0000 +#define INT_REMAP_ENTRY_VECTOR_SHIFT 16 + +#endif /* _ASM_X86_64_AMD_IOMMU_DEFS_H */ diff -Naurp xen/include/asm-x86/hvm/svm/amd-iommu-proto.h xen-redhat/include/asm-x86/hvm/svm/amd-iommu-proto.h --- xen/include/asm-x86/hvm/svm/amd-iommu-proto.h +++ xen-redhat/include/asm-x86/hvm/svm/amd-iommu-proto.h @@ -0,0 +1,162 @@ +/* + * Copyright (C) 2007 Advanced Micro Devices, Inc. + * Author: Leo Duran <leo.duran@amd.com> + * Author: Wei Wang <wei.wang2@amd.com> - adapted to xen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _ASM_X86_64_AMD_IOMMU_PROTO_H +#define _ASM_X86_64_AMD_IOMMU_PROTO_H + +#include <xen/sched.h> +#include <asm/amd-iommu.h> +#include <xen/domain_page.h> + +#define for_each_amd_iommu(amd_iommu) \ + list_for_each_entry(amd_iommu, \ + &amd_iommu_head, list) + +#define DMA_32BIT_MASK 0x00000000ffffffffULL +#define PAGE_ALIGN(addr) (((addr) + PAGE_SIZE - 1) & PAGE_MASK) + +extern int amd_iommu_debug; +extern int amd_iommu_perdev_intremap; + +#define AMD_IOMMU_DEBUG(fmt, args...) \ + do \ + { \ + if ( amd_iommu_debug ) \ + printk(XENLOG_INFO "AMD-Vi: " fmt, ## args); \ + } while(0) + +/* amd-iommu-detect functions */ +int __init amd_iommu_get_ivrs_dev_entries(void); +int __init amd_iommu_detect_one_acpi(void *ivhd); +int __init amd_iommu_detect_acpi(void); + +/* amd-iommu-init functions */ +int __init amd_iommu_init(void); +int __init amd_iommu_init_one(struct amd_iommu *iommu); +int __init amd_iommu_update_ivrs_mapping_acpi(void); +void __init amd_iommu_init_cleanup(void); +int __init amd_iommu_setup_shared_tables(void); + +/* mapping functions */ +int amd_iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn); +int amd_iommu_unmap_page(struct domain *d, unsigned long gfn); +u64 amd_iommu_get_next_table_from_pte(u32 *entry); +int amd_iommu_reserve_domain_unity_map(struct domain *domain, + unsigned long phys_addr, unsigned long size, int iw, int ir); +int amd_iommu_sync_p2m(struct domain *d); +void invalidate_all_iommu_pages(struct domain *d); + +/* device table functions */ +void amd_iommu_add_dev_table_entry( + u32 *dte, u8 sys_mgt, u8 dev_ex, u8 lint1_pass, u8 lint0_pass, + u8 nmi_pass, u8 ext_int_pass, u8 init_pass); +int amd_iommu_is_dte_page_translation_valid(u32 *entry); +int get_dma_requestor_id(u16 bdf); +void invalidate_dev_table_entry(struct amd_iommu *iommu, u16 devic_id); +void amd_iommu_set_intremap_table( + u32 *dte, u64 intremap_ptr, u8 int_valid); +void amd_iommu_set_root_page_table( + u32 *dte, u64 root_ptr, u16 domain_id, u8 paging_mode, u8 valid); + + +/* send cmd to iommu */ +int send_iommu_command(struct amd_iommu *iommu, u32 cmd[]); +void flush_command_buffer(struct amd_iommu *iommu); + +/* find iommu for bdf */ +struct amd_iommu *find_iommu_for_device(int bus, int devfn); + +/*interrupt remapping */ +int __init amd_iommu_setup_intremap_table(void); +void __init amd_iommu_free_intremap_table(int bdf); +void invalidate_interrupt_table(struct amd_iommu *iommu, u16 device_id); +void amd_iommu_ioapic_update_ire(unsigned int apic, unsigned int reg, unsigned + int value); +void amd_iommu_msi_msg_update_ire(struct msi_desc *msi_desc, struct msi_msg *msg); +void * __init amd_iommu_alloc_intremap_table(void); +int __init amd_iommu_setup_ioapic_remapping(void); +void*__init amd_iommu_alloc_intremap_table(void); +void __init amd_iommu_free_intremap_table(int bdf); + + +static inline u32 get_field_from_reg_u32(u32 reg_value, u32 mask, u32 shift) +{ + u32 field; + field = (reg_value & mask) >> shift; + return field; +} + +static inline u32 set_field_in_reg_u32(u32 field, u32 reg_value, + u32 mask, u32 shift, u32 *reg) +{ + reg_value &= ~mask; + reg_value |= (field << shift) & mask; + if (reg) + *reg = reg_value; + return reg_value; +} + +static inline u8 get_field_from_byte(u8 value, u8 mask, u8 shift) +{ + u8 field; + field = (value & mask) 
>> shift; + return field; +} + +static inline unsigned long region_to_pages(unsigned long addr, unsigned long size) +{ + return (PAGE_ALIGN(addr + size) - (addr & PAGE_MASK)) >> PAGE_SHIFT; +} + +static inline struct page_info* alloc_amd_iommu_pgtable(void) +{ + struct page_info *pg; + void *vaddr; + + pg = alloc_domheap_page(NULL); + if ( pg == NULL ) + return 0; + vaddr = map_domain_page(page_to_mfn(pg)); + if ( !vaddr ) + return 0; + memset(vaddr, 0, PAGE_SIZE); + unmap_domain_page(vaddr); + return pg; +} + +static inline void free_amd_iommu_pgtable(struct page_info *pg) +{ + if ( pg != 0 ) + free_domheap_page(pg); +} + +static inline void* __alloc_amd_iommu_tables(int order) +{ + void *buf; + buf = alloc_xenheap_pages(order); + return buf; +} + +static inline void __free_amd_iommu_tables(void *table, int order) +{ + free_xenheap_pages(table, order); +} + +#endif /* _ASM_X86_64_AMD_IOMMU_PROTO_H */ diff -Naurp xen/include/asm-x86/hvm/svm/emulate.h xen-redhat/include/asm-x86/hvm/svm/emulate.h --- xen/include/asm-x86/hvm/svm/emulate.h +++ xen-redhat/include/asm-x86/hvm/svm/emulate.h @@ -89,14 +89,14 @@ extern unsigned int decode_src_reg(u8 pr extern unsigned long svm_rip2pointer(struct vcpu *v); extern int __get_instruction_length_from_list(struct vcpu *v, enum instruction_index *list, unsigned int list_count, - u8 *guest_eip_buf, enum instruction_index *match); + enum instruction_index *match); static inline int __get_instruction_length(struct vcpu *v, - enum instruction_index instr, u8 *guest_eip_buf) + enum instruction_index instr) { return __get_instruction_length_from_list( - v, &instr, 1, guest_eip_buf, NULL); + v, &instr, 1, NULL); } diff -Naurp xen/include/asm-x86/hvm/svm/svm.h xen-redhat/include/asm-x86/hvm/svm/svm.h --- xen/include/asm-x86/hvm/svm/svm.h +++ xen-redhat/include/asm-x86/hvm/svm/svm.h @@ -28,7 +28,7 @@ #include <asm/hvm/svm/vmcb.h> #include <asm/i387.h> -extern void svm_dump_vmcb(const char *from, struct vmcb_struct *vmcb); +void svm_dump_vmcb(const char *from, struct vmcb_struct *vmcb); #define SVM_REG_EAX (0) #define SVM_REG_ECX (1) @@ -47,4 +47,45 @@ extern void svm_dump_vmcb(const char *fr #define SVM_REG_R14 (14) #define SVM_REG_R15 (15) +static inline void svm_vmload(void *vmcb) +{ + asm volatile ( + ".byte 0x0f,0x01,0xda" /* vmload */ + : : "a" (__pa(vmcb)) : "memory" ); +} + +static inline void svm_vmsave(void *vmcb) +{ + asm volatile ( + ".byte 0x0f,0x01,0xdb" /* vmsave */ + : : "a" (__pa(vmcb)) : "memory" ); +} + +/* + * Need to re-inject a given event? We avoid re-injecting software exceptions + * and interrupts because the faulting/trapping instruction can simply be + * re-executed (neither VMX nor SVM update RIP when they VMEXIT during + * INT3/INTO/INTn). + */ +static inline int svm_event_needs_reinjection(uint8_t type, uint8_t vector) +{ + switch ( type ) + { + case EVENTTYPE_INTR: + case EVENTTYPE_NMI: + return 1; + case EVENTTYPE_EXCEPTION: + /* + * SVM uses type 3 ("HW Exception") for #OF and #BP. We explicitly + * check for these vectors, as they are really SW Exceptions. SVM has + * not updated RIP to point after the trapping instruction (INT3/INTO). + */ + return (vector != 3) && (vector != 4); + default: + /* Software exceptions/interrupts can be re-executed (e.g., INT n). 
*/ + break; + } + return 0; +} + #endif /* __ASM_X86_HVM_SVM_H__ */ diff -Naurp xen/include/asm-x86/hvm/vcpu.h xen-redhat/include/asm-x86/hvm/vcpu.h --- xen/include/asm-x86/hvm/vcpu.h +++ xen-redhat/include/asm-x86/hvm/vcpu.h @@ -30,12 +30,14 @@ struct hvm_vcpu { unsigned long hw_cr3; /* value we give to HW to use */ - unsigned long ioflags; struct hvm_io_op io_op; struct vlapic vlapic; s64 cache_tsc_offset; u64 guest_time; + /* Is an NMI pending for delivery to this VCPU core? */ + bool_t nmi_pending; /* NB. integrate flag with save/restore */ + /* Lock and list for virtual platform timers. */ spinlock_t tm_lock; struct list_head tm_list; @@ -52,6 +54,9 @@ struct hvm_vcpu { struct arch_vmx_struct vmx; struct arch_svm_struct svm; } u; + + /* In mode delay_for_missed_ticks, VCPUs have differing guest times. */ + int64_t stime_offset; }; #define ARCH_HVM_IO_WAIT 1 /* Waiting for I/O completion */ diff -Naurp xen/include/asm-x86/hvm/vlapic.h xen-redhat/include/asm-x86/hvm/vlapic.h --- xen/include/asm-x86/hvm/vlapic.h +++ xen-redhat/include/asm-x86/hvm/vlapic.h @@ -78,7 +78,7 @@ int vlapic_set_irq(struct vlapic *vlapic int vlapic_find_highest_irr(struct vlapic *vlapic); int vlapic_has_interrupt(struct vcpu *v); -int cpu_get_apic_interrupt(struct vcpu *v, int *mode); +int cpu_get_apic_interrupt(struct vcpu *v); int vlapic_init(struct vcpu *v); void vlapic_destroy(struct vcpu *v); @@ -89,12 +89,11 @@ void vlapic_msr_set(struct vlapic *vlapi int vlapic_accept_pic_intr(struct vcpu *v); +void vlapic_adjust_i8259_target(struct domain *d); + struct vlapic *apic_round_robin( struct domain *d, uint8_t vector, uint32_t bitmap); int vlapic_match_logical_addr(struct vlapic *vlapic, uint8_t mda); -int is_lvtt(struct vcpu *v, int vector); -int is_lvtt_enabled(struct vcpu *v); - #endif /* __ASM_X86_HVM_VLAPIC_H__ */ diff -Naurp xen/include/asm-x86/hvm/vmx/vmcs.h xen-redhat/include/asm-x86/hvm/vmx/vmcs.h --- xen/include/asm-x86/hvm/vmx/vmcs.h +++ xen-redhat/include/asm-x86/hvm/vmx/vmcs.h @@ -47,6 +47,9 @@ struct vmx_msr_state { unsigned long msrs[VMX_MSR_COUNT]; }; +#define EPT_DEFAULT_MT 6 +#define EPT_DEFAULT_GAW 3 + struct arch_vmx_struct { /* Virtual address of VMCS. */ struct vmcs_struct *vmcs; @@ -62,8 +65,21 @@ struct arch_vmx_struct { int active_cpu; int launched; + union { + struct { + u64 etmt :3, + gaw :3, + rsvd :6, + asr :52; + }; + u64 eptp; + } ept_control; + /* Cache of cpu execution control. */ u32 exec_control; + u32 secondary_exec_control; + + u16 vpid; /* If there is vector installed in the INTR_INFO_FIELD. 
*/ u32 vector_injected; @@ -101,6 +117,8 @@ void vmx_vmcs_exit(struct vcpu *v); #define CPU_BASED_MWAIT_EXITING 0x00000400 #define CPU_BASED_RDPMC_EXITING 0x00000800 #define CPU_BASED_RDTSC_EXITING 0x00001000 +#define CPU_BASED_CR3_LOAD_EXITING 0x00008000 +#define CPU_BASED_CR3_STORE_EXITING 0x00010000 #define CPU_BASED_CR8_LOAD_EXITING 0x00080000 #define CPU_BASED_CR8_STORE_EXITING 0x00100000 #define CPU_BASED_TPR_SHADOW 0x00200000 @@ -111,7 +129,7 @@ void vmx_vmcs_exit(struct vcpu *v); #define CPU_BASED_ACTIVATE_MSR_BITMAP 0x10000000 #define CPU_BASED_MONITOR_EXITING 0x20000000 #define CPU_BASED_PAUSE_EXITING 0x40000000 -#define ACTIVATE_SECONDARY_CONTROLS 0x80000000 +#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 extern u32 vmx_cpu_based_exec_control; #define PIN_BASED_EXT_INTR_MASK 0x00000001 @@ -121,25 +139,42 @@ extern u32 vmx_pin_based_exec_control; #define VM_EXIT_IA32E_MODE 0x00000200 #define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 +#define VM_EXIT_SAVE_GUEST_PAT 0x00040000 +#define VM_EXIT_LOAD_HOST_PAT 0x00080000 extern u32 vmx_vmexit_control; #define VM_ENTRY_IA32E_MODE 0x00000200 #define VM_ENTRY_SMM 0x00000400 #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 +#define VM_ENTRY_LOAD_GUEST_PAT 0x00004000 extern u32 vmx_vmentry_control; #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 +#define SECONDARY_EXEC_ENABLE_EPT 0x00000002 +#define SECONDARY_EXEC_ENABLE_VPID 0x00000020 + extern u32 vmx_secondary_exec_control; #define cpu_has_vmx_virtualize_apic_accesses \ (vmx_secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) #define cpu_has_vmx_tpr_shadow \ (vmx_cpu_based_exec_control & CPU_BASED_TPR_SHADOW) +#define cpu_has_vmx_vnmi \ + (vmx_pin_based_exec_control & PIN_BASED_VIRTUAL_NMIS) #define cpu_has_vmx_mmap_vtpr_optimization \ (cpu_has_vmx_virtualize_apic_accesses && cpu_has_vmx_tpr_shadow) #define cpu_has_vmx_msr_bitmap \ (vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_MSR_BITMAP) +#define cpu_has_vmx_secondary_exec_control \ + (vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) +#define cpu_has_vmx_ept \ + (vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) +#define cpu_has_vmx_pat \ + (vmx_vmentry_control & VM_ENTRY_LOAD_GUEST_PAT) +#define cpu_has_vmx_vpid \ + (vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) + extern char *vmx_msr_bitmap; /* GUEST_INTERRUPTIBILITY_INFO flags. */ @@ -150,6 +185,7 @@ extern char *vmx_msr_bitmap; /* VMCS field encodings. 
*/ enum vmcs_field { + VIRTUAL_PROCESSOR_ID = 0x00000000, GUEST_ES_SELECTOR = 0x00000800, GUEST_CS_SELECTOR = 0x00000802, GUEST_SS_SELECTOR = 0x00000804, @@ -183,10 +219,26 @@ enum vmcs_field { VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, APIC_ACCESS_ADDR = 0x00002014, APIC_ACCESS_ADDR_HIGH = 0x00002015, + EPT_POINTER = 0x0000201a, + EPT_POINTER_HIGH = 0x0000201b, + GUEST_PHYSICAL_ADDRESS = 0x00002400, + GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, VMCS_LINK_POINTER_HIGH = 0x00002801, GUEST_IA32_DEBUGCTL = 0x00002802, GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, + GUEST_PAT = 0x00002804, + GUEST_PAT_HIGH = 0x00002805, + GUEST_PDPTR0 = 0x0000280a, + GUEST_PDPTR0_HIGH = 0x0000280b, + GUEST_PDPTR1 = 0x0000280c, + GUEST_PDPTR1_HIGH = 0x0000280d, + GUEST_PDPTR2 = 0x0000280e, + GUEST_PDPTR2_HIGH = 0x0000280f, + GUEST_PDPTR3 = 0x00002810, + GUEST_PDPTR3_HIGH = 0x00002811, + HOST_PAT = 0x00002c00, + HOST_PAT_HIGH = 0x00002c01, PIN_BASED_VM_EXEC_CONTROL = 0x00004000, CPU_BASED_VM_EXEC_CONTROL = 0x00004002, EXCEPTION_BITMAP = 0x00004004, @@ -277,6 +329,8 @@ enum vmcs_field { HOST_RIP = 0x00006c16, }; +#define VMCS_VPID_WIDTH (16) + #endif /* ASM_X86_HVM_VMX_VMCS_H__ */ /* diff -Naurp xen/include/asm-x86/hvm/vmx/vmx.h xen-redhat/include/asm-x86/hvm/vmx/vmx.h --- xen/include/asm-x86/hvm/vmx/vmx.h +++ xen-redhat/include/asm-x86/hvm/vmx/vmx.h @@ -23,15 +23,37 @@ #include <asm/types.h> #include <asm/regs.h> #include <asm/processor.h> -#include <asm/hvm/vmx/vmcs.h> #include <asm/i387.h> +#include <asm/hvm/support.h> #include <asm/hvm/trace.h> +#include <asm/hvm/vmx/vmcs.h> +#include <asm/paging.h> +#include <asm/p2m.h> + +typedef union { + struct { + u64 r : 1, + w : 1, + x : 1, + emt : 3, + igmt : 1, + sp_avail : 1, + avail1 : 4, + mfn : 45, + rsvd : 5, + avail2 : 2; + }; + u64 epte; +} ept_entry_t; + +#define EPT_TABLE_ORDER 9 + +extern mfn_t ept_get_entry(struct domain *d, unsigned long gfn, p2m_type_t *t); void vmx_asm_vmexit_handler(struct cpu_user_regs); void vmx_asm_do_vmentry(void); void vmx_intr_assist(void); void vmx_do_resume(struct vcpu *); -void set_guest_time(struct vcpu *v, u64 gtime); extern struct page_info *change_guest_physmap_for_vtpr(struct domain *d, int enable_vtpr); @@ -85,6 +107,8 @@ extern struct page_info *change_guest_ph #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 #define EXIT_REASON_APIC_ACCESS 44 +#define EXIT_REASON_EPT_VIOLATION 48 +#define EXIT_REASON_EPT_MISCONFIG 49 /* * Interruption-information format @@ -92,7 +116,9 @@ extern struct page_info *change_guest_ph #define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ #define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ #define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */ +#define INTR_INFO_NMI_UNBLOCKED_BY_IRET 0x1000 /* 12 */ #define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ +#define INTR_INFO_RESVD_BITS_MASK 0x7ffff000 #define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ #define INTR_TYPE_NMI (2 << 8) /* NMI */ @@ -153,12 +179,15 @@ extern struct page_info *change_guest_ph #define VMREAD_OPCODE ".byte 0x0f,0x78\n" #define VMRESUME_OPCODE ".byte 0x0f,0x01,0xc3\n" #define VMWRITE_OPCODE ".byte 0x0f,0x79\n" +#define INVEPT_OPCODE ".byte 0x66,0x0f,0x38,0x80\n" /* m128,r64/32 */ +#define INVVPID_OPCODE ".byte 0x66,0x0f,0x38,0x81\n" /* m128,r64/32 */ #define VMXOFF_OPCODE ".byte 0x0f,0x01,0xc4\n" #define VMXON_OPCODE ".byte 0xf3,0x0f,0xc7\n" +#define MODRM_EAX_08 ".byte 0x08\n" /* ECX, [EAX] */ #define MODRM_EAX_06 ".byte 0x30\n" /* [EAX], with reg/opcode: /6 */ #define MODRM_EAX_07 ".byte 0x38\n" /* [EAX], with 
reg/opcode: /7 */ -#define MODRM_EAX_ECX ".byte 0xc1\n" /* [EAX], [ECX] */ +#define MODRM_EAX_ECX ".byte 0xc1\n" /* EAX, ECX */ static inline void __vmptrld(u64 addr) { @@ -242,6 +271,42 @@ static inline void __vm_clear_bit(unsign __vmwrite(field, __vmread(field) & ~(1UL << bit)); } +static inline void __invept(int ext, u64 eptp, u64 gpa) +{ + struct { + u64 eptp, gpa; + } operand = {eptp, gpa}; + + __asm__ __volatile__ ( INVEPT_OPCODE + MODRM_EAX_08 + /* CF==1 or ZF==1 --> rc = -1 */ + "ja 1f ; ud2 ; 1:\n" + : + : "a" (&operand), "c" (ext) + : "memory"); +} + +static inline void __invvpid(int ext, u16 vpid, u64 gva) +{ + struct { + u64 vpid:16; + u64 rsvd:48; + u64 gva; + } __attribute__ ((packed)) operand = {vpid, 0, gva}; + + /* Fix up #UD exceptions which occur when TLBs are flushed before VMXON. */ + asm volatile ( "1: " INVVPID_OPCODE MODRM_EAX_08 + /* CF==1 or ZF==1 --> crash (ud2) */ + "ja 2f ; ud2 ; 2:\n" + ".section __ex_table,\"a\"\n" + " "__FIXUP_ALIGN"\n" + " "__FIXUP_WORD" 1b,2b\n" + ".previous" + : + : "a" (&operand), "c" (ext) + : "memory"); +} + static inline void __vmxoff (void) { __asm__ __volatile__ ( VMXOFF_OPCODE @@ -263,8 +328,37 @@ static inline int __vmxon (u64 addr) return rc; } -static inline void __vmx_inject_exception(struct vcpu *v, int trap, int type, - int error_code, int ilen) +static inline void ept_sync_all(void) +{ + if ( !hap_enabled(current->domain) ) + return; + + __invept(2, 0, 0); +} + +void ept_sync_domain(struct domain *d); + +static inline void vpid_sync_vcpu_gva(struct vcpu *v, unsigned long gva) +{ + if ( cpu_has_vmx_vpid ) + __invvpid(0, v->arch.hvm_vmx.vpid, (u64)gva); +} + +static inline void vpid_sync_vcpu_all(struct vcpu *v) +{ + if ( cpu_has_vmx_vpid ) + __invvpid(1, v->arch.hvm_vmx.vpid, 0); +} + +static inline void vpid_sync_all(void) +{ + if ( cpu_has_vmx_vpid ) + __invvpid(2, 0, 0); +} + + +static inline void __vmx_inject_exception( + struct vcpu *v, int trap, int type, int error_code) { unsigned long intr_fields; @@ -282,9 +376,6 @@ static inline void __vmx_inject_exceptio intr_fields |= INTR_INFO_DELIVER_CODE_MASK; } - if ( ilen ) - __vmwrite(VM_ENTRY_INSTRUCTION_LEN, ilen); - __vmwrite(VM_ENTRY_INTR_INFO_FIELD, intr_fields); if (trap == TRAP_page_fault) @@ -297,21 +388,91 @@ static inline void vmx_inject_hw_excepti struct vcpu *v, int trap, int error_code) { v->arch.hvm_vmx.vector_injected = 1; - __vmx_inject_exception(v, trap, INTR_TYPE_HW_EXCEPTION, error_code, 0); + __vmx_inject_exception(v, trap, INTR_TYPE_HW_EXCEPTION, error_code); } -static inline void vmx_inject_sw_exception( - struct vcpu *v, int trap, int instruction_len) +static inline void vmx_inject_extint(struct vcpu *v, int trap) { - v->arch.hvm_vmx.vector_injected = 1; - __vmx_inject_exception(v, trap, INTR_TYPE_SW_EXCEPTION, - VMX_DELIVER_NO_ERROR_CODE, - instruction_len); + __vmx_inject_exception(v, trap, INTR_TYPE_EXT_INTR, + VMX_DELIVER_NO_ERROR_CODE); } -static inline void vmx_inject_extint(struct vcpu *v, int trap, int error_code) +static inline void vmx_inject_nmi(struct vcpu *v) +{ + __vmx_inject_exception(v, 2, INTR_TYPE_NMI, + VMX_DELIVER_NO_ERROR_CODE); +} + +void ept_p2m_init(struct domain *d); + +/* EPT violation qualifications definitions */ +/* bit offset 0 in exit qualification */ +#define _EPT_READ_VIOLATION 0 +#define EPT_READ_VIOLATION (1UL<<_EPT_READ_VIOLATION) +/* bit offset 1 in exit qualification */ +#define _EPT_WRITE_VIOLATION 1 +#define EPT_WRITE_VIOLATION (1UL<<_EPT_WRITE_VIOLATION) +/* bit offset 2 in exit qualification */ +#define 
_EPT_EXEC_VIOLATION 2 +#define EPT_EXEC_VIOLATION (1UL<<_EPT_EXEC_VIOLATION) + +/* bit offset 3 in exit qualification */ +#define _EPT_EFFECTIVE_READ 3 +#define EPT_EFFECTIVE_READ (1UL<<_EPT_EFFECTIVE_READ) +/* bit offset 4 in exit qualification */ +#define _EPT_EFFECTIVE_WRITE 4 +#define EPT_EFFECTIVE_WRITE (1UL<<_EPT_EFFECTIVE_WRITE) +/* bit offset 5 in exit qualification */ +#define _EPT_EFFECTIVE_EXEC 5 +#define EPT_EFFECTIVE_EXEC (1UL<<_EPT_EFFECTIVE_EXEC) + +/* bit offset 6 in exit qualification */ +#define _EPT_GAW_VIOLATION 6 +#define EPT_GAW_VIOLATION (1UL<<_EPT_GAW_VIOLATION) + +/* bits offset 7 & 8 in exit qualification */ +#define _EPT_GLA_VALIDITY 7 +#define EPT_GLA_VALIDITY_MASK (3UL<<_EPT_GLA_VALIDITY) +/* gla != gpa, when load PDPTR */ +#define EPT_GLA_VALIDITY_PDPTR_LOAD (0UL<<_EPT_GLA_VALIDITY) +/* gla != gpa, during guest page table walking */ +#define EPT_GLA_VALIDITY_GPT_WALK (1UL<<_EPT_GLA_VALIDITY) +/* reserved */ +#define EPT_GLA_VALIDITY_RSVD (2UL<<_EPT_GLA_VALIDITY) +/* gla == gpa, normal case */ +#define EPT_GLA_VALIDITY_MATCH (3UL<<_EPT_GLA_VALIDITY) + +#define EPT_EFFECTIVE_MASK (EPT_EFFECTIVE_READ | \ + EPT_EFFECTIVE_WRITE | \ + EPT_EFFECTIVE_EXEC) + +#define EPT_PAGETABLE_ENTRIES 512 + +/* + * Need to re-inject a given event? We avoid re-injecting software exceptions + * and interrupts because the faulting/trapping instruction can simply be + * re-executed (neither VMX nor SVM update RIP when they VMEXIT during + * INT3/INTO/INTn). + */ +static inline int vmx_event_needs_reinjection(uint8_t type, uint8_t vector) { - __vmx_inject_exception(v, trap, INTR_TYPE_EXT_INTR, error_code, 0); + switch ( type << 8 ) + { + case INTR_TYPE_EXT_INTR: + case INTR_TYPE_NMI: + return 1; + case INTR_TYPE_HW_EXCEPTION: + /* + * SVM uses type 3 ("HW Exception") for #OF and #BP. We explicitly + * check for these vectors, as they are really SW Exceptions. SVM has + * not updated RIP to point after the trapping instruction (INT3/INTO). + */ + return (vector != 3) && (vector != 4); + default: + /* Software exceptions/interrupts can be re-executed (e.g., INT n). 
*/ + break; + } + return 0; } #endif /* __ASM_X86_HVM_VMX_VMX_H__ */ diff -Naurp xen/include/asm-x86/hvm/vpic.h xen-redhat/include/asm-x86/hvm/vpic.h --- xen/include/asm-x86/hvm/vpic.h +++ xen-redhat/include/asm-x86/hvm/vpic.h @@ -32,7 +32,6 @@ void vpic_irq_positive_edge(struct domain *d, int irq); void vpic_irq_negative_edge(struct domain *d, int irq); void vpic_init(struct domain *d); -int cpu_get_pic_interrupt(struct vcpu *v, int *type); -int is_periodic_irq(struct vcpu *v, int irq, int type); +int cpu_get_pic_interrupt(struct vcpu *v); #endif /* __ASM_X86_HVM_VPIC_H__ */ diff -Naurp xen/include/asm-x86/hvm/vpt.h xen-redhat/include/asm-x86/hvm/vpt.h --- xen/include/asm-x86/hvm/vpt.h +++ xen-redhat/include/asm-x86/hvm/vpt.h @@ -29,6 +29,7 @@ #include <xen/timer.h> #include <xen/list.h> #include <asm/hvm/vpic.h> +#include <asm/hvm/irq.h> #include <public/hvm/save.h> struct HPETState; @@ -39,8 +40,9 @@ struct HPET_timer_fn_info { typedef struct HPETState { struct hvm_hw_hpet hpet; - struct vcpu *vcpu; uint64_t tsc_freq; + uint64_t hpet_to_ns_scale; /* hpet ticks to ns (multiplied by 2^10) */ + uint64_t hpet_to_ns_limit; /* max hpet ticks convertable to ns */ uint64_t mc_offset; struct timer timers[HPET_TIMER_NUM]; struct HPET_timer_fn_info timer_fn_info[HPET_TIMER_NUM]; @@ -55,11 +57,16 @@ typedef void time_cb(struct vcpu *v, voi struct periodic_time { struct list_head list; - char enabled; - char one_shot; /* one shot time */ + bool_t on_list; + bool_t one_shot; + bool_t do_not_freeze; + bool_t irq_issued; +#define PTSRC_isa 1 /* ISA time source */ +#define PTSRC_lapic 2 /* LAPIC time source */ + u8 source; /* PTSRC_ */ u8 irq; struct vcpu *vcpu; /* vcpu timer interrupt delivers to */ - u32 pending_intr_nr; /* the couner for pending timer interrupts */ + u32 pending_intr_nr; /* pending timer interrupts */ u64 period; /* frequency in ns */ u64 period_cycles; /* frequency in cpu cycles */ s_time_t scheduled; /* scheduled timer interrupt */ @@ -102,6 +109,7 @@ typedef struct PMTState { struct hvm_hw_pmtimer pm; /* 32bit timer value */ struct vcpu *vcpu; /* Keeps sync with this vcpu's guest-time */ uint64_t last_gtime; /* Last (guest) time we updated the timer */ + uint32_t not_accounted; /* time not accounted at last update */ uint64_t scale; /* Multiplier to get from tsc to timer ticks */ struct timer timer; /* To make sure we send SCIs */ spinlock_t lock; @@ -112,16 +120,38 @@ struct pl_time { /* platform time */ struct RTCState vrtc; struct HPETState vhpet; struct PMTState vpmt; + /* guest_time = Xen sys time + stime_offset */ + int64_t stime_offset; + /* Ensures monotonicity in appropriate timer modes. */ + uint64_t last_guest_time; + spinlock_t pl_time_lock; }; #define ticks_per_sec(v) (v->domain->arch.hvm_domain.tsc_frequency) -void pt_freeze_time(struct vcpu *v); -void pt_thaw_time(struct vcpu *v); +void pt_save_timer(struct vcpu *v); +void pt_restore_timer(struct vcpu *v); void pt_update_irq(struct vcpu *v); -void pt_intr_post(struct vcpu *v, int vector, int type); +void pt_intr_post(struct vcpu *v, int vector, enum hvm_intack src); void pt_reset(struct vcpu *v); void pt_migrate(struct vcpu *v); + +void pt_adjust_global_vcpu_target(struct vcpu *v); +#define pt_global_vcpu_target(d) \ + ((d)->arch.hvm_domain.i8259_target ? : (d)->vcpu ? (d)->vcpu[0] : NULL) + +/* Is given periodic timer active? */ +#define pt_active(pt) ((pt)->on_list) + +/* + * Create/destroy a periodic (or one-shot!) timer. 
+ * The given periodic timer structure must be initialised with zero bytes, + * except for the 'source' field which must be initialised with the + * correct PTSRC_ value. The initialised timer structure can then be passed + * to {create,destroy}_periodic_time() any number of times and in any order. + * Note that, for a given periodic timer, invocations of these functions MUST + * be serialised. + */ void create_periodic_time( struct vcpu *v, struct periodic_time *pt, uint64_t period, uint8_t irq, char one_shot, time_cb *cb, void *data); @@ -134,7 +164,6 @@ void pit_deinit(struct domain *d); void rtc_init(struct vcpu *v, int base); void rtc_migrate_timers(struct vcpu *v); void rtc_deinit(struct domain *d); -int is_rtc_periodic_irq(void *opaque); void pmtimer_init(struct vcpu *v); void pmtimer_deinit(struct domain *d); diff -Naurp xen/include/asm-x86/io_apic.h xen-redhat/include/asm-x86/io_apic.h --- xen/include/asm-x86/io_apic.h +++ xen-redhat/include/asm-x86/io_apic.h @@ -2,9 +2,11 @@ #define __ASM_IO_APIC_H #include <xen/config.h> -#include <asm/fixmap.h> #include <asm/types.h> #include <asm/mpspec.h> +#include <asm/apicdef.h> +#include <asm/fixmap.h> +#include <xen/iommu.h> /* * Intel IO-APIC support for SMP and UP systems. @@ -18,6 +20,8 @@ ((volatile int *)(__fix_to_virt(FIX_IO_APIC_BASE_0 + idx) \ + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK))) +#define IO_APIC_ID(idx) (mp_ioapics[idx].mpc_apicid) + /* * The structure of the IO-APIC: */ @@ -121,18 +125,35 @@ extern struct mpc_config_intsrc mp_irqs[ /* non-0 if default (table-less) MP configuration */ extern int mpc_default_type; -static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) +/* Only need to remap ioapic RTE (reg: 10~3Fh) */ +#define ioapic_reg_remapped(reg) (iommu_enabled && ((reg) >= 0x10)) + +static inline unsigned int __io_apic_read(unsigned int apic, unsigned int reg) { *IO_APIC_BASE(apic) = reg; return *(IO_APIC_BASE(apic)+4); } -static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) +{ + if (ioapic_reg_remapped(reg)) + return io_apic_read_remap_rte(apic, reg); + return __io_apic_read(apic, reg); +} + +static inline void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) { *IO_APIC_BASE(apic) = reg; *(IO_APIC_BASE(apic)+4) = value; } +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +{ + if (ioapic_reg_remapped(reg)) + return iommu_update_ire_from_apic(apic, reg, value); + __io_apic_write(apic, reg, value); +} + /* * Re-write a value: to be used for read-modify-write * cycles where the read already set up the index register. 
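As a usage illustration (a minimal sketch, not part of the patch; the helper below is hypothetical), a read-modify-write of an IO-APIC redirection entry simply goes through the io_apic_read()/io_apic_write() wrappers above. When iommu_enabled is set, any access to an RTE register (offset 0x10 and above) is routed through io_apic_read_remap_rte()/iommu_update_ire_from_apic() instead of touching the physical IO-APIC directly:

/* Hypothetical example: set the mask bit of redirection entry 'pin'.
 * Each RTE occupies two 32-bit registers starting at offset 0x10;
 * bit 16 of the low dword is the mask bit. */
static void ioapic_mask_pin(unsigned int apic, unsigned int pin)
{
    unsigned int reg = 0x10 + 2 * pin;
    unsigned int low = io_apic_read(apic, reg);   /* may take the remap path */
    io_apic_write(apic, reg, low | (1u << 16));   /* likewise on write-back */
}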
@@ -146,6 +167,8 @@ extern int sis_apic_bug; #endif static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) { + if (ioapic_reg_remapped(reg)) + return iommu_update_ire_from_apic(apic, reg, value); if (sis_apic_bug) *IO_APIC_BASE(apic) = reg; *(IO_APIC_BASE(apic)+4) = value; @@ -179,5 +202,6 @@ static inline int ioapic_resume(void) {r #endif extern int assign_irq_vector(int irq); +extern int free_irq_vector(int vector); #endif diff -Naurp xen/include/asm-x86/irq.h xen-redhat/include/asm-x86/irq.h --- xen/include/asm-x86/irq.h +++ xen-redhat/include/asm-x86/irq.h @@ -20,6 +20,8 @@ extern int vector_irq[NR_VECTORS]; extern u8 irq_vector[NR_IRQ_VECTORS]; #define AUTO_ASSIGN -1 +#define NEVER_ASSIGN -2 +#define FREE_TO_ASSIGN -3 #define platform_legacy_irq(irq) ((irq) < 16) @@ -48,7 +50,22 @@ extern unsigned long io_apic_irqs; extern atomic_t irq_err_count; extern atomic_t irq_mis_count; -int pirq_acktype(int irq); -int pirq_shared(int irq); +int pirq_acktype(struct domain *d, int irq); +int pirq_shared(struct domain *d , int irq); + +int map_domain_pirq(struct domain *d, int pirq, int vector, int type, + void *data); +int unmap_domain_pirq(struct domain *d, int pirq); +int get_free_pirq(struct domain *d, int type, int index); +void free_domain_pirqs(struct domain *d); + +#define domain_irq_to_vector(d, irq) ((d)->arch.pirq_vector[irq] ?: \ + IO_APIC_IRQ(irq) ? 0 : LEGACY_VECTOR(irq)) +#define domain_vector_to_irq(d, vec) ((d)->arch.vector_pirq[vec] ?: \ + ((vec) < FIRST_LEGACY_VECTOR || \ + (vec) > LAST_LEGACY_VECTOR) ? \ + 0 : LEGACY_IRQ_FROM_VECTOR(vec)) + +int pirq_guest_force_unbind(struct domain *d, int irq); #endif /* _ASM_HW_IRQ_H */ diff -Naurp xen/include/asm-x86/ldt.h xen-redhat/include/asm-x86/ldt.h --- xen/include/asm-x86/ldt.h +++ xen-redhat/include/asm-x86/ldt.h @@ -6,7 +6,6 @@ static inline void load_LDT(struct vcpu *v) { - unsigned int cpu; struct desc_struct *desc; unsigned long ents; @@ -16,11 +15,11 @@ static inline void load_LDT(struct vcpu } else { - cpu = smp_processor_id(); - desc = (!is_pv_32on64_vcpu(v) ? gdt_table : compat_gdt_table) - + __LDT(cpu) - FIRST_RESERVED_GDT_ENTRY; + desc = (!is_pv_32on64_vcpu(v) + ? this_cpu(gdt_table) : this_cpu(compat_gdt_table)) + + LDT_ENTRY - FIRST_RESERVED_GDT_ENTRY; _set_tssldt_desc(desc, LDT_VIRT_START(v), ents*8-1, 2); - __asm__ __volatile__ ( "lldt %%ax" : : "a" (__LDT(cpu)<<3) ); + __asm__ __volatile__ ( "lldt %%ax" : : "a" (LDT_ENTRY << 3) ); } } diff -Naurp xen/include/asm-x86/mm.h xen-redhat/include/asm-x86/mm.h --- xen/include/asm-x86/mm.h +++ xen-redhat/include/asm-x86/mm.h @@ -55,6 +55,41 @@ struct page_info u32 tlbflush_timestamp; /* + * When PGT_partial is true then this field is valid and indicates + * that PTEs in the range [0, @nr_validated_ptes) have been validated. + * An extra page reference must be acquired (or not dropped) whenever + * PGT_partial gets set, and it must be dropped when the flag gets + * cleared. This is so that a get() leaving a page in partially + * validated state (where the caller would drop the reference acquired + * due to the getting of the type [apparently] failing [-EAGAIN]) + * would not accidentally result in a page left with zero general + * reference count, but non-zero type reference count (possible when + * the partial get() is followed immediately by domain destruction). + * Likewise, the ownership of the single type reference for partially + * (in-)validated pages is tied to this flag, i.e. 
the instance + * setting the flag must not drop that reference, whereas the instance + * clearing it will have to. + * + * If @partial_pte is positive then PTE at @nr_validated_ptes+1 has + * been partially validated. This implies that the general reference + * to the page (acquired from get_page_from_lNe()) would be dropped + * (again due to the apparent failure) and hence must be re-acquired + * when resuming the validation, but must not be dropped when picking + * up the page for invalidation. + * + * If @partial_pte is negative then PTE at @nr_validated_ptes+1 has + * been partially invalidated. This is basically the opposite case of + * above, i.e. the general reference to the page was not dropped in + * put_page_from_lNe() (due to the apparent failure), and hence it + * must be dropped when the put operation is resumed (and completes), + * but it must not be acquired if picking up the page for validation. + */ + struct { + u16 nr_validated_ptes; + s8 partial_pte; + }; + + /* * Guest pages with a shadow. This does not conflict with * tlbflush_timestamp since page table pages are explicitly not * tracked for TLB-flush avoidance when a guest runs in shadow mode. @@ -83,9 +118,12 @@ struct page_info /* PAE only: is this an L2 page directory containing Xen-private mappings? */ #define _PGT_pae_xen_l2 26 #define PGT_pae_xen_l2 (1U<<_PGT_pae_xen_l2) +/* Has this page been *partially* validated for use as its current type? */ +#define _PGT_partial 25 +#define PGT_partial (1U<<_PGT_partial) - /* 16-bit count of uses of this frame as its current type. */ -#define PGT_count_mask ((1U<<16)-1) + /* 25-bit count of uses of this frame as its current type. */ +#define PGT_count_mask ((1U<<25)-1) /* Cleared when the owning guest 'frees' this page. */ #define _PGC_allocated 31 @@ -144,8 +182,9 @@ extern unsigned long max_page; extern unsigned long total_pages; void init_frametable(void); -int alloc_page_type(struct page_info *page, unsigned long type); -void free_page_type(struct page_info *page, unsigned long type); +int alloc_page_type(struct page_info *page, unsigned long type, int preemptible); +int free_page_type(struct page_info *page, unsigned long type, + int preemptible); int _shadow_mode_refcounts(struct domain *d); static inline void put_page(struct page_info *page) @@ -199,6 +238,8 @@ static inline int get_page(struct page_i void put_page_type(struct page_info *page); int get_page_type(struct page_info *page, unsigned long type); +int put_page_type_preemptible(struct page_info *page); +int get_page_type_preemptible(struct page_info *page, unsigned long type); int get_page_from_l1e(l1_pgentry_t l1e, struct domain *d); void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d); @@ -208,6 +249,19 @@ static inline void put_page_and_type(str put_page(page); } +static inline int put_page_and_type_preemptible(struct page_info *page, + int preemptible) +{ + int rc = 0; + + if ( preemptible ) + rc = put_page_type_preemptible(page); + else + put_page_type(page); + if ( likely(rc == 0) ) + put_page(page); + return rc; +} static inline int get_page_and_type(struct page_info *page, struct domain *domain, @@ -394,12 +448,16 @@ int map_ldt_shadow_page(unsigned int); #ifdef CONFIG_COMPAT int setup_arg_xlat_area(struct vcpu *, l4_pgentry_t *); +void domain_set_alloc_bitsize(struct domain *d); unsigned int domain_clamp_alloc_bitsize(struct domain *d, unsigned int bits); #else # define setup_arg_xlat_area(vcpu, l4tab) 0 +# define domain_set_alloc_bitsize(d) ((void)0) # define domain_clamp_alloc_bitsize(d, b) 
(b) #endif unsigned long domain_get_maximum_gpfn(struct domain *d); +extern struct domain *dom_xen, *dom_io; /* for vmcoreinfo */ + #endif /* __ASM_X86_MM_H__ */ diff -Naurp xen/include/asm-x86/msi.h xen-redhat/include/asm-x86/msi.h --- xen/include/asm-x86/msi.h +++ xen-redhat/include/asm-x86/msi.h @@ -0,0 +1,227 @@ +#ifndef __ASM_MSI_H +#define __ASM_MSI_H + +#include <xen/cpumask.h> +#include <asm/irq.h> +/* + * Constants for Intel APIC based MSI messages. + */ + +/* + * Shifts for MSI data + */ + +#define MSI_DATA_VECTOR_SHIFT 0 +#define MSI_DATA_VECTOR_MASK 0x000000ff +#define MSI_DATA_VECTOR(v) (((v) << MSI_DATA_VECTOR_SHIFT) & MSI_DATA_VECTOR_MASK) + +#define MSI_DATA_DELIVERY_MODE_SHIFT 8 +#define MSI_DATA_DELIVERY_FIXED (0 << MSI_DATA_DELIVERY_MODE_SHIFT) +#define MSI_DATA_DELIVERY_LOWPRI (1 << MSI_DATA_DELIVERY_MODE_SHIFT) + +#define MSI_DATA_LEVEL_SHIFT 14 +#define MSI_DATA_LEVEL_DEASSERT (0 << MSI_DATA_LEVEL_SHIFT) +#define MSI_DATA_LEVEL_ASSERT (1 << MSI_DATA_LEVEL_SHIFT) + +#define MSI_DATA_TRIGGER_SHIFT 15 +#define MSI_DATA_TRIGGER_EDGE (0 << MSI_DATA_TRIGGER_SHIFT) +#define MSI_DATA_TRIGGER_LEVEL (1 << MSI_DATA_TRIGGER_SHIFT) + +/* + * Shift/mask fields for msi address + */ + +#define MSI_ADDR_BASE_HI 0 +#define MSI_ADDR_BASE_LO 0xfee00000 +#define MSI_ADDR_HEADER MSI_ADDR_BASE_LO + +#define MSI_ADDR_DESTMODE_SHIFT 2 +#define MSI_ADDR_DESTMODE_PHYS (0 << MSI_ADDR_DESTMODE_SHIFT) +#define MSI_ADDR_DESTMODE_LOGIC (1 << MSI_ADDR_DESTMODE_SHIFT) + +#define MSI_ADDR_REDIRECTION_SHIFT 3 +#define MSI_ADDR_REDIRECTION_CPU (0 << MSI_ADDR_REDIRECTION_SHIFT) +#define MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT) + +#define MSI_ADDR_DEST_ID_SHIFT 12 +#define MSI_ADDR_DEST_ID_MASK 0x00ffff0 +#define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & MSI_ADDR_DEST_ID_MASK) + +/* MAX fixed pages reserved for mapping MSIX tables. */ +#if defined(__x86_64__) +#define FIX_MSIX_MAX_PAGES 512 +#else +#define FIX_MSIX_MAX_PAGES 32 +#endif + +struct msi_info { + int bus; + int devfn; + int vector; + int entry_nr; + uint64_t table_base; +}; + +struct msi_msg { + u32 address_lo; /* low 32 bits of msi message address */ + u32 address_hi; /* high 32 bits of msi message address */ + u32 data; /* 16 bits of msi message data */ +}; + +struct msi_desc; +/* Helper functions */ +extern void mask_msi_vector(unsigned int vector); +extern void unmask_msi_vector(unsigned int vector); +extern void set_msi_affinity(unsigned int vector, cpumask_t mask); +extern int pci_enable_msi(struct msi_info *msi, struct msi_desc **desc); +extern void pci_disable_msi(struct msi_desc *desc); +extern void pci_cleanup_msi(struct pci_dev *pdev); +extern int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc); +extern void teardown_msi_vector(int vector); +extern int msi_free_vector(struct msi_desc *entry); +extern int pci_restore_msi_state(struct pci_dev *pdev); + +extern unsigned int pci_msix_get_table_len(struct pci_dev *pdev); + +struct msi_desc { + struct { + __u8 type : 5; /* {0: unused, 5h:MSI, 11h:MSI-X} */ + __u8 maskbit : 1; /* mask-pending bit supported ? 
*/ + __u8 masked : 1; + __u8 is_64 : 1; /* Address size: 0=32bit 1=64bit */ + __u8 pos; /* Location of the msi capability */ + __u16 entry_nr; /* specific enabled entry */ + }msi_attrib; + + struct list_head list; + + void __iomem *mask_base; /* va for the entry in mask table */ + struct pci_dev *dev; + int vector; + + struct msi_msg msg; /* Last set MSI message */ + + int remap_index; /* index in interrupt remapping table */ +}; + +int msi_maskable_irq(const struct msi_desc *); + +/* + * Assume the maximum number of hot plug slots supported by the system is about + * ten. The worstcase is that each of these slots is hot-added with a device, + * which has two MSI/MSI-X capable functions. To avoid any MSI-X driver, which + * attempts to request all available vectors, NR_HP_RESERVED_VECTORS is defined + * as below to ensure at least one message is assigned to each detected MSI/ + * MSI-X device function. + */ +#define NR_HP_RESERVED_VECTORS 20 + +extern struct hw_interrupt_type pci_msi_type; + +/* + * MSI-X Address Register + */ +#define PCI_MSIX_FLAGS_QSIZE 0x7FF +#define PCI_MSIX_FLAGS_ENABLE (1 << 15) +#define PCI_MSIX_FLAGS_BIRMASK (7 << 0) +#define PCI_MSIX_FLAGS_BITMASK (1 << 0) + +#define PCI_MSIX_ENTRY_SIZE 16 +#define PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET 0 +#define PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET 4 +#define PCI_MSIX_ENTRY_DATA_OFFSET 8 +#define PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET 12 + +#define msi_control_reg(base) (base + PCI_MSI_FLAGS) +#define msi_lower_address_reg(base) (base + PCI_MSI_ADDRESS_LO) +#define msi_upper_address_reg(base) (base + PCI_MSI_ADDRESS_HI) +#define msi_data_reg(base, is64bit) \ + ( (is64bit == 1) ? base+PCI_MSI_DATA_64 : base+PCI_MSI_DATA_32 ) +#define msi_mask_bits_reg(base, is64bit) \ + ( (is64bit == 1) ? base+PCI_MSI_MASK_BIT : base+PCI_MSI_MASK_BIT-4) +#define msi_disable(control) control &= ~PCI_MSI_FLAGS_ENABLE +#define multi_msi_capable(control) \ + (1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1)) +#define multi_msi_enable(control, num) \ + control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE); +#define is_64bit_address(control) (!!(control & PCI_MSI_FLAGS_64BIT)) +#define is_mask_bit_support(control) (!!(control & PCI_MSI_FLAGS_MASKBIT)) +#define msi_enable(control, num) multi_msi_enable(control, num); \ + control |= PCI_MSI_FLAGS_ENABLE + +#define msix_control_reg(base) (base + PCI_MSIX_FLAGS) +#define msix_table_offset_reg(base) (base + 0x04) +#define msix_pba_offset_reg(base) (base + 0x08) +#define msix_enable(control) control |= PCI_MSIX_FLAGS_ENABLE +#define msix_disable(control) control &= ~PCI_MSIX_FLAGS_ENABLE +#define msix_table_size(control) ((control & PCI_MSIX_FLAGS_QSIZE)+1) +#define multi_msix_capable msix_table_size +#define msix_unmask(address) (address & ~PCI_MSIX_FLAGS_BITMASK) +#define msix_mask(address) (address | PCI_MSIX_FLAGS_BITMASK) +#define msix_is_pending(address) (address & PCI_MSIX_FLAGS_PENDMASK) + +/* + * MSI Defined Data Structures + */ +#define MSI_ADDRESS_HEADER 0xfee +#define MSI_ADDRESS_HEADER_SHIFT 12 +#define MSI_ADDRESS_HEADER_MASK 0xfff000 +#define MSI_ADDRESS_DEST_ID_MASK 0xfff0000f +#define MSI_TARGET_CPU_MASK 0xff +#define MSI_TARGET_CPU_SHIFT 12 +#define MSI_DELIVERY_MODE 0 +#define MSI_LEVEL_MODE 1 /* Edge always assert */ +#define MSI_TRIGGER_MODE 0 /* MSI is edge sensitive */ +#define MSI_PHYSICAL_MODE 0 +#define MSI_LOGICAL_MODE 1 +#define MSI_REDIRECTION_HINT_MODE 0 + +#define __LITTLE_ENDIAN_BITFIELD 1 + +struct msg_data { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u32 vector : 8; + __u32 delivery_mode : 3; 
/* 000b: FIXED | 001b: lowest prior */ + __u32 reserved_1 : 3; + __u32 level : 1; /* 0: deassert | 1: assert */ + __u32 trigger : 1; /* 0: edge | 1: level */ + __u32 reserved_2 : 16; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u32 reserved_2 : 16; + __u32 trigger : 1; /* 0: edge | 1: level */ + __u32 level : 1; /* 0: deassert | 1: assert */ + __u32 reserved_1 : 3; + __u32 delivery_mode : 3; /* 000b: FIXED | 001b: lowest prior */ + __u32 vector : 8; +#else +#error "Bitfield endianness not defined! Check your byteorder.h" +#endif +} __attribute__ ((packed)); + +struct msg_address { + union { + struct { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u32 reserved_1 : 2; + __u32 dest_mode : 1; /*0:physic | 1:logic */ + __u32 redirection_hint: 1; /*0: dedicated CPU + 1: lowest priority */ + __u32 reserved_2 : 4; + __u32 dest_id : 24; /* Destination ID */ +#elif defined(__BIG_ENDIAN_BITFIELD) + __u32 dest_id : 24; /* Destination ID */ + __u32 reserved_2 : 4; + __u32 redirection_hint: 1; /*0: dedicated CPU + 1: lowest priority */ + __u32 dest_mode : 1; /*0:physic | 1:logic */ + __u32 reserved_1 : 2; +#else +#error "Bitfield endianness not defined! Check your byteorder.h" +#endif + }u; + __u32 value; + }lo_address; + __u32 hi_address; +} __attribute__ ((packed)); + +#endif /* __ASM_MSI_H */ diff -Naurp xen/include/asm-x86/msr.h xen-redhat/include/asm-x86/msr.h --- xen/include/asm-x86/msr.h +++ xen-redhat/include/asm-x86/msr.h @@ -121,6 +121,7 @@ static inline void wrmsrl(unsigned int m #define MSR_IA32_VMX_CR4_FIXED0 0x488 #define MSR_IA32_VMX_CR4_FIXED1 0x489 #define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b +#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x48e #define IA32_FEATURE_CONTROL_MSR 0x3a #define IA32_FEATURE_CONTROL_MSR_LOCK 0x1 #define IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON 0x4 @@ -149,16 +150,14 @@ static inline void wrmsrl(unsigned int m #ifndef __ASSEMBLY__ -DECLARE_PER_CPU(__u64, efer); +DECLARE_PER_CPU(u64, efer); -static inline __u64 read_efer(void) +static inline u64 read_efer(void) { - if (!this_cpu(efer)) - rdmsrl(MSR_EFER, this_cpu(efer)); return this_cpu(efer); } -static inline void write_efer(__u64 val) +static inline void write_efer(u64 val) { this_cpu(efer) = val; wrmsrl(MSR_EFER, val); @@ -199,6 +198,9 @@ static inline void write_efer(__u64 val) #define MSR_IA32_PERF_STATUS 0x198 #define MSR_IA32_PERF_CTL 0x199 +#define MSR_IA32_MPERF 0x000000e7 +#define MSR_IA32_APERF 0x000000e8 + #define MSR_IA32_THERM_CONTROL 0x19a #define MSR_IA32_THERM_INTERRUPT 0x19b #define MSR_IA32_THERM_STATUS 0x19c @@ -207,6 +209,8 @@ static inline void write_efer(__u64 val) #define MSR_IA32_MISC_ENABLE_PERF_AVAIL (1<<7) #define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL (1<<11) #define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL (1<<12) +#define MSR_IA32_MISC_ENABLE_MONITOR_ENABLE (1<<18) +#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1<<23) #define MSR_IA32_DEBUGCTLMSR 0x1d9 #define MSR_IA32_LASTBRANCHFROMIP 0x1db @@ -214,6 +218,8 @@ static inline void write_efer(__u64 val) #define MSR_IA32_LASTINTFROMIP 0x1dd #define MSR_IA32_LASTINTTOIP 0x1de +#define MSR_IA32_CR_PAT 0x00000277 + #define MSR_IA32_MC0_CTL 0x400 #define MSR_IA32_MC0_STATUS 0x401 #define MSR_IA32_MC0_ADDR 0x402 @@ -349,6 +355,18 @@ static inline void write_efer(__u64 val) #define MSR_K7_CLK_CTL 0xC001001b #define MSR_K7_FID_VID_CTL 0xC0010041 #define MSR_K7_FID_VID_STATUS 0xC0010042 +#define MSR_K8_PSTATE_LIMIT 0xc0010061 +#define MSR_K8_PSTATE_CTRL 0xc0010062 +#define MSR_K8_PSTATE_STATUS 0xc0010063 +#define MSR_K8_PSTATE0 0xc0010064 +#define MSR_K8_PSTATE1 0xc0010065 
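The MSR_K8_PSTATE_* registers being added in this hunk are what the cpufreq (powernow) code drives; as a rough, illustrative sketch (not part of the patch, and the bit layout assumed here comes from the AMD BKDG rather than from this header), the current P-state is read from MSR_K8_PSTATE_STATUS and a transition is requested by writing the target index to MSR_K8_PSTATE_CTRL:

/* Illustrative only: assumes CurPstate lives in bits 2:0 of PSTATE_STATUS
 * and that writing an index to PSTATE_CTRL requests the transition. */
static inline unsigned int k8_read_current_pstate(void)
{
    uint64_t val;
    rdmsrl(MSR_K8_PSTATE_STATUS, val);
    return (unsigned int)(val & 0x7);
}

static inline void k8_request_pstate(unsigned int idx)
{
    wrmsrl(MSR_K8_PSTATE_CTRL, (uint64_t)(idx & 0x7));
}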
+#define MSR_K8_PSTATE2 0xc0010066 +#define MSR_K8_PSTATE3 0xc0010067 +#define MSR_K8_PSTATE4 0xc0010068 +#define MSR_K8_PSTATE5 0xc0010069 +#define MSR_K8_PSTATE6 0xc001006A +#define MSR_K8_PSTATE7 0xc001006B +#define MSR_K8_ENABLE_C1E 0xc0010055 #define MSR_K8_TOP_MEM1 0xC001001A #define MSR_K8_TOP_MEM2 0xC001001D @@ -357,6 +375,9 @@ static inline void write_efer(__u64 val) #define MSR_K8_VM_CR 0xC0010114 #define MSR_K8_VM_HSAVE_PA 0xC0010117 +#define MSR_K8_FIDVID_CTL 0xC0010041 +#define MSR_K8_FIDVID_STATUS 0xC0010042 + /* MSR_K8_VM_CR bits: */ #define _K8_VMCR_SVME_DISABLE 4 #define K8_VMCR_SVME_DISABLE (1 << _K8_VMCR_SVME_DISABLE) diff -Naurp xen/include/asm-x86/mtrr.h xen-redhat/include/asm-x86/mtrr.h --- xen/include/asm-x86/mtrr.h +++ xen-redhat/include/asm-x86/mtrr.h @@ -18,5 +18,8 @@ extern int mtrr_add_page(unsigned long b extern int mtrr_del(int reg, unsigned long base, unsigned long size); extern int mtrr_del_page(int reg, unsigned long base, unsigned long size); extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); +extern int hold_mtrr_updates_on_aps; +extern void mtrr_aps_sync_begin(void); +extern void mtrr_aps_sync_end(void); #endif /* __ASM_X86_MTRR_H__ */ diff -Naurp xen/include/asm-x86/numa.h xen-redhat/include/asm-x86/numa.h --- xen/include/asm-x86/numa.h +++ xen-redhat/include/asm-x86/numa.h @@ -52,7 +52,7 @@ struct node_data { extern struct node_data node_data[]; -static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) +static inline __attribute__((pure)) int phys_to_nid(paddr_t addr) { unsigned nid; VIRTUAL_BUG_ON((addr >> memnode_shift) >= NODEMAPSIZE); diff -Naurp xen/include/asm-x86/p2m.h xen-redhat/include/asm-x86/p2m.h --- xen/include/asm-x86/p2m.h +++ xen-redhat/include/asm-x86/p2m.h @@ -26,6 +26,78 @@ #ifndef _XEN_P2M_H #define _XEN_P2M_H +#include <asm/paging.h> + +#if 1 /* XEN_VERSION == 3 && XEN_SUBVERSION < 2 */ + +typedef enum { + p2m_invalid = 0, /* Nothing mapped here */ + p2m_ram_rw = 1, /* Normal read/write guest RAM */ + p2m_ram_logdirty = 2, /* Temporarily read-only for log-dirty */ + p2m_ram_ro = 3, /* Read-only; writes go to the device model */ + p2m_mmio_dm = 4, /* Reads and write go to the device model */ + p2m_mmio_direct = 5, /* Read/write mapping of genuine MMIO area */ +} p2m_type_t; + +/* We use bitmaps and maks to handle groups of types */ +#define p2m_to_mask(_t) (1UL << (_t)) + +/* RAM types, which map to real machine frames */ +#define P2M_RAM_TYPES (p2m_to_mask(p2m_ram_rw) \ + | p2m_to_mask(p2m_ram_logdirty) \ + | p2m_to_mask(p2m_ram_ro)) + +/* MMIO types, which don't have to map to anything in the frametable */ +#define P2M_MMIO_TYPES (p2m_to_mask(p2m_mmio_dm) \ + | p2m_to_mask(p2m_mmio_direct)) + +/* Read-only types, which must have the _PAGE_RW bit clear in their PTEs */ +#define P2M_RO_TYPES (p2m_to_mask(p2m_ram_logdirty) \ + | p2m_to_mask(p2m_ram_ro)) + +/* Useful predicates */ +#define p2m_is_ram(_t) (p2m_to_mask(_t) & P2M_RAM_TYPES) +#define p2m_is_mmio(_t) (p2m_to_mask(_t) & P2M_MMIO_TYPES) +#define p2m_is_readonly(_t) (p2m_to_mask(_t) & P2M_RO_TYPES) +#define p2m_is_valid(_t) (p2m_to_mask(_t) & (P2M_RAM_TYPES | P2M_MMIO_TYPES)) + +/* PTE flags for the various types of p2m entry */ +#define P2M_BASE_FLAGS \ + (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | _PAGE_ACCESSED) + +/* Extract the type from the PTE flags that store it */ +static inline p2m_type_t p2m_flags_to_type(unsigned long flags) +{ + if ( (flags & _PAGE_RW) && (flags & _PAGE_PCD) ) + return p2m_mmio_direct; + else if ( flags & _PAGE_RW ) + 
return p2m_ram_rw; + else if ( paging_mode_log_dirty(current->domain) ) + return p2m_ram_logdirty; + else + return p2m_invalid; +} + +static inline unsigned long p2m_type_to_flags(p2m_type_t t) +{ + unsigned long flags = 0; + switch(t) + { + case p2m_ram_rw: + return flags | P2M_BASE_FLAGS | _PAGE_RW; + case p2m_mmio_direct: + return flags | P2M_BASE_FLAGS | _PAGE_RW | _PAGE_PCD; + case p2m_ram_logdirty: + case p2m_ram_ro: + case p2m_mmio_dm: + return flags | P2M_BASE_FLAGS; + case p2m_invalid: + default: + return flags; + } +} + +#endif /* The phys_to_machine_mapping is the reversed mapping of MPT for full * virtualization. It is only used by shadow_mode_translate()==true @@ -38,8 +110,19 @@ /* Read the current domain's P2M table. */ static inline mfn_t gfn_to_mfn_current(unsigned long gfn) { + return current->domain->arch.p2m.get_entry_fast(gfn); +} + +/* Read the current domain's p2m table (through the linear mapping). */ +static inline mfn_t p2m_gfn_to_mfn_fast(unsigned long gfn) +{ l1_pgentry_t l1e = l1e_empty(); + l2_pgentry_t l2e = l2e_empty(); int ret; + paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT; +#if CONFIG_PAGING_LEVELS >= 4 + l3_pgentry_t l3e = l3e_empty(); +#endif if ( gfn > current->domain->arch.p2m.max_mapped_pfn ) return _mfn(INVALID_MFN); @@ -47,18 +130,50 @@ static inline mfn_t gfn_to_mfn_current(u /* Don't read off the end of the p2m table */ ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t)); - ret = __copy_from_user(&l1e, - &phys_to_machine_mapping[gfn], - sizeof(l1e)); - if ( (ret == 0) && (l1e_get_flags(l1e) & _PAGE_PRESENT) ) - return _mfn(l1e_get_pfn(l1e)); +#if CONFIG_PAGING_LEVELS >= 4 + /* check whether 1GB is available or not */ + ret = __copy_from_user(&l3e, + &__linear_l2_table[l2_linear_offset(RO_MPT_VIRT_START) + l3_linear_offset(addr)], + sizeof(l3e)); + if ( (ret == 0) && (l3e_get_flags(l3e) & _PAGE_PRESENT) && + (l3e_get_flags(l3e) & _PAGE_PSE) ) + { + return _mfn(l3e_get_pfn(l3e) + + l2_table_offset(addr) * L1_PAGETABLE_ENTRIES + + l1_table_offset(addr)); + } +#endif + + /* check 2MB entry */ + ret = __copy_from_user(&l2e, + &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START) + l2_linear_offset(addr)], + sizeof(l2e)); + + if ( (ret == 0) && (l2e_get_flags(l2e) & _PAGE_PRESENT) && + (l2e_get_flags(l2e) & _PAGE_PSE) ) + { + return _mfn(l2e_get_pfn(l2e) + l1_table_offset(addr)); + } + else + { + ret = __copy_from_user(&l1e, + &phys_to_machine_mapping[gfn], + sizeof(l1e)); + + if ( (ret == 0) && (l1e_get_flags(l1e) & _PAGE_PRESENT) ) + return _mfn(l1e_get_pfn(l1e)); + } return _mfn(INVALID_MFN); } /* Read another domain's P2M table, mapping pages as we go */ -mfn_t gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn); +static inline +mfn_t gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn) +{ + return d->arch.p2m.get_entry(d, gpfn); +} /* General conversion function from gfn to mfn */ static inline mfn_t gfn_to_mfn(struct domain *d, unsigned long gfn) @@ -104,6 +219,9 @@ gl1e_to_ml1e(struct domain *d, l1_pgentr } +/* Set mmio addresses in the p2m table (for pass-through) */ +int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn); +int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn); /* Init the datastructures for later use by the p2m code */ void p2m_init(struct domain *d); @@ -122,16 +240,18 @@ int p2m_alloc_table(struct domain *d, void p2m_teardown(struct domain *d); /* Add a page to a domain's p2m table */ -void guest_physmap_add_page(struct domain *d, unsigned long gfn, - unsigned long 
mfn); +int guest_physmap_add_page(struct domain *d, unsigned long gfn, + unsigned long mfn, int order); /* Remove a page from a domain's p2m table */ void guest_physmap_remove_page(struct domain *d, unsigned long gfn, - unsigned long mfn); + unsigned long mfn, int order); /* set P2M table l1e flags */ void p2m_set_flags_global(struct domain *d, u32 l1e_flags); +void p2m_change_entry_type_global(struct domain *d, u32 l1e_flags); + /* set P2M table l1e flags for a gpa */ int p2m_set_flags(struct domain *d, paddr_t gpa, u32 l1e_flags); diff -Naurp xen/include/asm-x86/page.h xen-redhat/include/asm-x86/page.h --- xen/include/asm-x86/page.h +++ xen-redhat/include/asm-x86/page.h @@ -294,20 +294,6 @@ void paging_init(void); void setup_idle_pagetable(void); #endif /* !defined(__ASSEMBLY__) */ -#define __pge_off() \ - do { \ - __asm__ __volatile__( \ - "mov %0, %%cr4; # turn off PGE " \ - : : "r" (mmu_cr4_features & ~X86_CR4_PGE) ); \ - } while ( 0 ) - -#define __pge_on() \ - do { \ - __asm__ __volatile__( \ - "mov %0, %%cr4; # turn off PGE " \ - : : "r" (mmu_cr4_features) ); \ - } while ( 0 ) - #define _PAGE_PRESENT 0x001U #define _PAGE_RW 0x002U #define _PAGE_USER 0x004U diff -Naurp xen/include/asm-x86/paging.h xen-redhat/include/asm-x86/paging.h --- xen/include/asm-x86/paging.h +++ xen-redhat/include/asm-x86/paging.h @@ -36,6 +36,9 @@ /***************************************************************************** * Macros to tell which paging mode a domain is in */ +#define hap_enabled(d) (hvm_funcs.hap_supported && is_hvm_domain(d)) +#define hap_1gb_pgtb(d) (hvm_funcs.hap_1gb_pgtb && is_hvm_domain(d)) + #define PG_SH_shift 20 #define PG_HAP_shift 21 /* We're in one of the shadow modes */ diff -Naurp xen/include/asm-x86/processor.h xen-redhat/include/asm-x86/processor.h --- xen/include/asm-x86/processor.h +++ xen-redhat/include/asm-x86/processor.h @@ -8,6 +8,8 @@ #include <xen/config.h> #include <xen/cache.h> #include <xen/types.h> +#include <xen/smp.h> +#include <xen/percpu.h> #include <public/xen.h> #include <asm/types.h> #include <asm/cpufeature.h> @@ -194,6 +196,7 @@ extern int phys_proc_id[NR_CPUS]; extern int cpu_core_id[NR_CPUS]; extern void identify_cpu(struct cpuinfo_x86 *); +extern void setup_clear_cpu_cap(unsigned int); extern void print_cpu_info(struct cpuinfo_x86 *); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern void dodgy_tsc(void); @@ -296,16 +299,17 @@ static inline unsigned long read_cr2(voi return __cr2; } +DECLARE_PER_CPU(unsigned long, cr4); + static inline unsigned long read_cr4(void) { - unsigned long __cr4; - __asm__("mov %%cr4,%0\n\t" :"=r" (__cr4)); - return __cr4; -} + return this_cpu(cr4); +} static inline void write_cr4(unsigned long val) { - __asm__("mov %0,%%cr4": :"r" ((unsigned long)val)); + this_cpu(cr4) = val; + asm volatile ( "mov %0,%%cr4" : : "r" (val) ); } @@ -331,24 +335,14 @@ extern unsigned long mmu_cr4_features; static always_inline void set_in_cr4 (unsigned long mask) { - unsigned long dummy; mmu_cr4_features |= mask; - __asm__ __volatile__ ( - "mov %%cr4,%0\n\t" - "or %1,%0\n\t" - "mov %0,%%cr4\n" - : "=&r" (dummy) : "irg" (mask) ); + write_cr4(read_cr4() | mask); } static always_inline void clear_in_cr4 (unsigned long mask) { - unsigned long dummy; mmu_cr4_features &= ~mask; - __asm__ __volatile__ ( - "mov %%cr4,%0\n\t" - "and %1,%0\n\t" - "mov %0,%%cr4\n" - : "=&r" (dummy) : "irg" (~mask) ); + write_cr4(read_cr4() & ~mask); } /* diff -Naurp xen/include/asm-x86/smp.h xen-redhat/include/asm-x86/smp.h --- 
xen/include/asm-x86/smp.h +++ xen-redhat/include/asm-x86/smp.h @@ -13,7 +13,6 @@ #ifdef CONFIG_X86_LOCAL_APIC #ifndef __ASSEMBLY__ -#include <asm/fixmap.h> #include <asm/bitops.h> #include <asm/mpspec.h> #ifdef CONFIG_X86_IO_APIC diff -Naurp xen/include/asm-x86/spinlock.h xen-redhat/include/asm-x86/spinlock.h --- xen/include/asm-x86/spinlock.h +++ xen-redhat/include/asm-x86/spinlock.h @@ -8,22 +8,22 @@ typedef struct { volatile s16 lock; - s8 recurse_cpu; + u8 recurse_cpu; u8 recurse_cnt; } spinlock_t; #define SPIN_LOCK_UNLOCKED /*(spinlock_t)*/ { 1, -1, 0 } #define spin_lock_init(x) do { *(x) = (spinlock_t) SPIN_LOCK_UNLOCKED; } while(0) -#define spin_is_locked(x) (*(volatile char *)(&(x)->lock) <= 0) +#define spin_is_locked(x) ((x)->lock <= 0) static inline void _raw_spin_lock(spinlock_t *lock) { __asm__ __volatile__ ( - "1: lock; decb %0 \n" + "1: lock; decw %0 \n" " js 2f \n" ".section .text.lock,\"ax\"\n" - "2: cmpb $0,%0 \n" + "2: cmpw $0,%0 \n" " rep; nop \n" " jle 2b \n" " jmp 1b \n" @@ -36,23 +36,23 @@ static inline void _raw_spin_unlock(spin #if !defined(CONFIG_X86_OOSTORE) ASSERT(spin_is_locked(lock)); __asm__ __volatile__ ( - "movb $1,%0" + "movw $1,%0" : "=m" (lock->lock) : : "memory" ); #else - char oldval = 1; + s16 oldval = 1; ASSERT(spin_is_locked(lock)); __asm__ __volatile__ ( - "xchgb %b0, %1" - : "=q" (oldval), "=m" (lock->lock) : "0" (oldval) : "memory" ); + "xchgw %w0, %1" + : "=r" (oldval), "=m" (lock->lock) : "0" (oldval) : "memory" ); #endif } static inline int _raw_spin_trylock(spinlock_t *lock) { - char oldval; + s16 oldval; __asm__ __volatile__( - "xchgb %b0,%1" - :"=q" (oldval), "=m" (lock->lock) + "xchgw %w0,%1" + :"=r" (oldval), "=m" (lock->lock) :"0" (0) : "memory"); return oldval > 0; } diff -Naurp xen/include/asm-x86/system.h xen-redhat/include/asm-x86/system.h --- xen/include/asm-x86/system.h +++ xen-redhat/include/asm-x86/system.h @@ -14,6 +14,9 @@ #define wbinvd() \ __asm__ __volatile__ ("wbinvd": : :"memory"); +#define clflush(a) \ + __asm__ __volatile__ ("clflush (%0)": :"r"(a)); + #define nop() __asm__ __volatile__ ("nop") #define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr)))) diff -Naurp xen/include/asm-x86/time.h xen-redhat/include/asm-x86/time.h --- xen/include/asm-x86/time.h +++ xen-redhat/include/asm-x86/time.h @@ -26,4 +26,9 @@ extern int time_resume(void); extern void init_percpu_time(void); +struct ioreq; +int dom0_pit_access(struct ioreq *ioreq); + +int cpu_frequency_change(u64 freq); + #endif /* __X86_TIME_H__ */ diff -Naurp xen/include/asm-x86/x86_32/elf.h xen-redhat/include/asm-x86/x86_32/elf.h --- xen/include/asm-x86/x86_32/elf.h +++ xen-redhat/include/asm-x86/x86_32/elf.h @@ -1,8 +1,6 @@ #ifndef __X86_32_ELF_H__ #define __X86_32_ELF_H__ -#include <asm/processor.h> - typedef struct { unsigned long ebx; unsigned long ecx; @@ -40,7 +38,7 @@ static inline void elf_core_save_regs(EL asm volatile("movw %%fs, %%ax;" :"=a"(core_regs->fs)); asm volatile("movw %%gs, %%ax;" :"=a"(core_regs->gs)); /* orig_eax not filled in for now */ - core_regs->eip = (unsigned long)current_text_addr(); + core_regs->eip = (unsigned long)elf_core_save_regs; asm volatile("movw %%cs, %%ax;" :"=a"(core_regs->cs)); asm volatile("pushfl; popl %0" :"=m"(core_regs->eflags)); asm volatile("movl %%esp,%0" : "=m"(core_regs->esp)); diff -Naurp xen/include/asm-x86/x86_64/elf.h xen-redhat/include/asm-x86/x86_64/elf.h --- xen/include/asm-x86/x86_64/elf.h +++ xen-redhat/include/asm-x86/x86_64/elf.h @@ -1,8 +1,6 @@ #ifndef __X86_64_ELF_H__ 
#define __X86_64_ELF_H__ -#include <asm/processor.h> - typedef struct { unsigned long r15; unsigned long r14; @@ -54,7 +52,7 @@ static inline void elf_core_save_regs(EL asm volatile("movq %%rsi,%0" : "=m"(core_regs->rsi)); asm volatile("movq %%rdi,%0" : "=m"(core_regs->rdi)); /* orig_rax not filled in for now */ - core_regs->rip = (unsigned long)current_text_addr(); + core_regs->rip = (unsigned long)elf_core_save_regs; asm volatile("movl %%cs, %%eax;" :"=a"(core_regs->cs)); asm volatile("pushfq; popq %0" :"=m"(core_regs->eflags)); asm volatile("movq %%rsp,%0" : "=m"(core_regs->rsp)); diff -Naurp xen/include/asm-x86/x86_64/uaccess.h xen-redhat/include/asm-x86/x86_64/uaccess.h --- xen/include/asm-x86/x86_64/uaccess.h +++ xen-redhat/include/asm-x86/x86_64/uaccess.h @@ -8,7 +8,7 @@ * non-canonical address (and thus fault) before ever reaching VIRT_START. */ #define __addr_ok(addr) \ - (((unsigned long)(addr) < (1UL<<48)) || \ + (((unsigned long)(addr) < (1UL<<47)) || \ ((unsigned long)(addr) >= HYPERVISOR_VIRT_END)) #define access_ok(addr, size) (__addr_ok(addr)) diff -Naurp xen/include/Makefile xen-redhat/include/Makefile --- xen/include/Makefile +++ xen-redhat/include/Makefile @@ -19,7 +19,8 @@ headers-y := \ compat/version.h \ compat/xen.h \ compat/xencomm.h \ - compat/xenoprof.h + compat/xenoprof.h \ + compat/stratus.h headers-$(CONFIG_X86) += compat/arch-x86/xen.h headers-$(CONFIG_X86) += compat/arch-x86/xen-$(compat-arch-y).h headers-y += compat/arch-$(compat-arch-y).h compat/xlat.h diff -Naurp xen/include/public/domctl.h xen-redhat/include/public/domctl.h --- xen/include/public/domctl.h +++ xen-redhat/include/public/domctl.h @@ -432,7 +432,121 @@ struct xen_domctl_sendtrigger { typedef struct xen_domctl_sendtrigger xen_domctl_sendtrigger_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_sendtrigger_t); - + +/* Assign PCI device to HVM guest. Sets up IOMMU structures. */ +#define XEN_DOMCTL_assign_device 37 +#define XEN_DOMCTL_test_assign_device 45 +#define XEN_DOMCTL_deassign_device 47 +struct xen_domctl_assign_device { + uint32_t machine_bdf; /* machine PCI ID of assigned device */ +}; +typedef struct xen_domctl_assign_device xen_domctl_assign_device_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_assign_device_t); + +/* Retrieve sibling devices infomation of machine_bdf */ +#define XEN_DOMCTL_get_device_group 50 +struct xen_domctl_get_device_group { + uint32_t machine_bdf; /* IN */ + uint32_t max_sdevs; /* IN */ + uint32_t num_sdevs; /* OUT */ + XEN_GUEST_HANDLE_64(uint32_t) sdev_array; /* OUT */ +}; +typedef struct xen_domctl_get_device_group xen_domctl_get_device_group_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_get_device_group_t); + + +/* Pass-through interrupts: bind real irq -> hvm devfn. */ +#define XEN_DOMCTL_bind_pt_irq 38 +#define XEN_DOMCTL_unbind_pt_irq 48 +typedef enum pt_irq_type_e { + PT_IRQ_TYPE_PCI, + PT_IRQ_TYPE_ISA, + PT_IRQ_TYPE_MSI, +} pt_irq_type_t; +struct xen_domctl_bind_pt_irq { + uint32_t machine_irq; + pt_irq_type_t irq_type; + uint32_t hvm_domid; + + union { + struct { + uint8_t isa_irq; + } isa; + struct { + uint8_t bus; + uint8_t device; + uint8_t intx; + } pci; + struct { + uint8_t gvec; + uint32_t gflags; + } msi; + } u; +}; +typedef struct xen_domctl_bind_pt_irq xen_domctl_bind_pt_irq_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_bind_pt_irq_t); + + +/* Bind machine I/O address range -> HVM address range. 
*/ +#define XEN_DOMCTL_memory_mapping 39 +#define DPCI_ADD_MAPPING 1 +#define DPCI_REMOVE_MAPPING 0 +struct xen_domctl_memory_mapping { + uint64_aligned_t first_gfn; /* first page (hvm guest phys page) in range */ + uint64_aligned_t first_mfn; /* first page (machine page) in range */ + uint64_aligned_t nr_mfns; /* number of pages in range (>0) */ + uint32_t add_mapping; /* add or remove mapping */ + uint32_t padding; /* padding for 64-bit aligned structure */ +}; +typedef struct xen_domctl_memory_mapping xen_domctl_memory_mapping_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_memory_mapping_t); + +#define XEN_DOMCTL_gdbsx_guestmemio 1000 /* guest mem io */ +struct xen_domctl_gdbsx_memio { + uint64_aligned_t pgd3val;/* optional: init_mm.pgd[3] value */ + uint64_aligned_t gva; /* guest virtual address */ + uint64_aligned_t uva; /* user buffer virtual address */ + int len; /* number of bytes to read/write */ + int gwr; /* 0 = read from guest. 1 = write to guest */ + int remain; /* bytes remaining to be copied */ +}; + +#define XEN_DOMCTL_gdbsx_pausevcpu 1001 +#define XEN_DOMCTL_gdbsx_unpausevcpu 1002 +struct xen_domctl_gdbsx_pauseunp_vcpu { /* pause/unpause a vcpu */ + uint32_t vcpu; /* which vcpu */ +}; + +#define XEN_DOMCTL_gdbsx_domstatus 1003 +struct xen_domctl_gdbsx_domstatus { + int paused; /* is the domain paused */ + uint32_t vcpu_id; /* any vcpu in an event? */ + uint32_t vcpu_ev; /* if yes, what event? */ + +}; + +/* Bind machine I/O port range -> HVM I/O port range. */ +#define XEN_DOMCTL_ioport_mapping 40 +struct xen_domctl_ioport_mapping { + uint32_t first_gport; /* first guest IO port*/ + uint32_t first_mport; /* first machine IO port */ + uint32_t nr_ports; /* size of port range */ + uint32_t add_mapping; /* add or remove mapping */ +}; +typedef struct xen_domctl_ioport_mapping xen_domctl_ioport_mapping_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_ioport_mapping_t); + +/* + * Request a particular record from the HVM context + */ +#define XEN_DOMCTL_gethvmcontext_partial 55 +typedef struct xen_domctl_hvmcontext_partial { + uint32_t type; /* IN: Type of record required */ + uint32_t instance; /* IN: Instance of that type */ + XEN_GUEST_HANDLE_64(uint8_t) buffer; /* OUT: buffer to write record into */ +} xen_domctl_hvmcontext_partial_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_hvmcontext_partial_t); + struct xen_domctl { uint32_t cmd; uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */ @@ -460,8 +574,17 @@ struct xen_domctl { struct xen_domctl_settimeoffset settimeoffset; struct xen_domctl_real_mode_area real_mode_area; struct xen_domctl_hvmcontext hvmcontext; + struct xen_domctl_hvmcontext_partial hvmcontext_partial; struct xen_domctl_address_size address_size; struct xen_domctl_sendtrigger sendtrigger; + struct xen_domctl_get_device_group get_device_group; + struct xen_domctl_assign_device assign_device; + struct xen_domctl_bind_pt_irq bind_pt_irq; + struct xen_domctl_memory_mapping memory_mapping; + struct xen_domctl_ioport_mapping ioport_mapping; + struct xen_domctl_gdbsx_memio gdbsx_guest_memio; + struct xen_domctl_gdbsx_pauseunp_vcpu gdbsx_pauseunp_vcpu; + struct xen_domctl_gdbsx_domstatus gdbsx_domstatus; uint8_t pad[128]; } u; }; diff -Naurp xen/include/public/features.h xen-redhat/include/public/features.h --- xen/include/public/features.h +++ xen-redhat/include/public/features.h @@ -56,6 +56,9 @@ */ #define XENFEAT_pae_pgdir_above_4gb 4 +/* x86: Does this Xen host support the MMU_PT_UPDATE_PRESERVE_AD hypercall? 
*/ +#define XENFEAT_mmu_pt_update_preserve_ad 5 + #define XENFEAT_NR_SUBMAPS 1 #endif /* __XEN_PUBLIC_FEATURES_H__ */ diff -Naurp xen/include/public/grant_table.h xen-redhat/include/public/grant_table.h --- xen/include/public/grant_table.h +++ xen-redhat/include/public/grant_table.h @@ -370,7 +370,8 @@ DEFINE_XEN_GUEST_HANDLE(gnttab_query_siz #define GNTST_no_device_space (-7) /* Out of space in I/O MMU. */ #define GNTST_permission_denied (-8) /* Not enough privilege for operation. */ #define GNTST_bad_page (-9) /* Specified page was invalid for op. */ -#define GNTST_bad_copy_arg (-10) /* copy arguments cross page boundary */ +#define GNTST_bad_copy_arg (-10) /* copy arguments cross page boundary. */ +#define GNTST_address_too_big (-11) /* transfer page address too large. */ #define GNTTABOP_error_msgs { \ "okay", \ @@ -383,7 +384,8 @@ DEFINE_XEN_GUEST_HANDLE(gnttab_query_siz "no spare translation slot in the I/O MMU", \ "permission denied", \ "bad page", \ - "copy arguments cross page boundary" \ + "copy arguments cross page boundary", \ + "page address size too large" \ } #endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */ diff -Naurp xen/include/public/hvm/hvm_op.h xen-redhat/include/public/hvm/hvm_op.h --- xen/include/public/hvm/hvm_op.h +++ xen-redhat/include/public/hvm/hvm_op.h @@ -73,4 +73,12 @@ DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_pci_ /* Flushes all VCPU TLBs: @arg must be NULL. */ #define HVMOP_flush_tlbs 5 +/* Get the current Xen time, in nanoseconds since system boot. */ +#define HVMOP_get_time 10 +struct xen_hvm_get_time { + uint64_t now; /* OUT */ +}; +typedef struct xen_hvm_get_time xen_hvm_get_time_t; +DEFINE_XEN_GUEST_HANDLE(xen_hvm_get_time_t); + #endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */ diff -Naurp xen/include/public/hvm/params.h xen-redhat/include/public/hvm/params.h --- xen/include/public/hvm/params.h +++ xen-redhat/include/public/hvm/params.h @@ -52,9 +52,36 @@ #ifdef __ia64__ #define HVM_PARAM_NVRAM_FD 7 -#define HVM_NR_PARAMS 8 -#else -#define HVM_NR_PARAMS 7 #endif +/* + * Set mode for virtual timers (currently x86 only): + * delay_for_missed_ticks (default): + * Do not advance a vcpu's time beyond the correct delivery time for + * interrupts that have been missed due to preemption. Deliver missed + * interrupts when the vcpu is rescheduled and advance the vcpu's virtual + * time stepwise for each one. + * no_delay_for_missed_ticks: + * As above, missed interrupts are delivered, but guest time always tracks + * wallclock (i.e., real) time while doing so. + * no_missed_ticks_pending: + * No missed interrupts are held pending. Instead, to ensure ticks are + * delivered at some non-zero rate, if we detect missed ticks then the + * internal tick alarm is not disabled if the VCPU is preempted during the + * next tick period. + * one_missed_tick_pending: + * Missed interrupts are collapsed together and delivered as one 'late tick'. + * Guest time always tracks wallclock (i.e., real) time. + */ +#define HVM_PARAM_TIMER_MODE 10 +#define HVMPTM_delay_for_missed_ticks 0 +#define HVMPTM_no_delay_for_missed_ticks 1 +#define HVMPTM_no_missed_ticks_pending 2 +#define HVMPTM_one_missed_tick_pending 3 + +/* Boolean: Enable virtual HPET (high-precision event timer)? 
(x86-only) */ +#define HVM_PARAM_HPET_ENABLED 11 + +#define HVM_NR_PARAMS 12 + #endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */ diff -Naurp xen/include/public/hvm/save.h xen-redhat/include/public/hvm/save.h --- xen/include/public/hvm/save.h +++ xen-redhat/include/public/hvm/save.h @@ -328,7 +328,7 @@ struct hvm_hw_pci_irqs { * Indexed by: device*4 + INTx#. */ union { - DECLARE_BITMAP(i, 32*4); + unsigned long i[16 / sizeof (unsigned long)]; /* DECLARE_BITMAP(i, 32*4); */ uint64_t pad[2]; }; }; @@ -341,7 +341,7 @@ struct hvm_hw_isa_irqs { * Indexed by ISA IRQ (assumes no ISA-device IRQ sharing). */ union { - DECLARE_BITMAP(i, 16); + unsigned long i[1]; /* DECLARE_BITMAP(i, 16); */ uint64_t pad[1]; }; }; diff -Naurp xen/include/public/kexec.h xen-redhat/include/public/kexec.h --- xen/include/public/kexec.h +++ xen-redhat/include/public/kexec.h @@ -109,6 +109,7 @@ typedef struct xen_kexec_load { #define KEXEC_RANGE_MA_XEN 1 /* machine address and size of Xen itself */ #define KEXEC_RANGE_MA_CPU 2 /* machine address and size of a CPU note */ +#define KEXEC_RANGE_MA_VMCOREINFO 6 /* machine address and size of vmcoreinfo */ /* * Find the address and size of certain memory areas * range == KEXEC_RANGE_... [in] @@ -124,6 +125,27 @@ typedef struct xen_kexec_range { unsigned long start; } xen_kexec_range_t; +/* vmcoreinfo stuff */ +#define VMCOREINFO_BYTES (4096) +#define VMCOREINFO_NOTE_NAME "VMCOREINFO_XEN" +void arch_crash_save_vmcoreinfo(void); +void vmcoreinfo_append_str(const char *fmt, ...) + __attribute__ ((format (printf, 1, 2))); +#define VMCOREINFO_PAGESIZE(value) \ + vmcoreinfo_append_str("PAGESIZE=%ld\n", value) +#define VMCOREINFO_SYMBOL(name) \ + vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) +#define VMCOREINFO_SYMBOL_ALIAS(alias, name) \ + vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #alias, (unsigned long)&name) +#define VMCOREINFO_STRUCT_SIZE(name) \ + vmcoreinfo_append_str("SIZE(%s)=%zu\n", #name, sizeof(struct name)) +#define VMCOREINFO_OFFSET(name, field) \ + vmcoreinfo_append_str("OFFSET(%s.%s)=%zu\n", #name, #field, \ + offsetof(struct name, field)) +#define VMCOREINFO_OFFSET_ALIAS(name, field, alias) \ + vmcoreinfo_append_str("OFFSET(%s.%s)=%zu\n", #name, #alias, \ + offsetof(struct name, field)) + #endif /* _XEN_PUBLIC_KEXEC_H */ /* diff -Naurp xen/include/public/physdev.h xen-redhat/include/public/physdev.h --- xen/include/public/physdev.h +++ xen-redhat/include/public/physdev.h @@ -21,6 +21,8 @@ #ifndef __XEN_PUBLIC_PHYSDEV_H__ #define __XEN_PUBLIC_PHYSDEV_H__ +#include "xen.h" + /* * Prototype for this hypercall is: * int physdev_op(int cmd, void *args) @@ -117,7 +119,64 @@ struct physdev_irq { }; typedef struct physdev_irq physdev_irq_t; DEFINE_XEN_GUEST_HANDLE(physdev_irq_t); + +#define MAP_PIRQ_TYPE_MSI 0x0 +#define MAP_PIRQ_TYPE_GSI 0x1 +#define MAP_PIRQ_TYPE_UNKNOWN 0x2 + +#define PHYSDEVOP_map_pirq 13 +struct physdev_map_pirq { + domid_t domid; + /* IN */ + int type; + /* IN */ + int index; + /* IN or OUT */ + int pirq; + /* IN */ + int bus; + /* IN */ + int devfn; + /* IN */ + int entry_nr; + /* IN */ + uint64_t table_base; +}; +typedef struct physdev_map_pirq physdev_map_pirq_t; +DEFINE_XEN_GUEST_HANDLE(physdev_map_pirq_t); + +#define PHYSDEVOP_unmap_pirq 14 +struct physdev_unmap_pirq { + domid_t domid; + /* IN */ + int pirq; +}; + +typedef struct physdev_unmap_pirq physdev_unmap_pirq_t; +DEFINE_XEN_GUEST_HANDLE(physdev_unmap_pirq_t); + +#define PHYSDEVOP_manage_pci_add 15 +#define PHYSDEVOP_manage_pci_remove 16 +struct physdev_manage_pci { + /* IN 
*/ + uint8_t bus; + uint8_t devfn; +}; + +typedef struct physdev_manage_pci physdev_manage_pci_t; +DEFINE_XEN_GUEST_HANDLE(physdev_manage_pci_t); + +/* N.B. RHEL specific hypercall */ +#define PHYSDEVOP_set_device_msixtbl 1999 +struct physdev_device_msixtbl { + /* IN */ + uint8_t bus; + uint8_t devfn; + uint64_t gtable; +}; +typedef struct physdev_device_msixtbl physdev_device_msixtbl_t; +DEFINE_XEN_GUEST_HANDLE(physdev_device_msixtbl_t); /* * Argument to physdev_op_compat() hypercall. Superceded by new physdev_op() * hypercall since 0x00030202. diff -Naurp xen/include/public/platform.h xen-redhat/include/public/platform.h --- xen/include/public/platform.h +++ xen-redhat/include/public/platform.h @@ -28,6 +28,7 @@ #define __XEN_PUBLIC_PLATFORM_H__ #include "xen.h" +#include "stratus.h" #define XENPF_INTERFACE_VERSION 0x03000001 @@ -153,6 +154,45 @@ struct xenpf_firmware_info { typedef struct xenpf_firmware_info xenpf_firmware_info_t; DEFINE_XEN_GUEST_HANDLE(xenpf_firmware_info_t); +#define XENPF_stratus_call 0xffffffff +typedef struct xenpf_stratus_call xenpf_stratus_call_t; +DEFINE_XEN_GUEST_HANDLE(xenpf_stratus_call_t); + +#define XENPF_change_freq 52 +struct xenpf_change_freq { + /* IN variables */ + uint32_t flags; /* Must be zero. */ + uint32_t cpu; /* Physical cpu. */ + uint64_t freq; /* New frequency (Hz). */ +}; +typedef struct xenpf_change_freq xenpf_change_freq_t; +DEFINE_XEN_GUEST_HANDLE(xenpf_change_freq_t); + +/* + * Get idle times (nanoseconds since boot) for physical CPUs specified in the + * @cpumap_bitmap with range [0..@cpumap_nr_cpus-1]. The @idletime array is + * indexed by CPU number; only entries with the corresponding @cpumap_bitmap + * bit set are written to. On return, @cpumap_bitmap is modified so that any + * non-existent CPUs are cleared. Such CPUs have their @idletime array entry + * cleared. + */ +#define XENPF_getidletime 53 +struct xenpf_getidletime { + /* IN/OUT variables */ + /* IN: CPUs to interrogate; OUT: subset of IN which are present */ + XEN_GUEST_HANDLE(uint8_t) cpumap_bitmap; + /* IN variables */ + /* Size of cpumap bitmap. */ + uint32_t cpumap_nr_cpus; + /* Must be indexable for every cpu in cpumap_bitmap. */ + XEN_GUEST_HANDLE(uint64_t) idletime; + /* OUT variables */ + /* System time when the idletime snapshots were taken. */ + uint64_t now; +}; +typedef struct xenpf_getidletime xenpf_getidletime_t; +DEFINE_XEN_GUEST_HANDLE(xenpf_getidletime_t); + struct xen_platform_op { uint32_t cmd; uint32_t interface_version; /* XENPF_INTERFACE_VERSION */ @@ -164,6 +204,9 @@ struct xen_platform_op { struct xenpf_microcode_update microcode; struct xenpf_platform_quirk platform_quirk; struct xenpf_firmware_info firmware_info; + struct xenpf_change_freq change_freq; + struct xenpf_getidletime getidletime; + struct xenpf_stratus_call stratus_call; uint8_t pad[128]; } u; }; diff -Naurp xen/include/public/stratus.h xen-redhat/include/public/stratus.h --- xen/include/public/stratus.h +++ xen-redhat/include/public/stratus.h @@ -0,0 +1,76 @@ +#ifndef _CC_INTERFACE_H +#define _CC_INTERFACE_H + +// Clear the entire Host BIOS vector +#define CC_HBV_MEMSET 1 +// Read/Write from page 0 (HBV or DUMP) +#define CC_RW_REGION 2 +// Trigger SMI through local apic +#define CC_TRIGGER_SMI 3 +// Return local cpu apic id +#define CC_LAPIC_ID 4 +// Get/Set CR4. +#define CC_CR4 5 +// Get cpuid +#define CC_CPUID 6 +// Read/Write MSRs +#define CC_RW_MSR 7 +// Are we on a Stratus box? 
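As a usage note for the HVM_PARAM_TIMER_MODE parameter introduced above: the timer-mode policy is normally chosen by the toolstack when the guest is built. The sketch below is illustrative only and assumes libxc's xc_set_hvm_param() helper with its usual (handle, domid, param, value) signature; it is not part of this patch.

#include <xenctrl.h>

/* Hypothetical toolstack snippet: collapse missed ticks into a single
 * pending 'late tick' for an HVM guest (assumed xc_set_hvm_param()). */
static int set_guest_timer_mode(int xc_handle, domid_t domid)
{
    return xc_set_hvm_param(xc_handle, domid, HVM_PARAM_TIMER_MODE,
                            HVMPTM_one_missed_tick_pending);
}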
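Similarly, a rough dom0-side caller of the new XENPF_getidletime operation could look like the following sketch. HYPERVISOR_platform_op() and set_xen_guest_handle() are the usual PV-guest wrappers and are assumed here; the cpumap and idletime buffers are caller-provided and caller-sized.

/* Sketch: query per-CPU idle times.  'cpumap' is both IN (CPUs of
 * interest) and OUT (subset that actually exists); 'idle' must have one
 * uint64_t slot for every CPU addressable by the bitmap. */
static int query_idle_times(uint8_t *cpumap, uint32_t nr_cpus, uint64_t *idle)
{
    struct xen_platform_op op = {
        .cmd = XENPF_getidletime,
        .interface_version = XENPF_INTERFACE_VERSION,
    };

    set_xen_guest_handle(op.u.getidletime.cpumap_bitmap, cpumap);
    op.u.getidletime.cpumap_nr_cpus = nr_cpus;
    set_xen_guest_handle(op.u.getidletime.idletime, idle);

    if ( HYPERVISOR_platform_op(&op) != 0 )
        return -1;

    /* op.u.getidletime.now holds the system time of the snapshot. */
    return 0;
}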
+#define CC_VALIDATE_PLATFORM 8 + +// Page 0 regions to read/write (host bios vector or dump vector signature). +#define RW_HBV 1 +#define RW_DUMPVEC 2 + +struct cr4_struct { + int rw; // 0 = read, 1 = write. + unsigned long cr4; +}; + +struct cpuid_struct { + unsigned int op; + unsigned int eax, ebx, ecx, edx; +}; + +struct msr_struct { + int rw; + unsigned int msr; + unsigned long val; +}; + +struct lapic_struct { + int id; +}; + +struct rw_struct { + int rw; // 0 = read, 1 = write + int region; // RW_HBV or RW_CONTIG + void *data; + unsigned long where; // offset in region + int size; +}; + +struct smi_struct { + unsigned int dest; +}; + +struct hbv_memset_struct { + int val; + int size; +}; + +struct xenpf_stratus_call { + int cmd; + int ret; + union { + struct smi_struct smi; + struct hbv_memset_struct hbv_m; + struct rw_struct rw; + struct lapic_struct ls; + struct cr4_struct cr4; + struct cpuid_struct cpuid; + struct msr_struct msr; + } u; +}; + +#endif diff -Naurp xen/include/public/sysctl.h xen-redhat/include/public/sysctl.h --- xen/include/public/sysctl.h +++ xen-redhat/include/public/sysctl.h @@ -76,6 +76,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_sysctl_tbuf_ */ #define XEN_SYSCTL_physinfo 3 struct xen_sysctl_physinfo { + /* IN variables. */ uint32_t threads_per_core; uint32_t cores_per_socket; uint32_t sockets_per_node; @@ -85,6 +86,23 @@ struct xen_sysctl_physinfo { uint64_aligned_t free_pages; uint64_aligned_t scrub_pages; uint32_t hw_cap[8]; + + /* IN/OUT variables. */ + /* + * IN: maximum addressable entry in the caller-provided cpu_to_node array. + * OUT: largest cpu identifier in the system. + * If OUT is greater than IN then the cpu_to_node array is truncated! + */ + uint32_t max_cpu_id; + /* + * If not NULL, this array is filled with node identifier for each cpu. + * If a cpu has no node information (e.g., cpu not present) then the + * sentinel value ~0u is written. + * The size of this array is specified by the caller in @max_cpu_id. + * If the actual @max_cpu_id is smaller than the array then the trailing + * elements of the array will not be written by the sysctl. + */ + XEN_GUEST_HANDLE_64(uint32_t) cpu_to_node; }; typedef struct xen_sysctl_physinfo xen_sysctl_physinfo_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_physinfo_t); @@ -167,6 +185,18 @@ struct xen_sysctl_getcpuinfo { typedef struct xen_sysctl_getcpuinfo xen_sysctl_getcpuinfo_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_getcpuinfo_t); +#define XEN_SYSCTL_availheap 9 +struct xen_sysctl_availheap { + /* IN variables. */ + uint32_t min_bitwidth; /* Smallest address width (zero if don't care). */ + uint32_t max_bitwidth; /* Largest address width (zero if don't care). */ + int32_t node; /* NUMA node of interest (-1 for all nodes). */ + /* OUT variables. */ + uint64_t avail_bytes; /* Bytes available in the specified region. 
*/ +}; +typedef struct xen_sysctl_availheap xen_sysctl_availheap_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_availheap_t); + struct xen_sysctl { uint32_t cmd; uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */ @@ -178,6 +208,7 @@ struct xen_sysctl { struct xen_sysctl_perfc_op perfc_op; struct xen_sysctl_getdomaininfolist getdomaininfolist; struct xen_sysctl_debug_keys debug_keys; + struct xen_sysctl_availheap availheap; struct xen_sysctl_getcpuinfo getcpuinfo; uint8_t pad[128]; } u; diff -Naurp xen/include/public/vcpu.h xen-redhat/include/public/vcpu.h --- xen/include/public/vcpu.h +++ xen-redhat/include/public/vcpu.h @@ -170,7 +170,7 @@ DEFINE_XEN_GUEST_HANDLE(vcpu_set_singles * * This may be called only once per vcpu. */ -#define VCPUOP_register_vcpu_info 10 /* arg == struct vcpu_info */ +#define VCPUOP_register_vcpu_info 10 /* arg == vcpu_register_vcpu_info_t */ struct vcpu_register_vcpu_info { uint64_t mfn; /* mfn of page to place vcpu_info */ uint32_t offset; /* offset within page */ @@ -179,6 +179,22 @@ struct vcpu_register_vcpu_info { typedef struct vcpu_register_vcpu_info vcpu_register_vcpu_info_t; DEFINE_XEN_GUEST_HANDLE(vcpu_register_vcpu_info_t); +/* + * Get the physical ID information for a pinned vcpu's underlying physical + * processor. The physical ID informmation is architecture-specific. + * On x86: id[7:0]=apic_id, id[15:8]=acpi_id, id[63:16]=mbz, + * and an unavailable identifier is returned as 0xff. + * This command returns -EINVAL if it is not a valid operation for this VCPU. + */ +#define VCPUOP_get_physid 12 /* arg == vcpu_get_physid_t */ +struct vcpu_get_physid { + uint64_t phys_id; +}; +typedef struct vcpu_get_physid vcpu_get_physid_t; +DEFINE_XEN_GUEST_HANDLE(vcpu_get_physid_t); +#define xen_vcpu_physid_to_x86_apicid(physid) ((uint8_t)((physid)>>0)) +#define xen_vcpu_physid_to_x86_acpiid(physid) ((uint8_t)((physid)>>8)) + #endif /* __XEN_PUBLIC_VCPU_H__ */ /* diff -Naurp xen/include/public/xen.h xen-redhat/include/public/xen.h --- xen/include/public/xen.h +++ xen-redhat/include/public/xen.h @@ -168,9 +168,13 @@ * ptr[:2] -- Machine address within the frame whose mapping to modify. * The frame must belong to the FD, if one is specified. * val -- Value to write into the mapping entry. + * + * ptr[1:0] == MMU_PT_UPDATE_PRESERVE_AD: + * As MMU_NORMAL_PT_UPDATE above, but A/D bits in the PTE are preserved (ORed). */ -#define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */ -#define MMU_MACHPHYS_UPDATE 1 /* ptr = MA of frame to modify entry for */ +#define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. 
*/ +#define MMU_MACHPHYS_UPDATE 1 /* ptr = MA of frame to modify entry for */ +#define MMU_PT_UPDATE_PRESERVE_AD 2 /* '*ptr = val', preserve (OR) A/D bits */ /* * MMU EXTENDED OPERATIONS diff -Naurp xen/include/xen/acpi.h xen-redhat/include/xen/acpi.h --- xen/include/xen/acpi.h +++ xen-redhat/include/xen/acpi.h @@ -367,15 +367,88 @@ enum acpi_table_id { ACPI_SPMI, ACPI_HPET, ACPI_MCFG, + ACPI_DMAR, + ACPI_IVRS, ACPI_TABLE_COUNT }; +/* DMA Remapping Reporting Table (DMAR) */ + +#define DMAR_FLAGS_INTR_REMAP 0x1 /* intr remap supported */ +struct acpi_table_dmar { + struct acpi_table_header header; + u8 haw; /* Host address Width */ + u8 flags; + u8 reserved[10]; +} __attribute__ ((packed)); + +struct acpi_dmar_entry_header { + u16 type; + u16 length; +} __attribute__((packed)); + +enum acpi_dmar_entry_type { + ACPI_DMAR_DRHD = 0, + ACPI_DMAR_RMRR, + ACPI_DMAR_ATSR, + ACPI_DMAR_ENTRY_COUNT +}; + +#define DRHD_FLAGS_INCLUDE_ALL 0x1 /* drhd remaps remaining devices */ +struct acpi_table_drhd { + struct acpi_dmar_entry_header header; + u8 flags; + u8 reserved; + u16 segment; + u64 address; /* register base address for this drhd */ +} __attribute__ ((packed)); + +struct acpi_table_rmrr { + struct acpi_dmar_entry_header header; + u16 reserved; + u16 segment; + u64 base_address; + u64 end_address; +} __attribute__ ((packed)); + +struct acpi_table_atsr { + struct acpi_dmar_entry_header header; + u8 flags; + u8 reserved; + u16 segment; +} __attribute__ ((packed)); + +enum acpi_dev_scope_type { + ACPI_DEV_ENDPOINT=0x01, /* PCI Endpoing device */ + ACPI_DEV_P2PBRIDGE, /* PCI-PCI Bridge */ + ACPI_DEV_IOAPIC, /* IOAPIC device*/ + ACPI_DEV_MSI_HPET, /* MSI capable HPET*/ + ACPI_DEV_ENTRY_COUNT +}; + +struct acpi_dev_scope { + u8 dev_type; + u8 length; + u8 reserved[2]; + u8 enum_id; + u8 start_bus; +} __attribute__((packed)); + +struct acpi_pci_path { + u8 dev; + u8 fn; +} __attribute__((packed)); + +typedef int (*acpi_dmar_entry_handler) (struct acpi_dmar_entry_header *header, const unsigned long end); + + typedef int (*acpi_table_handler) (unsigned long phys_addr, unsigned long size); extern acpi_table_handler acpi_table_ops[ACPI_TABLE_COUNT]; typedef int (*acpi_madt_entry_handler) (acpi_table_entry_header *header, const unsigned long end); +unsigned int acpi_get_processor_id (unsigned int cpu); char * __acpi_map_table (unsigned long phys_addr, unsigned long size); unsigned long acpi_find_rsdp (void); int acpi_boot_init (void); diff -Naurp xen/include/xen/compat.h xen-redhat/include/xen/compat.h --- xen/include/xen/compat.h +++ xen-redhat/include/xen/compat.h @@ -176,15 +176,10 @@ void xlat_vcpu_runstate_info(struct vcpu int switch_compat(struct domain *); int switch_native(struct domain *); -#define BITS_PER_GUEST_LONG(d) \ - (!IS_COMPAT(d) ? BITS_PER_LONG : COMPAT_BITS_PER_LONG) - #else #define compat_handle_is_null(hnd) 0 -#define BITS_PER_GUEST_LONG(d) BITS_PER_LONG - #endif #endif /* __XEN_COMPAT_H__ */ diff -Naurp xen/include/xen/console.h xen-redhat/include/xen/console.h --- xen/include/xen/console.h +++ xen-redhat/include/xen/console.h @@ -26,9 +26,6 @@ void console_force_lock(void); void console_start_sync(void); void console_end_sync(void); -void console_start_log_everything(void); -void console_end_log_everything(void); - /* * Steal output from the console. Returns +ve identifier, else -ve error. * Takes the handle of the serial line to steal, and steal callback function. 
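The VCPUOP_get_physid addition earlier in this patch would be consumed as in the sketch below, which assumes the standard HYPERVISOR_vcpu_op() guest wrapper; printk is used purely for illustration.

/* Sketch: report the underlying APIC/ACPI IDs for a pinned vcpu.
 * The call fails with -EINVAL unless the vcpu is pinned; 0xff in either
 * field means the identifier is unavailable. */
static void show_vcpu_physid(int vcpu)
{
    struct vcpu_get_physid physid;

    if ( HYPERVISOR_vcpu_op(VCPUOP_get_physid, vcpu, &physid) != 0 )
        return;

    printk("vcpu%d: apic_id=%u acpi_id=%u\n", vcpu,
           xen_vcpu_physid_to_x86_apicid(physid.phys_id),
           xen_vcpu_physid_to_x86_acpiid(physid.phys_id));
}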
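To illustrate the new MMU_PT_UPDATE_PRESERVE_AD sub-command (advertised to guests via XENFEAT_mmu_pt_update_preserve_ad above), a PV guest could issue a PTE write that keeps the hardware-set A/D bits as sketched here. mmu_update_t, HYPERVISOR_mmu_update() and DOMID_SELF are the standard public-interface names; the address and value arguments are placeholders.

/* Sketch: write 'new_pte' at machine address 'pte_maddr', asking Xen to
 * OR back any Accessed/Dirty bits already set in the old PTE.  The low
 * two bits of 'ptr' select the sub-command. */
static int update_pte_preserve_ad(uint64_t pte_maddr, uint64_t new_pte)
{
    mmu_update_t req;

    req.ptr = (pte_maddr & ~3ULL) | MMU_PT_UPDATE_PRESERVE_AD;
    req.val = new_pte;

    return HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
}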
diff -Naurp xen/include/xen/cpumask.h xen-redhat/include/xen/cpumask.h --- xen/include/xen/cpumask.h +++ xen-redhat/include/xen/cpumask.h @@ -222,6 +222,15 @@ static inline int __next_cpu(int n, cons return min_t(int, nbits, find_next_bit(srcp->bits, nbits, n+1)); } +#define last_cpu(src) __last_cpu(&(src), NR_CPUS) +static inline int __last_cpu(const cpumask_t *srcp, int nbits) +{ + int cpu, pcpu = NR_CPUS; + for (cpu = first_cpu(*srcp); cpu < NR_CPUS; cpu = next_cpu(cpu, *srcp)) + pcpu = cpu; + return pcpu; +} + #define cpumask_of_cpu(cpu) \ ({ \ typeof(_unused_cpumask_arg_) m; \ diff -Naurp xen/include/xen/dmi.h xen-redhat/include/xen/dmi.h --- xen/include/xen/dmi.h +++ xen-redhat/include/xen/dmi.h @@ -34,5 +34,7 @@ struct dmi_system_id { extern int dmi_check_system(struct dmi_system_id *list); extern char * dmi_get_system_info(int field); +extern void dmi_scan_machine(void); +extern int dmi_get_table(u32 *base, u32 *len); #endif /* __DMI_H__ */ diff -Naurp xen/include/xen/domain.h xen-redhat/include/xen/domain.h --- xen/include/xen/domain.h +++ xen-redhat/include/xen/domain.h @@ -45,7 +45,7 @@ void arch_domain_destroy(struct domain * int arch_set_info_guest(struct vcpu *, vcpu_guest_context_u); void arch_get_info_guest(struct vcpu *, vcpu_guest_context_u); -void domain_relinquish_resources(struct domain *d); +int domain_relinquish_resources(struct domain *d); void dump_pageframe_info(struct domain *d); diff -Naurp xen/include/xen/elfcore.h xen-redhat/include/xen/elfcore.h --- xen/include/xen/elfcore.h +++ xen-redhat/include/xen/elfcore.h @@ -66,6 +66,7 @@ typedef struct { unsigned long xen_compile_time; unsigned long tainted; #ifdef CONFIG_X86 + unsigned long xen_phys_start; unsigned long dom0_pfn_to_mfn_frame_list_list; #endif } crash_xen_info_t; diff -Naurp xen/include/xen/gdbstub.h xen-redhat/include/xen/gdbstub.h --- xen/include/xen/gdbstub.h +++ xen-redhat/include/xen/gdbstub.h @@ -47,6 +47,7 @@ struct gdb_context { unsigned long out_offset; u8 out_csum; }; +extern struct gdb_context *gdb_ctx; /* interface to arch specific routines */ void gdb_write_to_packet( diff -Naurp xen/include/xen/hvm/iommu.h xen-redhat/include/xen/hvm/iommu.h --- xen/include/xen/hvm/iommu.h +++ xen-redhat/include/xen/hvm/iommu.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * + * Copyright (C) Allen Kay <allen.m.kay@intel.com> + */ + +#ifndef __XEN_HVM_IOMMU_H__ +#define __XEN_HVM_IOMMU_H__ + +#include <xen/iommu.h> + +struct g2m_ioport { + struct list_head list; + unsigned int gport; + unsigned int mport; + unsigned int np; +}; + +struct hvm_iommu { + u64 pgd_maddr; /* io page directory machine address */ + spinlock_t mapping_lock; /* io page table lock */ + int agaw; /* adjusted guest address width, 0 is level 2 30-bit */ + struct list_head g2m_ioport_list; /* guest to machine ioport mapping */ + domid_t iommu_domid; /* domain id stored in iommu */ + u64 iommu_bitmap; /* bitmap of iommu(s) that the domain uses */ + + /* amd iommu support */ + int domain_id; + int paging_mode; + struct page_info *root_table; + bool_t p2m_synchronized; + + /* iommu_ops */ + struct iommu_ops *platform_ops; +}; + +#endif /* __XEN_HVM_IOMMU_H__ */ diff -Naurp xen/include/xen/hvm/irq.h xen-redhat/include/xen/hvm/irq.h --- xen/include/xen/hvm/irq.h +++ xen-redhat/include/xen/hvm/irq.h @@ -0,0 +1,99 @@ +/****************************************************************************** + * irq.h + * + * Interrupt distribution and delivery logic. + * + * Copyright (c) 2006, K A Fraser, XenSource Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + */ + +#ifndef __XEN_HVM_IRQ_H__ +#define __XEN_HVM_IRQ_H__ + +#include <xen/types.h> +#include <xen/spinlock.h> +#include <asm/irq.h> +#include <public/hvm/save.h> + +struct dev_intx_gsi_link { + struct list_head list; + uint8_t device; + uint8_t intx; + uint8_t gsi; + uint8_t link; +}; + +#define _HVM_IRQ_DPCI_MSI 0x1 + +struct hvm_gmsi_info { + uint32_t gvec; + uint32_t gflags; +}; + +struct hvm_mirq_dpci_mapping { + uint32_t flags; + int pending; + struct list_head digl_list; + struct domain *dom; + struct hvm_gmsi_info gmsi; +}; + +struct hvm_girq_dpci_mapping { + uint8_t valid; + uint8_t device; + uint8_t intx; + uint8_t machine_gsi; +}; + +#define NR_ISAIRQS 16 +#define NR_LINK 4 + +/* Protected by domain's event_lock */ +struct hvm_irq_dpci { + /* Machine IRQ to guest device/intx mapping. */ + DECLARE_BITMAP(mapping, NR_IRQS); + struct hvm_mirq_dpci_mapping mirq[NR_IRQS]; + /* Guest IRQ to guest device/intx mapping. */ + struct hvm_girq_dpci_mapping girq[NR_IRQS]; + uint8_t msi_gvec_pirq[NR_VECTORS]; + DECLARE_BITMAP(dirq_mask, NR_IRQS); + /* Record of mapped ISA IRQs */ + DECLARE_BITMAP(isairq_map, NR_ISAIRQS); + /* Record of mapped Links */ + uint8_t link_cnt[NR_LINK]; + struct timer hvm_timer[NR_IRQS]; +}; + +/* Modify state of a PCI INTx wire. */ +void hvm_pci_intx_assert( + struct domain *d, unsigned int device, unsigned int intx); +void hvm_pci_intx_deassert( + struct domain *d, unsigned int device, unsigned int intx); + +/* Modify state of an ISA device's IRQ wire. 
*/ +void hvm_isa_irq_assert( + struct domain *d, unsigned int isa_irq); +void hvm_isa_irq_deassert( + struct domain *d, unsigned int isa_irq); + +void hvm_set_pci_link_route(struct domain *d, u8 link, u8 isa_irq); + +void hvm_maybe_deassert_evtchn_irq(void); +void hvm_assert_evtchn_irq(struct vcpu *v); +void hvm_set_callback_via(struct domain *d, uint64_t via); + +void hvm_dirq_assist(struct vcpu *v); + +#endif /* __XEN_HVM_IRQ_H__ */ diff -Naurp xen/include/xen/iommu.h xen-redhat/include/xen/iommu.h --- xen/include/xen/iommu.h +++ xen-redhat/include/xen/iommu.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) Allen Kay <allen.m.kay@intel.com> + */ + +#ifndef _IOMMU_H_ +#define _IOMMU_H_ + +#include <xen/init.h> +#include <xen/spinlock.h> +#include <xen/pci.h> +#include <public/hvm/ioreq.h> +#include <public/domctl.h> + +extern int vtd_enabled; +extern int iommu_enabled; +extern int iommu_pv_enabled; +extern int force_iommu; +extern int iommu_passthrough; +extern int iommu_snoop; +extern int iommu_intremap; +extern int iommu_intremap_cmdline; + +#define domain_hvm_iommu(d) (&d->arch.hvm_domain.hvm_iommu) + +#define MAX_IOMMUS 32 + +#define PAGE_SHIFT_4K (12) +#define PAGE_SIZE_4K (1UL << PAGE_SHIFT_4K) +#define PAGE_MASK_4K (((u64)-1) << PAGE_SHIFT_4K) +#define PAGE_ALIGN_4K(addr) (((addr) + PAGE_SIZE_4K - 1) & PAGE_MASK_4K) + +struct iommu { + struct list_head list; + void __iomem *reg; /* Pointer to hardware regs, virtual addr */ + u32 index; /* Sequence number of iommu */ + u32 gcmd; /* Holds TE, EAFL. 
Don't need SRTP, SFL, WBF */ + u32 nr_pt_levels; + u64 cap; + u64 ecap; + spinlock_t lock; /* protect context, domain ids */ + spinlock_t register_lock; /* protect iommu register handling */ + u64 root_maddr; /* root entry machine address */ + unsigned int vector; + struct intel_iommu *intel; +}; + +int iommu_setup(void); + +int iommu_add_device(struct pci_dev *pdev); +int iommu_remove_device(struct pci_dev *pdev); +int iommu_domain_init(struct domain *d); +void iommu_domain_destroy(struct domain *d); +int device_assignable(struct domain *d, u8 bus, u8 devfn); +int assign_device(struct domain *d, u8 bus, u8 devfn); +int deassign_device(struct domain *d, u8 bus, u8 devfn); +int iommu_get_device_group(struct domain *d, u8 bus, u8 devfn, + XEN_GUEST_HANDLE_64(uint32_t) buf, int max_sdevs); +int iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn); +int iommu_unmap_page(struct domain *d, unsigned long gfn); +void iommu_domain_teardown(struct domain *d); +int hvm_do_IRQ_dpci(struct domain *d, unsigned int irq); +int dpci_ioport_intercept(ioreq_t *p); +int pt_irq_create_bind_vtd(struct domain *d, + xen_domctl_bind_pt_irq_t *pt_irq_bind); +int pt_irq_destroy_bind_vtd(struct domain *d, + xen_domctl_bind_pt_irq_t *pt_irq_bind); +unsigned int io_apic_read_remap_rte(unsigned int apic, unsigned int reg); +void io_apic_write_remap_rte(unsigned int apic, + unsigned int reg, unsigned int value); + +struct msi_desc; +struct msi_msg; +void msi_msg_read_remap_rte(struct msi_desc *msi_desc, struct msi_msg *msg); +void msi_msg_write_remap_rte(struct msi_desc *msi_desc, struct msi_msg *msg); +struct qi_ctrl *iommu_qi_ctrl(struct iommu *iommu); +struct ir_ctrl *iommu_ir_ctrl(struct iommu *iommu); +struct iommu_flush *iommu_get_flush(struct iommu *iommu); +void hvm_dpci_isairq_eoi(struct domain *d, unsigned int isairq); +struct hvm_irq_dpci *domain_get_irq_dpci(struct domain *domain); +int domain_set_irq_dpci(struct domain *domain, struct hvm_irq_dpci *dpci); + +#define PT_IRQ_TIME_OUT MILLISECS(8) +#define VTDPREFIX "[VT-D]" + +struct iommu_ops { + int (*init)(struct domain *d); + int (*add_device)(struct pci_dev *pdev); + int (*remove_device)(struct pci_dev *pdev); + int (*assignable)(const struct domain *d); + int (*assign_device)(struct domain *d, u8 bus, u8 devfn); + void (*teardown)(struct domain *d); + int (*map_page)(struct domain *d, unsigned long gfn, unsigned long mfn); + int (*unmap_page)(struct domain *d, unsigned long gfn); + int (*reassign_device)(struct domain *s, struct domain *t, + u8 bus, u8 devfn); + int (*get_device_group_id)(u8 bus, u8 devfn); + void (*update_ire_from_apic)(unsigned int apic, unsigned int reg, unsigned int value); + void (*update_ire_from_msi)(struct msi_desc *msi_desc, struct msi_msg *msg); +}; + +void iommu_update_ire_from_apic(unsigned int apic, unsigned int reg, unsigned int value); +void iommu_update_ire_from_msi(struct msi_desc *msi_desc, struct msi_msg *msg); + +void iommu_suspend(void); +void iommu_resume(void); + +#endif /* _IOMMU_H_ */ diff -Naurp xen/include/xen/irq.h xen-redhat/include/xen/irq.h --- xen/include/xen/irq.h +++ xen-redhat/include/xen/irq.h @@ -4,6 +4,7 @@ #include <xen/config.h> #include <xen/cpumask.h> #include <xen/spinlock.h> +#include <xen/time.h> #include <asm/regs.h> #include <asm/hardirq.h> @@ -44,6 +45,7 @@ typedef struct hw_interrupt_type hw_irq_ #include <asm/irq.h> +struct msi_desc; /* * This is the "IRQ descriptor", which contains various information * about the irq, including what kind of hardware handling it has, @@ 
-54,15 +56,27 @@ typedef struct hw_interrupt_type hw_irq_ typedef struct { unsigned int status; /* IRQ status */ hw_irq_controller *handler; + struct msi_desc *msi_desc; struct irqaction *action; /* IRQ action list */ unsigned int depth; /* nested irq disables */ + int vector; spinlock_t lock; + cpumask_t affinity; + + /* irq ratelimit */ + s_time_t rl_quantum_start; + unsigned int rl_cnt; + struct list_head rl_link; } __cacheline_aligned irq_desc_t; extern irq_desc_t irq_desc[NR_IRQS]; extern int setup_irq(unsigned int, struct irqaction *); extern void free_irq(unsigned int); +extern int request_irq(unsigned int irq, + void (*handler)(int, void *, struct cpu_user_regs *), + unsigned long irqflags, const char * devname, + void *dev_id); extern hw_irq_controller no_irq_type; extern void no_action(int cpl, void *dev_id, struct cpu_user_regs *regs); @@ -72,6 +86,20 @@ struct vcpu; extern int pirq_guest_eoi(struct domain *d, int irq); extern int pirq_guest_unmask(struct domain *d); extern int pirq_guest_bind(struct vcpu *v, int irq, int will_share); -extern int pirq_guest_unbind(struct domain *d, int irq); +extern void pirq_guest_unbind(struct domain *d, int irq); +extern irq_desc_t *domain_spin_lock_irq_desc( + struct domain *d, int irq, unsigned long *pflags); + +static inline void set_native_irq_info(unsigned int vector, cpumask_t mask) +{ + irq_desc[vector].affinity = mask; +} + +#ifdef irq_to_vector +static inline void set_irq_info(int irq, cpumask_t mask) +{ + set_native_irq_info(irq_to_vector(irq), mask); +} +#endif #endif /* __XEN_IRQ_H__ */ diff -Naurp xen/include/xen/mm.h xen-redhat/include/xen/mm.h --- xen/include/xen/mm.h +++ xen-redhat/include/xen/mm.h @@ -61,6 +61,8 @@ struct page_info *__alloc_domheap_pages( struct domain *d, unsigned int cpu, unsigned int order, unsigned int memflags); void free_domheap_pages(struct page_info *pg, unsigned int order); +unsigned long avail_domheap_pages_region( + unsigned int node, unsigned int min_width, unsigned int max_width); unsigned long avail_domheap_pages(void); #define alloc_domheap_page(d) (alloc_domheap_pages(d,0,0)) #define free_domheap_page(p) (free_domheap_pages(p,0)) @@ -85,19 +87,7 @@ int assign_pages( #define MAX_ORDER 20 /* 2^20 contiguous pages */ #endif -/* Automatic page scrubbing for dead domains. */ -extern struct list_head page_scrub_list; -#define page_scrub_schedule_work() \ - do { \ - if ( !list_empty(&page_scrub_list) ) \ - raise_softirq(PAGE_SCRUB_SOFTIRQ); \ - } while ( 0 ) -#define page_scrub_kick() \ - do { \ - if ( !list_empty(&page_scrub_list) ) \ - cpumask_raise_softirq(cpu_online_map, PAGE_SCRUB_SOFTIRQ); \ - } while ( 0 ) -unsigned long avail_scrub_pages(void); +void scrub_one_page(struct page_info *); #include <asm/mm.h> @@ -106,4 +96,6 @@ int guest_remove_page(struct domain *d, /* Returns TRUE if the memory at address @p is ordinary RAM. 
*/ int memory_is_conventional_ram(paddr_t p); +extern unsigned long *alloc_bitmap; /* for vmcoreinfo */ + #endif /* __XEN_MM_H__ */ diff -Naurp xen/include/xen/paging.h xen-redhat/include/xen/paging.h --- xen/include/xen/paging.h +++ xen-redhat/include/xen/paging.h @@ -18,8 +18,8 @@ #else #define paging_mode_translate(d) (0) -#define guest_physmap_add_page(d, p, m) ((void)0) -#define guest_physmap_remove_page(d, p, m) ((void)0) +#define guest_physmap_add_page(d, p, m, order) (0) +#define guest_physmap_remove_page(d, p, m, order) ((void)0) #endif diff -Naurp xen/include/xen/pci.h xen-redhat/include/xen/pci.h --- xen/include/xen/pci.h +++ xen-redhat/include/xen/pci.h @@ -0,0 +1,93 @@ +/****************************************************************************** + * pci.h + * + * PCI access functions. + */ + +#ifndef __XEN_PCI_H__ +#define __XEN_PCI_H__ + +#include <xen/config.h> +#include <xen/types.h> +#include <xen/list.h> +#include <xen/spinlock.h> + +/* + * The PCI interface treats multi-function devices as independent + * devices. The slot/function address of each device is encoded + * in a single byte as follows: + * + * 15:8 = bus + * 7:3 = slot + * 2:0 = function + */ +#define PCI_BUS(bdf) (((bdf) >> 8) & 0xff) +#define PCI_SLOT(bdf) (((bdf) >> 3) & 0x1f) +#define PCI_FUNC(bdf) ((bdf) & 0x07) +#define PCI_DEVFN(d,f) ((((d) & 0x1f) << 3) | ((f) & 0x07)) +#define PCI_DEVFN2(bdf) ((bdf) & 0xff) +#define PCI_BDF(b,d,f) ((((b) & 0xff) << 8) | PCI_DEVFN(d,f)) +#define PCI_BDF2(b,df) ((((b) & 0xff) << 8) | ((df) & 0xff)) + +#define MAX_MSIX_TABLE_ENTRIES 2048 +#define MAX_MSIX_TABLE_PAGES 8 +struct pci_dev { + struct list_head alldevs_list; + struct list_head domain_list; + + struct list_head msi_list; + int msix_table_refcnt[MAX_MSIX_TABLE_PAGES]; + int msix_table_idx[MAX_MSIX_TABLE_PAGES]; + spinlock_t msix_table_lock; + u64 msix_table; + + struct domain *domain; + const u8 bus; + const u8 devfn; +}; + +#define for_each_pdev(domain, pdev) \ + list_for_each_entry(pdev, &(domain->arch.pdev_list), domain_list) + +/* + * The pcidevs_lock protect alldevs_list, and the assignment for the + * devices, it also sync the access to the msi capability that is not + * interrupt handling related (the mask bit register). 
+ */ + +extern spinlock_t pcidevs_lock; + +struct pci_dev *alloc_pdev(u8 bus, u8 devfn); +void free_pdev(struct pci_dev *pdev); +struct pci_dev *pci_lock_pdev(int bus, int devfn); +struct pci_dev *pci_lock_domain_pdev(struct domain *d, int bus, int devfn); + +void pci_release_devices(struct domain *d); +int pci_add_device(u8 bus, u8 devfn); +int pci_remove_device(u8 bus, u8 devfn); +struct pci_dev *pci_get_pdev(int bus, int devfn); +struct pci_dev *pci_get_pdev_by_domain(struct domain *d, int bus, int devfn); + +uint8_t pci_conf_read8( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg); +uint16_t pci_conf_read16( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg); +uint32_t pci_conf_read32( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg); +void pci_conf_write8( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg, + uint8_t data); +void pci_conf_write16( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg, + uint16_t data); +void pci_conf_write32( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg, + uint32_t data); +int pci_find_cap_offset(u8 bus, u8 dev, u8 func, u8 cap); +int pci_find_next_cap(u8 bus, unsigned int devfn, u8 pos, int cap); + +int pci_set_device_msixtbl(u8 bus, u8 devfn, u64 gtable); +int msixtbl_pt_register(struct domain *d, int pirq); +void msixtbl_pt_unregister(struct domain *d, int pirq); + +#endif /* __XEN_PCI_H__ */ diff -Naurp xen/include/xen/pci_regs.h xen-redhat/include/xen/pci_regs.h --- xen/include/xen/pci_regs.h +++ xen-redhat/include/xen/pci_regs.h @@ -0,0 +1,530 @@ +/* + * pci_regs.h + * + * PCI standard defines + * Copyright 1994, Drew Eckhardt + * Copyright 1997--1999 Martin Mares <mj@ucw.cz> + * + * For more information, please consult the following manuals (look at + * http://www.pcisig.com/ for how to get them): + * + * PCI BIOS Specification + * PCI Local Bus Specification + * PCI to PCI Bridge Specification + * PCI System Design Guide + * + * For hypertransport information, please consult the following manuals + * from http://www.hypertransport.org + * + * The Hypertransport I/O Link Specification + */ + +#ifndef LINUX_PCI_REGS_H +#define LINUX_PCI_REGS_H + +/* + * Under PCI, each device has 256 bytes of configuration address space, + * of which the first 64 bytes are standardized as follows: + */ +#define PCI_VENDOR_ID 0x00 /* 16 bits */ +#define PCI_DEVICE_ID 0x02 /* 16 bits */ +#define PCI_COMMAND 0x04 /* 16 bits */ +#define PCI_COMMAND_IO 0x1 /* Enable response in I/O space */ +#define PCI_COMMAND_MEMORY 0x2 /* Enable response in Memory space */ +#define PCI_COMMAND_MASTER 0x4 /* Enable bus mastering */ +#define PCI_COMMAND_SPECIAL 0x8 /* Enable response to special cycles */ +#define PCI_COMMAND_INVALIDATE 0x10 /* Use memory write and invalidate */ +#define PCI_COMMAND_VGA_PALETTE 0x20 /* Enable palette snooping */ +#define PCI_COMMAND_PARITY 0x40 /* Enable parity checking */ +#define PCI_COMMAND_WAIT 0x80 /* Enable address/data stepping */ +#define PCI_COMMAND_SERR 0x100 /* Enable SERR */ +#define PCI_COMMAND_FAST_BACK 0x200 /* Enable back-to-back writes */ +#define PCI_COMMAND_INTX_DISABLE 0x400 /* INTx Emulation Disable */ + +#define PCI_STATUS 0x06 /* 16 bits */ +#define PCI_STATUS_CAP_LIST 0x10 /* Support Capability List */ +#define PCI_STATUS_66MHZ 0x20 /* Support 66 Mhz PCI 2.1 bus */ +#define PCI_STATUS_UDF 0x40 /* Support User Definable Features [obsolete] */ +#define PCI_STATUS_FAST_BACK 0x80 /* 
Accept fast-back to back */ +#define PCI_STATUS_PARITY 0x100 /* Detected parity error */ +#define PCI_STATUS_DEVSEL_MASK 0x600 /* DEVSEL timing */ +#define PCI_STATUS_DEVSEL_FAST 0x000 +#define PCI_STATUS_DEVSEL_MEDIUM 0x200 +#define PCI_STATUS_DEVSEL_SLOW 0x400 +#define PCI_STATUS_SIG_TARGET_ABORT 0x800 /* Set on target abort */ +#define PCI_STATUS_REC_TARGET_ABORT 0x1000 /* Master ack of " */ +#define PCI_STATUS_REC_MASTER_ABORT 0x2000 /* Set on master abort */ +#define PCI_STATUS_SIG_SYSTEM_ERROR 0x4000 /* Set when we drive SERR */ +#define PCI_STATUS_DETECTED_PARITY 0x8000 /* Set on parity error */ + +#define PCI_CLASS_REVISION 0x08 /* High 24 bits are class, low 8 revision */ +#define PCI_REVISION_ID 0x08 /* Revision ID */ +#define PCI_CLASS_PROG 0x09 /* Reg. Level Programming Interface */ +#define PCI_CLASS_DEVICE 0x0a /* Device class */ + +#define PCI_CACHE_LINE_SIZE 0x0c /* 8 bits */ +#define PCI_LATENCY_TIMER 0x0d /* 8 bits */ +#define PCI_HEADER_TYPE 0x0e /* 8 bits */ +#define PCI_HEADER_TYPE_NORMAL 0 +#define PCI_HEADER_TYPE_BRIDGE 1 +#define PCI_HEADER_TYPE_CARDBUS 2 + +#define PCI_BIST 0x0f /* 8 bits */ +#define PCI_BIST_CODE_MASK 0x0f /* Return result */ +#define PCI_BIST_START 0x40 /* 1 to start BIST, 2 secs or less */ +#define PCI_BIST_CAPABLE 0x80 /* 1 if BIST capable */ + +/* + * Base addresses specify locations in memory or I/O space. + * Decoded size can be determined by writing a value of + * 0xffffffff to the register, and reading it back. Only + * 1 bits are decoded. + */ +#define PCI_BASE_ADDRESS_0 0x10 /* 32 bits */ +#define PCI_BASE_ADDRESS_1 0x14 /* 32 bits [htype 0,1 only] */ +#define PCI_BASE_ADDRESS_2 0x18 /* 32 bits [htype 0 only] */ +#define PCI_BASE_ADDRESS_3 0x1c /* 32 bits */ +#define PCI_BASE_ADDRESS_4 0x20 /* 32 bits */ +#define PCI_BASE_ADDRESS_5 0x24 /* 32 bits */ +#define PCI_BASE_ADDRESS_SPACE 0x01 /* 0 = memory, 1 = I/O */ +#define PCI_BASE_ADDRESS_SPACE_IO 0x01 +#define PCI_BASE_ADDRESS_SPACE_MEMORY 0x00 +#define PCI_BASE_ADDRESS_MEM_TYPE_MASK 0x06 +#define PCI_BASE_ADDRESS_MEM_TYPE_32 0x00 /* 32 bit address */ +#define PCI_BASE_ADDRESS_MEM_TYPE_1M 0x02 /* Below 1M [obsolete] */ +#define PCI_BASE_ADDRESS_MEM_TYPE_64 0x04 /* 64 bit address */ +#define PCI_BASE_ADDRESS_MEM_PREFETCH 0x08 /* prefetchable? 
*/ +#define PCI_BASE_ADDRESS_MEM_MASK (~0x0fUL) +#define PCI_BASE_ADDRESS_IO_MASK (~0x03UL) +/* bit 1 is reserved if address_space = 1 */ + +/* Header type 0 (normal devices) */ +#define PCI_CARDBUS_CIS 0x28 +#define PCI_SUBSYSTEM_VENDOR_ID 0x2c +#define PCI_SUBSYSTEM_ID 0x2e +#define PCI_ROM_ADDRESS 0x30 /* Bits 31..11 are address, 10..1 reserved */ +#define PCI_ROM_ADDRESS_ENABLE 0x01 +#define PCI_ROM_ADDRESS_MASK (~0x7ffUL) + +#define PCI_CAPABILITY_LIST 0x34 /* Offset of first capability list entry */ + +/* 0x35-0x3b are reserved */ +#define PCI_INTERRUPT_LINE 0x3c /* 8 bits */ +#define PCI_INTERRUPT_PIN 0x3d /* 8 bits */ +#define PCI_MIN_GNT 0x3e /* 8 bits */ +#define PCI_MAX_LAT 0x3f /* 8 bits */ + +/* Header type 1 (PCI-to-PCI bridges) */ +#define PCI_PRIMARY_BUS 0x18 /* Primary bus number */ +#define PCI_SECONDARY_BUS 0x19 /* Secondary bus number */ +#define PCI_SUBORDINATE_BUS 0x1a /* Highest bus number behind the bridge */ +#define PCI_SEC_LATENCY_TIMER 0x1b /* Latency timer for secondary interface */ +#define PCI_IO_BASE 0x1c /* I/O range behind the bridge */ +#define PCI_IO_LIMIT 0x1d +#define PCI_IO_RANGE_TYPE_MASK 0x0fUL /* I/O bridging type */ +#define PCI_IO_RANGE_TYPE_16 0x00 +#define PCI_IO_RANGE_TYPE_32 0x01 +#define PCI_IO_RANGE_MASK (~0x0fUL) +#define PCI_SEC_STATUS 0x1e /* Secondary status register, only bit 14 used */ +#define PCI_MEMORY_BASE 0x20 /* Memory range behind */ +#define PCI_MEMORY_LIMIT 0x22 +#define PCI_MEMORY_RANGE_TYPE_MASK 0x0fUL +#define PCI_MEMORY_RANGE_MASK (~0x0fUL) +#define PCI_PREF_MEMORY_BASE 0x24 /* Prefetchable memory range behind */ +#define PCI_PREF_MEMORY_LIMIT 0x26 +#define PCI_PREF_RANGE_TYPE_MASK 0x0fUL +#define PCI_PREF_RANGE_TYPE_32 0x00 +#define PCI_PREF_RANGE_TYPE_64 0x01 +#define PCI_PREF_RANGE_MASK (~0x0fUL) +#define PCI_PREF_BASE_UPPER32 0x28 /* Upper half of prefetchable memory range */ +#define PCI_PREF_LIMIT_UPPER32 0x2c +#define PCI_IO_BASE_UPPER16 0x30 /* Upper half of I/O addresses */ +#define PCI_IO_LIMIT_UPPER16 0x32 +/* 0x34 same as for htype 0 */ +/* 0x35-0x3b is reserved */ +#define PCI_ROM_ADDRESS1 0x38 /* Same as PCI_ROM_ADDRESS, but for htype 1 */ +/* 0x3c-0x3d are same as for htype 0 */ +#define PCI_BRIDGE_CONTROL 0x3e +#define PCI_BRIDGE_CTL_PARITY 0x01 /* Enable parity detection on secondary interface */ +#define PCI_BRIDGE_CTL_SERR 0x02 /* The same for SERR forwarding */ +#define PCI_BRIDGE_CTL_ISA 0x04 /* Enable ISA mode */ +#define PCI_BRIDGE_CTL_VGA 0x08 /* Forward VGA addresses */ +#define PCI_BRIDGE_CTL_MASTER_ABORT 0x20 /* Report master aborts */ +#define PCI_BRIDGE_CTL_BUS_RESET 0x40 /* Secondary bus reset */ +#define PCI_BRIDGE_CTL_FAST_BACK 0x80 /* Fast Back2Back enabled on secondary interface */ + +/* Header type 2 (CardBus bridges) */ +#define PCI_CB_CAPABILITY_LIST 0x14 +/* 0x15 reserved */ +#define PCI_CB_SEC_STATUS 0x16 /* Secondary status */ +#define PCI_CB_PRIMARY_BUS 0x18 /* PCI bus number */ +#define PCI_CB_CARD_BUS 0x19 /* CardBus bus number */ +#define PCI_CB_SUBORDINATE_BUS 0x1a /* Subordinate bus number */ +#define PCI_CB_LATENCY_TIMER 0x1b /* CardBus latency timer */ +#define PCI_CB_MEMORY_BASE_0 0x1c +#define PCI_CB_MEMORY_LIMIT_0 0x20 +#define PCI_CB_MEMORY_BASE_1 0x24 +#define PCI_CB_MEMORY_LIMIT_1 0x28 +#define PCI_CB_IO_BASE_0 0x2c +#define PCI_CB_IO_BASE_0_HI 0x2e +#define PCI_CB_IO_LIMIT_0 0x30 +#define PCI_CB_IO_LIMIT_0_HI 0x32 +#define PCI_CB_IO_BASE_1 0x34 +#define PCI_CB_IO_BASE_1_HI 0x36 +#define PCI_CB_IO_LIMIT_1 0x38 +#define PCI_CB_IO_LIMIT_1_HI 0x3a +#define 
PCI_CB_IO_RANGE_MASK (~0x03UL) +/* 0x3c-0x3d are same as for htype 0 */ +#define PCI_CB_BRIDGE_CONTROL 0x3e +#define PCI_CB_BRIDGE_CTL_PARITY 0x01 /* Similar to standard bridge control register */ +#define PCI_CB_BRIDGE_CTL_SERR 0x02 +#define PCI_CB_BRIDGE_CTL_ISA 0x04 +#define PCI_CB_BRIDGE_CTL_VGA 0x08 +#define PCI_CB_BRIDGE_CTL_MASTER_ABORT 0x20 +#define PCI_CB_BRIDGE_CTL_CB_RESET 0x40 /* CardBus reset */ +#define PCI_CB_BRIDGE_CTL_16BIT_INT 0x80 /* Enable interrupt for 16-bit cards */ +#define PCI_CB_BRIDGE_CTL_PREFETCH_MEM0 0x100 /* Prefetch enable for both memory regions */ +#define PCI_CB_BRIDGE_CTL_PREFETCH_MEM1 0x200 +#define PCI_CB_BRIDGE_CTL_POST_WRITES 0x400 +#define PCI_CB_SUBSYSTEM_VENDOR_ID 0x40 +#define PCI_CB_SUBSYSTEM_ID 0x42 +#define PCI_CB_LEGACY_MODE_BASE 0x44 /* 16-bit PC Card legacy mode base address (ExCa) */ +/* 0x48-0x7f reserved */ + +/* Capability lists */ + +#define PCI_CAP_LIST_ID 0 /* Capability ID */ +#define PCI_CAP_ID_PM 0x01 /* Power Management */ +#define PCI_CAP_ID_AGP 0x02 /* Accelerated Graphics Port */ +#define PCI_CAP_ID_VPD 0x03 /* Vital Product Data */ +#define PCI_CAP_ID_SLOTID 0x04 /* Slot Identification */ +#define PCI_CAP_ID_MSI 0x05 /* Message Signalled Interrupts */ +#define PCI_CAP_ID_CHSWP 0x06 /* CompactPCI HotSwap */ +#define PCI_CAP_ID_PCIX 0x07 /* PCI-X */ +#define PCI_CAP_ID_HT 0x08 /* HyperTransport */ +#define PCI_CAP_ID_VNDR 0x09 /* Vendor specific */ +#define PCI_CAP_ID_DBG 0x0A /* Debug port */ +#define PCI_CAP_ID_CCRC 0x0B /* CompactPCI Central Resource Control */ +#define PCI_CAP_ID_SHPC 0x0C /* PCI Standard Hot-Plug Controller */ +#define PCI_CAP_ID_SSVID 0x0D /* Bridge subsystem vendor/device ID */ +#define PCI_CAP_ID_AGP3 0x0E /* AGP Target PCI-PCI bridge */ +#define PCI_CAP_ID_EXP 0x10 /* PCI Express */ +#define PCI_CAP_ID_MSIX 0x11 /* MSI-X */ +#define PCI_CAP_LIST_NEXT 1 /* Next capability in the list */ +#define PCI_CAP_FLAGS 2 /* Capability defined flags (16 bits) */ +#define PCI_CAP_SIZEOF 4 + +/* Power Management Registers */ + +#define PCI_PM_PMC 2 /* PM Capabilities Register */ +#define PCI_PM_CAP_VER_MASK 0x0007 /* Version */ +#define PCI_PM_CAP_PME_CLOCK 0x0008 /* PME clock required */ +#define PCI_PM_CAP_RESERVED 0x0010 /* Reserved field */ +#define PCI_PM_CAP_DSI 0x0020 /* Device specific initialization */ +#define PCI_PM_CAP_AUX_POWER 0x01C0 /* Auxilliary power support mask */ +#define PCI_PM_CAP_D1 0x0200 /* D1 power state support */ +#define PCI_PM_CAP_D2 0x0400 /* D2 power state support */ +#define PCI_PM_CAP_PME 0x0800 /* PME pin supported */ +#define PCI_PM_CAP_PME_MASK 0xF800 /* PME Mask of all supported states */ +#define PCI_PM_CAP_PME_D0 0x0800 /* PME# from D0 */ +#define PCI_PM_CAP_PME_D1 0x1000 /* PME# from D1 */ +#define PCI_PM_CAP_PME_D2 0x2000 /* PME# from D2 */ +#define PCI_PM_CAP_PME_D3 0x4000 /* PME# from D3 (hot) */ +#define PCI_PM_CAP_PME_D3cold 0x8000 /* PME# from D3 (cold) */ +#define PCI_PM_CTRL 4 /* PM control and status register */ +#define PCI_PM_CTRL_STATE_MASK 0x0003 /* Current power state (D0 to D3) */ +#define PCI_PM_CTRL_NO_SOFT_RESET 0x0008 /* No reset for D3hot->D0 */ +#define PCI_PM_CTRL_PME_ENABLE 0x0100 /* PME pin enable */ +#define PCI_PM_CTRL_DATA_SEL_MASK 0x1e00 /* Data select (??) */ +#define PCI_PM_CTRL_DATA_SCALE_MASK 0x6000 /* Data scale (??) */ +#define PCI_PM_CTRL_PME_STATUS 0x8000 /* PME pin status */ +#define PCI_PM_PPB_EXTENSIONS 6 /* PPB support extensions (??) */ +#define PCI_PM_PPB_B2_B3 0x40 /* Stop clock when in D3hot (??) 
+#define PCI_PM_BPCC_ENABLE 0x80 /* Bus power/clock control enable (??) */
+#define PCI_PM_DATA_REGISTER 7 /* (??) */
+#define PCI_PM_SIZEOF 8
+
+/* AGP registers */
+
+#define PCI_AGP_VERSION 2 /* BCD version number */
+#define PCI_AGP_RFU 3 /* Rest of capability flags */
+#define PCI_AGP_STATUS 4 /* Status register */
+#define PCI_AGP_STATUS_RQ_MASK 0xff000000 /* Maximum number of requests - 1 */
+#define PCI_AGP_STATUS_SBA 0x0200 /* Sideband addressing supported */
+#define PCI_AGP_STATUS_64BIT 0x0020 /* 64-bit addressing supported */
+#define PCI_AGP_STATUS_FW 0x0010 /* FW transfers supported */
+#define PCI_AGP_STATUS_RATE4 0x0004 /* 4x transfer rate supported */
+#define PCI_AGP_STATUS_RATE2 0x0002 /* 2x transfer rate supported */
+#define PCI_AGP_STATUS_RATE1 0x0001 /* 1x transfer rate supported */
+#define PCI_AGP_COMMAND 8 /* Control register */
+#define PCI_AGP_COMMAND_RQ_MASK 0xff000000 /* Master: Maximum number of requests */
+#define PCI_AGP_COMMAND_SBA 0x0200 /* Sideband addressing enabled */
+#define PCI_AGP_COMMAND_AGP 0x0100 /* Allow processing of AGP transactions */
+#define PCI_AGP_COMMAND_64BIT 0x0020 /* Allow processing of 64-bit addresses */
+#define PCI_AGP_COMMAND_FW 0x0010 /* Force FW transfers */
+#define PCI_AGP_COMMAND_RATE4 0x0004 /* Use 4x rate */
+#define PCI_AGP_COMMAND_RATE2 0x0002 /* Use 2x rate */
+#define PCI_AGP_COMMAND_RATE1 0x0001 /* Use 1x rate */
+#define PCI_AGP_SIZEOF 12
+
+/* Vital Product Data */
+
+#define PCI_VPD_ADDR 2 /* Address to access (15 bits!) */
+#define PCI_VPD_ADDR_MASK 0x7fff /* Address mask */
+#define PCI_VPD_ADDR_F 0x8000 /* Write 0, 1 indicates completion */
+#define PCI_VPD_DATA 4 /* 32-bits of data returned here */
+
+/* Slot Identification */
+
+#define PCI_SID_ESR 2 /* Expansion Slot Register */
+#define PCI_SID_ESR_NSLOTS 0x1f /* Number of expansion slots available */
+#define PCI_SID_ESR_FIC 0x20 /* First In Chassis Flag */
+#define PCI_SID_CHASSIS_NR 3 /* Chassis Number */
+
+/* Message Signalled Interrupts registers */
+
+#define PCI_MSI_FLAGS 2 /* Various flags */
+#define PCI_MSI_FLAGS_64BIT 0x80 /* 64-bit addresses allowed */
+#define PCI_MSI_FLAGS_QSIZE 0x70 /* Message queue size configured */
+#define PCI_MSI_FLAGS_QMASK 0x0e /* Maximum queue size available */
+#define PCI_MSI_FLAGS_ENABLE 0x01 /* MSI feature enabled */
+#define PCI_MSI_FLAGS_MASKBIT 0x100 /* 64-bit mask bits allowed */
+#define PCI_MSI_RFU 3 /* Rest of capability flags */
+#define PCI_MSI_ADDRESS_LO 4 /* Lower 32 bits */
+#define PCI_MSI_ADDRESS_HI 8 /* Upper 32 bits (if PCI_MSI_FLAGS_64BIT set) */
+#define PCI_MSI_DATA_32 8 /* 16 bits of data for 32-bit devices */
+#define PCI_MSI_DATA_64 12 /* 16 bits of data for 64-bit devices */
+#define PCI_MSI_MASK_BIT 16 /* Mask bits register */
+
+/* MSI-X registers (these are at offset PCI_MSIX_FLAGS) */
+#define PCI_MSIX_FLAGS 2
+#define PCI_MSIX_FLAGS_QSIZE 0x7FF
+#define PCI_MSIX_FLAGS_ENABLE (1 << 15)
+#define PCI_MSIX_FLAGS_MASKALL (1 << 14)
+#define PCI_MSIX_FLAGS_BIRMASK (7 << 0)
+#define PCI_MSIX_FLAGS_BITMASK (1 << 0)
+
+/* CompactPCI Hotswap Register */
+
+#define PCI_CHSWP_CSR 2 /* Control and Status Register */
+#define PCI_CHSWP_DHA 0x01 /* Device Hiding Arm */
+#define PCI_CHSWP_EIM 0x02 /* ENUM# Signal Mask */
+#define PCI_CHSWP_PIE 0x04 /* Pending Insert or Extract */
+#define PCI_CHSWP_LOO 0x08 /* LED On / Off */
+#define PCI_CHSWP_PI 0x30 /* Programming Interface */
+#define PCI_CHSWP_EXT 0x40 /* ENUM# status - extraction */
+#define PCI_CHSWP_INS 0x80 /* ENUM# status - insertion */
+
+/* PCI-X registers */
+
+#define PCI_X_CMD 2 /* Modes & Features */
+#define PCI_X_CMD_DPERR_E 0x0001 /* Data Parity Error Recovery Enable */
+#define PCI_X_CMD_ERO 0x0002 /* Enable Relaxed Ordering */
+#define PCI_X_CMD_READ_512 0x0000 /* 512 byte maximum read byte count */
+#define PCI_X_CMD_READ_1K 0x0004 /* 1Kbyte maximum read byte count */
+#define PCI_X_CMD_READ_2K 0x0008 /* 2Kbyte maximum read byte count */
+#define PCI_X_CMD_READ_4K 0x000c /* 4Kbyte maximum read byte count */
+#define PCI_X_CMD_MAX_READ 0x000c /* Max Memory Read Byte Count */
+ /* Max # of outstanding split transactions */
+#define PCI_X_CMD_SPLIT_1 0x0000 /* Max 1 */
+#define PCI_X_CMD_SPLIT_2 0x0010 /* Max 2 */
+#define PCI_X_CMD_SPLIT_3 0x0020 /* Max 3 */
+#define PCI_X_CMD_SPLIT_4 0x0030 /* Max 4 */
+#define PCI_X_CMD_SPLIT_8 0x0040 /* Max 8 */
+#define PCI_X_CMD_SPLIT_12 0x0050 /* Max 12 */
+#define PCI_X_CMD_SPLIT_16 0x0060 /* Max 16 */
+#define PCI_X_CMD_SPLIT_32 0x0070 /* Max 32 */
+#define PCI_X_CMD_MAX_SPLIT 0x0070 /* Max Outstanding Split Transactions */
+#define PCI_X_CMD_VERSION(x) (((x) >> 12) & 3) /* Version */
+#define PCI_X_STATUS 4 /* PCI-X capabilities */
+#define PCI_X_STATUS_DEVFN 0x000000ff /* A copy of devfn */
+#define PCI_X_STATUS_BUS 0x0000ff00 /* A copy of bus nr */
+#define PCI_X_STATUS_64BIT 0x00010000 /* 64-bit device */
+#define PCI_X_STATUS_133MHZ 0x00020000 /* 133 MHz capable */
+#define PCI_X_STATUS_SPL_DISC 0x00040000 /* Split Completion Discarded */
+#define PCI_X_STATUS_UNX_SPL 0x00080000 /* Unexpected Split Completion */
+#define PCI_X_STATUS_COMPLEX 0x00100000 /* Device Complexity */
+#define PCI_X_STATUS_MAX_READ 0x00600000 /* Designed Max Memory Read Count */
+#define PCI_X_STATUS_MAX_SPLIT 0x03800000 /* Designed Max Outstanding Split Transactions */
+#define PCI_X_STATUS_MAX_CUM 0x1c000000 /* Designed Max Cumulative Read Size */
+#define PCI_X_STATUS_SPL_ERR 0x20000000 /* Rcvd Split Completion Error Msg */
+#define PCI_X_STATUS_266MHZ 0x40000000 /* 266 MHz capable */
+#define PCI_X_STATUS_533MHZ 0x80000000 /* 533 MHz capable */
+
+/* PCI Express capability registers */
+
+#define PCI_EXP_FLAGS 2 /* Capabilities register */
+#define PCI_EXP_FLAGS_VERS 0x000f /* Capability version */
+#define PCI_EXP_FLAGS_TYPE 0x00f0 /* Device/Port type */
+#define PCI_EXP_TYPE_ENDPOINT 0x0 /* Express Endpoint */
+#define PCI_EXP_TYPE_LEG_END 0x1 /* Legacy Endpoint */
+#define PCI_EXP_TYPE_ROOT_PORT 0x4 /* Root Port */
+#define PCI_EXP_TYPE_UPSTREAM 0x5 /* Upstream Port */
+#define PCI_EXP_TYPE_DOWNSTREAM 0x6 /* Downstream Port */
+#define PCI_EXP_TYPE_PCI_BRIDGE 0x7 /* PCI/PCI-X Bridge */
+#define PCI_EXP_FLAGS_SLOT 0x0100 /* Slot implemented */
+#define PCI_EXP_FLAGS_IRQ 0x3e00 /* Interrupt message number */
+#define PCI_EXP_DEVCAP 4 /* Device capabilities */
+#define PCI_EXP_DEVCAP_PAYLOAD 0x07 /* Max_Payload_Size */
+#define PCI_EXP_DEVCAP_PHANTOM 0x18 /* Phantom functions */
+#define PCI_EXP_DEVCAP_EXT_TAG 0x20 /* Extended tags */
+#define PCI_EXP_DEVCAP_L0S 0x1c0 /* L0s Acceptable Latency */
+#define PCI_EXP_DEVCAP_L1 0xe00 /* L1 Acceptable Latency */
+#define PCI_EXP_DEVCAP_ATN_BUT 0x1000 /* Attention Button Present */
+#define PCI_EXP_DEVCAP_ATN_IND 0x2000 /* Attention Indicator Present */
+#define PCI_EXP_DEVCAP_PWR_IND 0x4000 /* Power Indicator Present */
+#define PCI_EXP_DEVCAP_PWR_VAL 0x3fc0000 /* Slot Power Limit Value */
+#define PCI_EXP_DEVCAP_PWR_SCL 0xc000000 /* Slot Power Limit Scale */
+#define PCI_EXP_DEVCTL 8 /* Device Control */
+#define PCI_EXP_DEVCTL_CERE 0x0001 /* Correctable Error Reporting En. */
+#define PCI_EXP_DEVCTL_NFERE 0x0002 /* Non-Fatal Error Reporting Enable */
+#define PCI_EXP_DEVCTL_FERE 0x0004 /* Fatal Error Reporting Enable */
+#define PCI_EXP_DEVCTL_URRE 0x0008 /* Unsupported Request Reporting En. */
+#define PCI_EXP_DEVCTL_RELAX_EN 0x0010 /* Enable relaxed ordering */
+#define PCI_EXP_DEVCTL_PAYLOAD 0x00e0 /* Max_Payload_Size */
+#define PCI_EXP_DEVCTL_EXT_TAG 0x0100 /* Extended Tag Field Enable */
+#define PCI_EXP_DEVCTL_PHANTOM 0x0200 /* Phantom Functions Enable */
+#define PCI_EXP_DEVCTL_AUX_PME 0x0400 /* Auxiliary Power PM Enable */
+#define PCI_EXP_DEVCTL_NOSNOOP_EN 0x0800 /* Enable No Snoop */
+#define PCI_EXP_DEVCTL_READRQ 0x7000 /* Max_Read_Request_Size */
+#define PCI_EXP_DEVSTA 10 /* Device Status */
+#define PCI_EXP_DEVSTA_CED 0x01 /* Correctable Error Detected */
+#define PCI_EXP_DEVSTA_NFED 0x02 /* Non-Fatal Error Detected */
+#define PCI_EXP_DEVSTA_FED 0x04 /* Fatal Error Detected */
+#define PCI_EXP_DEVSTA_URD 0x08 /* Unsupported Request Detected */
+#define PCI_EXP_DEVSTA_AUXPD 0x10 /* AUX Power Detected */
+#define PCI_EXP_DEVSTA_TRPND 0x20 /* Transactions Pending */
+#define PCI_EXP_LNKCAP 12 /* Link Capabilities */
+#define PCI_EXP_LNKCTL 16 /* Link Control */
+#define PCI_EXP_LNKCTL_CLKREQ_EN 0x100 /* Enable clkreq */
+#define PCI_EXP_LNKSTA 18 /* Link Status */
+#define PCI_EXP_SLTCAP 20 /* Slot Capabilities */
+#define PCI_EXP_SLTCTL 24 /* Slot Control */
+#define PCI_EXP_SLTSTA 26 /* Slot Status */
+#define PCI_EXP_RTCTL 28 /* Root Control */
+#define PCI_EXP_RTCTL_SECEE 0x01 /* System Error on Correctable Error */
+#define PCI_EXP_RTCTL_SENFEE 0x02 /* System Error on Non-Fatal Error */
+#define PCI_EXP_RTCTL_SEFEE 0x04 /* System Error on Fatal Error */
+#define PCI_EXP_RTCTL_PMEIE 0x08 /* PME Interrupt Enable */
+#define PCI_EXP_RTCTL_CRSSVE 0x10 /* CRS Software Visibility Enable */
+#define PCI_EXP_RTCAP 30 /* Root Capabilities */
+#define PCI_EXP_RTSTA 32 /* Root Status */
+
+/* Extended Capabilities (PCI-X 2.0 and Express) */
+#define PCI_EXT_CAP_ID(header) (header & 0x0000ffff)
+#define PCI_EXT_CAP_VER(header) ((header >> 16) & 0xf)
+#define PCI_EXT_CAP_NEXT(header) ((header >> 20) & 0xffc)
+
+#define PCI_EXT_CAP_ID_ERR 1
+#define PCI_EXT_CAP_ID_VC 2
+#define PCI_EXT_CAP_ID_DSN 3
+#define PCI_EXT_CAP_ID_PWR 4
+
+/* Advanced Error Reporting */
+#define PCI_ERR_UNCOR_STATUS 4 /* Uncorrectable Error Status */
+#define PCI_ERR_UNC_TRAIN 0x00000001 /* Training */
+#define PCI_ERR_UNC_DLP 0x00000010 /* Data Link Protocol */
+#define PCI_ERR_UNC_POISON_TLP 0x00001000 /* Poisoned TLP */
+#define PCI_ERR_UNC_FCP 0x00002000 /* Flow Control Protocol */
+#define PCI_ERR_UNC_COMP_TIME 0x00004000 /* Completion Timeout */
+#define PCI_ERR_UNC_COMP_ABORT 0x00008000 /* Completer Abort */
+#define PCI_ERR_UNC_UNX_COMP 0x00010000 /* Unexpected Completion */
+#define PCI_ERR_UNC_RX_OVER 0x00020000 /* Receiver Overflow */
+#define PCI_ERR_UNC_MALF_TLP 0x00040000 /* Malformed TLP */
+#define PCI_ERR_UNC_ECRC 0x00080000 /* ECRC Error Status */
+#define PCI_ERR_UNC_UNSUP 0x00100000 /* Unsupported Request */
+#define PCI_ERR_UNCOR_MASK 8 /* Uncorrectable Error Mask */
+ /* Same bits as above */
+#define PCI_ERR_UNCOR_SEVER 12 /* Uncorrectable Error Severity */
+ /* Same bits as above */
+#define PCI_ERR_COR_STATUS 16 /* Correctable Error Status */
+#define PCI_ERR_COR_RCVR 0x00000001 /* Receiver Error Status */
+#define PCI_ERR_COR_BAD_TLP 0x00000040 /* Bad TLP Status */
+#define PCI_ERR_COR_BAD_DLLP 0x00000080 /* Bad DLLP Status */
+#define PCI_ERR_COR_REP_ROLL 0x00000100 /* REPLAY_NUM Rollover */
+#define PCI_ERR_COR_REP_TIMER 0x00001000 /* Replay Timer Timeout */
+#define PCI_ERR_COR_MASK 20 /* Correctable Error Mask */
+ /* Same bits as above */
+#define PCI_ERR_CAP 24 /* Advanced Error Capabilities */
+#define PCI_ERR_CAP_FEP(x) ((x) & 31) /* First Error Pointer */
+#define PCI_ERR_CAP_ECRC_GENC 0x00000020 /* ECRC Generation Capable */
+#define PCI_ERR_CAP_ECRC_GENE 0x00000040 /* ECRC Generation Enable */
+#define PCI_ERR_CAP_ECRC_CHKC 0x00000080 /* ECRC Check Capable */
+#define PCI_ERR_CAP_ECRC_CHKE 0x00000100 /* ECRC Check Enable */
+#define PCI_ERR_HEADER_LOG 28 /* Header Log Register (16 bytes) */
+#define PCI_ERR_ROOT_COMMAND 44 /* Root Error Command */
+/* Correctable Err Reporting Enable */
+#define PCI_ERR_ROOT_CMD_COR_EN 0x00000001
+/* Non-fatal Err Reporting Enable */
+#define PCI_ERR_ROOT_CMD_NONFATAL_EN 0x00000002
+/* Fatal Err Reporting Enable */
+#define PCI_ERR_ROOT_CMD_FATAL_EN 0x00000004
+#define PCI_ERR_ROOT_STATUS 48
+#define PCI_ERR_ROOT_COR_RCV 0x00000001 /* ERR_COR Received */
+/* Multi ERR_COR Received */
+#define PCI_ERR_ROOT_MULTI_COR_RCV 0x00000002
+/* ERR_FATAL/NONFATAL Received */
+#define PCI_ERR_ROOT_UNCOR_RCV 0x00000004
+/* Multi ERR_FATAL/NONFATAL Received */
+#define PCI_ERR_ROOT_MULTI_UNCOR_RCV 0x00000008
+#define PCI_ERR_ROOT_FIRST_FATAL 0x00000010 /* First Fatal */
+#define PCI_ERR_ROOT_NONFATAL_RCV 0x00000020 /* Non-Fatal Received */
+#define PCI_ERR_ROOT_FATAL_RCV 0x00000040 /* Fatal Received */
+#define PCI_ERR_ROOT_COR_SRC 52
+#define PCI_ERR_ROOT_SRC 54
+
+/* Virtual Channel */
+#define PCI_VC_PORT_REG1 4
+#define PCI_VC_PORT_REG2 8
+#define PCI_VC_PORT_CTRL 12
+#define PCI_VC_PORT_STATUS 14
+#define PCI_VC_RES_CAP 16
+#define PCI_VC_RES_CTRL 20
+#define PCI_VC_RES_STATUS 26
+
+/* Power Budgeting */
+#define PCI_PWR_DSR 4 /* Data Select Register */
+#define PCI_PWR_DATA 8 /* Data Register */
+#define PCI_PWR_DATA_BASE(x) ((x) & 0xff) /* Base Power */
+#define PCI_PWR_DATA_SCALE(x) (((x) >> 8) & 3) /* Data Scale */
+#define PCI_PWR_DATA_PM_SUB(x) (((x) >> 10) & 7) /* PM Sub State */
+#define PCI_PWR_DATA_PM_STATE(x) (((x) >> 13) & 3) /* PM State */
+#define PCI_PWR_DATA_TYPE(x) (((x) >> 15) & 7) /* Type */
+#define PCI_PWR_DATA_RAIL(x) (((x) >> 18) & 7) /* Power Rail */
+#define PCI_PWR_CAP 12 /* Capability */
+#define PCI_PWR_CAP_BUDGET(x) ((x) & 1) /* Included in system budget */
+
+/*
+ * Hypertransport sub capability types
+ *
+ * Unfortunately there are both 3 bit and 5 bit capability types defined
+ * in the HT spec, catering for that is a little messy. You probably don't
+ * want to use these directly, just use pci_find_ht_capability() and it
+ * will do the right thing for you.
+ */
+#define HT_3BIT_CAP_MASK 0xE0
+#define HT_CAPTYPE_SLAVE 0x00 /* Slave/Primary link configuration */
+#define HT_CAPTYPE_HOST 0x20 /* Host/Secondary link configuration */
+
+#define HT_5BIT_CAP_MASK 0xF8
+#define HT_CAPTYPE_IRQ 0x80 /* IRQ Configuration */
+#define HT_CAPTYPE_REMAPPING_40 0xA0 /* 40 bit address remapping */
+#define HT_CAPTYPE_REMAPPING_64 0xA2 /* 64 bit address remapping */
+#define HT_CAPTYPE_UNITID_CLUMP 0x90 /* Unit ID clumping */
+#define HT_CAPTYPE_EXTCONF 0x98 /* Extended Configuration Space Access */
+#define HT_CAPTYPE_MSI_MAPPING 0xA8 /* MSI Mapping Capability */
+#define HT_MSI_FLAGS 0x02 /* Offset to flags */
+#define HT_MSI_FLAGS_ENABLE 0x1 /* Mapping enable */
+#define HT_MSI_FLAGS_FIXED 0x2 /* Fixed mapping only */
+#define HT_MSI_FIXED_ADDR 0x00000000FEE00000ULL /* Fixed addr */
+#define HT_MSI_ADDR_LO 0x04 /* Offset to low addr bits */
+#define HT_MSI_ADDR_LO_MASK 0xFFF00000 /* Low address bit mask */
+#define HT_MSI_ADDR_HI 0x08 /* Offset to high addr bits */
+#define HT_CAPTYPE_DIRECT_ROUTE 0xB0 /* Direct routing configuration */
+#define HT_CAPTYPE_VCSET 0xB8 /* Virtual Channel configuration */
+#define HT_CAPTYPE_ERROR_RETRY 0xC0 /* Retry on error configuration */
+#define HT_CAPTYPE_GEN3 0xD0 /* Generation 3 hypertransport configuration */
+#define HT_CAPTYPE_PM 0xE0 /* Hypertransport power management configuration */
+
+
+#endif /* LINUX_PCI_REGS_H */
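The HyperTransport comment above points callers at pci_find_ht_capability() because the capability type field is only 3 bits wide for the slave/host types and 5 bits wide for everything else. A minimal sketch of that dispatch, assuming the caller has already read the capability's type byte into cap_type (the helper name is illustrative and not part of this patch):

    /* Illustrative only: classify an HT capability using the two masks
     * defined above.  'cap_type' is assumed to hold the type byte of a
     * HyperTransport capability already read from config space. */
    static int ht_cap_matches(unsigned char cap_type, unsigned char ht_cap)
    {
        if (ht_cap == HT_CAPTYPE_SLAVE || ht_cap == HT_CAPTYPE_HOST)
            return (cap_type & HT_3BIT_CAP_MASK) == ht_cap;  /* 3-bit types */
        return (cap_type & HT_5BIT_CAP_MASK) == ht_cap;      /* 5-bit types */
    }
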
diff -Naurp xen/include/xen/sched.h xen-redhat/include/xen/sched.h
--- xen/include/xen/sched.h
+++ xen-redhat/include/xen/sched.h
@@ -31,12 +31,11 @@ extern unsigned long volatile jiffies;
 extern struct domain *dom0;
 
 #ifndef CONFIG_COMPAT
-#define MAX_EVTCHNS(d) NR_EVENT_CHANNELS
+#define BITS_PER_EVTCHN_WORD(d) BITS_PER_LONG
 #else
-#define MAX_EVTCHNS(d) (!IS_COMPAT(d) ? \
-                        NR_EVENT_CHANNELS : \
-                        sizeof(unsigned int) * sizeof(unsigned int) * 64)
+#define BITS_PER_EVTCHN_WORD(d) (has_32bit_shinfo(d) ? 32 : BITS_PER_LONG)
 #endif
+#define MAX_EVTCHNS(d) (BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d))
 #define EVTCHNS_PER_BUCKET 128
 #define NR_EVTCHN_BUCKETS (NR_EVENT_CHANNELS / EVTCHNS_PER_BUCKET)
@@ -119,6 +118,8 @@ struct vcpu
     bool_t defer_shutdown;
     /* VCPU is paused following shutdown request (d->is_shutting_down)? */
     bool_t paused_for_shutdown;
+    /* VCPU affinity is temporarily locked from controller changes? */
+    bool_t affinity_locked;
 
     unsigned long pause_flags;
     atomic_t pause_count;
@@ -166,7 +167,7 @@ struct domain
     /* Event channel information. */
     struct evtchn *evtchn[NR_EVTCHN_BUCKETS];
-    spinlock_t evtchn_lock;
+    spinlock_t event_lock;
 
     struct grant_table *grant_table;
@@ -191,9 +192,11 @@ struct domain
     /* Are any VCPUs polling event channels (SCHEDOP_poll)? */
     bool_t is_polling;
     /* Is this guest dying (i.e., a zombie)? */
-    bool_t is_dying;
+    enum { DOMDYING_alive, DOMDYING_dying, DOMDYING_dead } is_dying;
     /* Domain is paused by controller software? */
     bool_t is_paused_by_controller;
+    /* Domain's VCPUs are pinned 1:1 to physical CPUs? */
+    bool_t is_pinned;
     /* Guest has shut down (inc. reason code)? */
     spinlock_t shutdown_lock;
@@ -224,6 +227,9 @@ struct domain
     int32_t time_offset_seconds;
 
     struct rcu_head rcu;
+    /* HV */
+    atomic_t hard_virt;
+
 };
 
 struct domain_setup_info
@@ -335,7 +341,7 @@ static inline struct domain *rcu_lock_cu
 struct domain *get_domain_by_id(domid_t dom);
 void domain_destroy(struct domain *d);
-void domain_kill(struct domain *d);
+int domain_kill(struct domain *d);
 void domain_shutdown(struct domain *d, u8 reason);
 void domain_resume(struct domain *d);
 void domain_pause_for_debugger(void);
@@ -476,6 +482,8 @@ void cpu_init(void);
 void vcpu_force_reschedule(struct vcpu *v);
 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity);
+int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity);
+void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity);
 
 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate);
@@ -487,15 +495,15 @@ static inline void vcpu_unblock(struct v
 #define IS_PRIV(_d) ((_d)->is_privileged)
 
-#ifndef IS_COMPAT
-#define IS_COMPAT(d) 0
-#endif
-
 #define VM_ASSIST(_d,_t) (test_bit((_t), &(_d)->vm_assist))
 
 #define is_hvm_domain(d) ((d)->is_hvm)
 #define is_hvm_vcpu(v) (is_hvm_domain(v->domain))
 
+extern enum cpufreq_controller {
+    FREQCTL_none, FREQCTL_dom0_kernel
+} cpufreq_controller;
+
 #endif /* __SCHED_H__ */
 
 /*
diff -Naurp xen/include/xen/serial.h xen-redhat/include/xen/serial.h
--- xen/include/xen/serial.h
+++ xen-redhat/include/xen/serial.h
@@ -3,7 +3,7 @@
  *
  * Framework for serial device drivers.
  *
- * Copyright (c) 2003-2005, K A Fraser
+ * Copyright (c) 2003-2008, K A Fraser
  */
 
 #ifndef __XEN_SERIAL_H__
@@ -34,6 +34,7 @@ struct serial_port {
     /* Transmit data buffer (interrupt-driven uart). */
     char *txbuf;
     unsigned int txbufp, txbufc;
+    bool_t tx_quench;
     /* Force synchronous transmit. */
     int sync;
     /* Receiver callback functions (asynchronous receivers). */
diff -Naurp xen/include/xen/time.h xen-redhat/include/xen/time.h
--- xen/include/xen/time.h
+++ xen-redhat/include/xen/time.h
@@ -63,6 +63,7 @@ struct tm {
 };
 struct tm gmtime(unsigned long t);
 
+#define SYSTEM_TIME_HZ 1000000000ULL
 #define NOW() ((s_time_t)get_s_time())
 #define SECONDS(_s) ((s_time_t)((_s) * 1000000000ULL))
 #define MILLISECS(_ms) ((s_time_t)((_ms) * 1000000ULL))
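The reworked event-channel sizing in the sched.h hunk above makes MAX_EVTCHNS the square of the per-guest word width, since the two-level pending-bit scheme keeps one selector bit per word of pending bits. A worked example of the arithmetic (illustrative only; the helper below is hypothetical and not part of the patch):

    /* Illustrative only: MAX_EVTCHNS(d) = BITS_PER_EVTCHN_WORD(d)^2.
     * Native 64-bit guest:                 64 * 64 = 4096 event channels.
     * 32-bit guest with 32-bit shared info: 32 * 32 = 1024 event channels. */
    static unsigned int max_evtchns_for(unsigned int bits_per_evtchn_word)
    {
        return bits_per_evtchn_word * bits_per_evtchn_word;
    }
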