From: ddugger@redhat.com <ddugger@redhat.com> Date: Tue, 21 Apr 2009 13:01:22 -0600 Subject: [xen] VT-d: enhance MTRR/PAT virtualization Message-id: 200904211901.n3LJ1Mcx028378@sobek.n0ano.com O-Subject: [RHEL5.4 PATCH 1/6 V2] BZ496873: VT-d: Enhance MTRR/PAT virtualization Bugzilla: 496873 RH-Acked-by: Rik van Riel <riel@redhat.com> RH-Acked-by: Rik van Riel <riel@redhat.com> RH-Acked-by: Gerd Hoffmann <kraxel@redhat.com> RH-Acked-by: Chris Lalancette <clalance@redhat.com> RH-Acked-by: Gerd Hoffmann <kraxel@redhat.com> RH-Acked-by: Justin M. Forbes <jforbes@redhat.com> RH-Acked-by: Justin M. Forbes <jforbes@redhat.com> VT-d: Enhance MTRR/PAT virtualization when EPT and VT-d are both enabled, and utilise the snoop control capability of the VT-d engine; also some cleanup in the VT-d and EPT code. Differences from Xen-unstable: 1) epte_get_entry_emt() is not in the xen/arch/x86/hvm/mtrr.c file, since that file doesn't exist in xen-3.1, so it is placed in p2m-ept.c instead. 2) some of the checks in epte_get_entry_emt() have been removed, since the conditions they test for don't exist in xen-3.1. Upstream Status: Accepted (CS 19079, 19154, 19165, 19198) Signed-off-by: Xiaohui Xin <xiaohui.xin@intel.com> Signed-off-by: Gerd Hoffman <kraxel@redhat.com> Signed-off-by: Don Dugger <donald.d.dugger@intel.com> Yet Another Resend - resolve the conflicting BZs between the Subject line (correct) and the message body (incorrect). 
diff --git a/arch/x86/mm/p2m-ept.c b/arch/x86/mm/p2m-ept.c index e5420f5..4fb80e3 100644 --- a/arch/x86/mm/p2m-ept.c +++ b/arch/x86/mm/p2m-ept.c @@ -24,6 +24,7 @@ #include <asm/domain.h> #include <asm/hvm/vmx/vmx.h> #include <xen/iocap.h> +#include <asm/mtrr.h> #if 1 /* XEN_VERSION == 3 && XEN_SUBVERSION < 2 */ @@ -72,6 +73,38 @@ static mfn_t compat_ept_get_entry_fast(unsigned long gfn) #endif +uint8_t epte_get_entry_emt( + struct domain *d, unsigned long gfn, + unsigned long mfn, uint8_t *igmt, int direct_mmio) +{ + struct vcpu *v = current; + + *igmt = 0; + + if ( (current->domain != d) && ((v = d->vcpu[0]) == NULL) ) + return MTRR_TYPE_WRBACK; + + if ( !mfn_valid(mfn) ) + return MTRR_TYPE_UNCACHABLE; + + if ( !iommu_enabled ) + { + *igmt = 1; + return MTRR_TYPE_WRBACK; + } + + if ( direct_mmio ) + return MTRR_TYPE_UNCACHABLE; + + if ( iommu_snoop ) + { + *igmt = 1; + return MTRR_TYPE_WRBACK; + } + + return MTRR_TYPE_WRBACK; +} + static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type) { switch(type) @@ -81,7 +114,6 @@ static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type) return; case p2m_mmio_direct: entry->r = entry->w = entry->x = 1; - entry->emt = 0x8; return; case p2m_ram_logdirty: case p2m_ram_ro: @@ -112,6 +144,7 @@ static int ept_set_middle_entry(struct domain *d, ept_entry_t *ept_entry) list_add_tail(&pg->list, &d->arch.p2m.pages); ept_entry->emt = 0; + ept_entry->igmt = 0; ept_entry->sp_avail = 0; ept_entry->avail1 = 0; ept_entry->mfn = page_to_mfn(pg); @@ -170,6 +203,9 @@ ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, u32 index; int i, rv = 0, ret = 0; int walk_level = order / EPT_TABLE_ORDER; + int direct_mmio = (p2mt == p2m_mmio_direct); + uint8_t igmt = 0; + int need_modify_vtd_table = 1; /* We only support 4k and 2m pages now */ @@ -203,26 +239,31 @@ ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, { if ( mfn_valid(mfn_x(mfn)) || (p2mt == p2m_mmio_direct) ) { - /* Track the highest gfn for 
which we have ever had a valid mapping */ - if ( gfn > d->arch.p2m.max_mapped_pfn ) - d->arch.p2m.max_mapped_pfn = gfn; - - if ( p2mt == p2m_mmio_direct ) - ept_entry->emt = 0x8; - else - ept_entry->emt = EPT_DEFAULT_MT; + ept_entry->emt = epte_get_entry_emt(d, gfn, mfn_x(mfn), + &igmt, direct_mmio); + ept_entry->igmt = igmt; ept_entry->sp_avail = walk_level ? 1 : 0; if ( ret == GUEST_TABLE_SUPER_PAGE ) { - ept_entry->mfn = mfn_x(mfn) - offset; + if ( ept_entry->mfn == (mfn_x(mfn) - offset) ) + need_modify_vtd_table = 0; + else + ept_entry->mfn = mfn_x(mfn) - offset; + if ( ept_entry->avail1 == p2m_ram_logdirty && p2mt == p2m_ram_rw ) for ( i = 0; i < (1UL << order); i++ ) paging_mark_dirty(d, mfn_x(mfn)-offset+i); } else - ept_entry->mfn = mfn_x(mfn); + { + if ( ept_entry->mfn == mfn_x(mfn) ) + need_modify_vtd_table = 0; + else + ept_entry->mfn = mfn_x(mfn); + } + ept_entry->avail1 = p2mt; ept_entry->rsvd = 0; @@ -260,10 +301,11 @@ ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, for ( i = 0; i < 512; i++ ) { split_ept_entry = split_table + i; - if ( p2mt == p2m_mmio_direct ) - split_ept_entry->emt = 0x8; - else - split_ept_entry->emt = EPT_DEFAULT_MT; + split_ept_entry->emt = epte_get_entry_emt(d, + gfn-offset+i, split_mfn+i, + &igmt, direct_mmio); + split_ept_entry->igmt = igmt; + split_ept_entry->sp_avail = 0; split_ept_entry->mfn = split_mfn+i; @@ -278,17 +320,25 @@ ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, /* Set the destinated 4k page as normal */ split_ept_entry = split_table + offset; - if ( p2mt == p2m_mmio_direct ) - split_ept_entry->emt = 0x8; + split_ept_entry->emt = epte_get_entry_emt(d, gfn, mfn_x(mfn), + &igmt, direct_mmio); + split_ept_entry->igmt = igmt; + if ( split_ept_entry->mfn == mfn_x(mfn) ) + need_modify_vtd_table = 0; else - split_ept_entry->emt = EPT_DEFAULT_MT; - split_ept_entry->mfn = mfn_x(mfn); + split_ept_entry->mfn = mfn_x(mfn); + split_ept_entry->avail1 = p2mt; ept_p2m_type_to_flags(split_ept_entry, 
p2mt); unmap_domain_page(split_table); } + /* Track the highest gfn for which we have ever had a valid mapping */ + if ( mfn_valid(mfn_x(mfn)) + && (gfn + (1UL << order) - 1 > d->arch.p2m.max_mapped_pfn) ) + d->arch.p2m.max_mapped_pfn = gfn + (1UL << order) - 1; + /* Success */ rv = 1; @@ -297,7 +347,8 @@ ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, ept_sync_domain(d); /* support pci pass-through */ - if ( iommu_enabled ) + if ( iommu_enabled && is_hvm_domain(d) + && need_modify_vtd_table) { if ( p2mt == p2m_ram_rw ) { diff --git a/drivers/passthrough/iommu.c b/drivers/passthrough/iommu.c index 2661d2b..0cd3aa9 100644 --- a/drivers/passthrough/iommu.c +++ b/drivers/passthrough/iommu.c @@ -29,16 +29,20 @@ int intel_vtd_setup(void); * off|no|false|disable Disable IOMMU (default) * force|required Don't boot unless IOMMU is enabled * passthrough Bypass VT-d translation for Dom0 + * snoop Utilize the snoop control for IOMMU (default) + * no-snoop Dont utilize the snoop control for IOMMU */ custom_param("iommu", parse_iommu_param); int iommu_enabled = 0; int force_iommu = 0; int iommu_passthrough = 0; +int iommu_snoop = 0; static void __init parse_iommu_param(char *s) { char *ss; iommu_enabled = 1; + iommu_snoop = 1; do { ss = strchr(s, ','); @@ -52,6 +56,10 @@ static void __init parse_iommu_param(char *s) force_iommu = 1; else if ( !strcmp(s, "passthrough") ) iommu_passthrough = 1; + else if ( !strcmp(s, "snoop") ) + iommu_snoop = 1; + else if ( !strcmp(s, "no-snoop") ) + iommu_snoop = 0; s = ss + 1; } while ( ss ); diff --git a/drivers/passthrough/vtd/dmar.c b/drivers/passthrough/vtd/dmar.c index 4e20aef..63e4e0b 100644 --- a/drivers/passthrough/vtd/dmar.c +++ b/drivers/passthrough/vtd/dmar.c @@ -29,6 +29,7 @@ #include <xen/pci_regs.h> #include <asm/string.h> #include "dmar.h" +#include "iommu.h" int vtd_enabled = 1; diff --git a/drivers/passthrough/vtd/iommu.c b/drivers/passthrough/vtd/iommu.c index 6a44091..1705e2f 100644 --- 
a/drivers/passthrough/vtd/iommu.c +++ b/drivers/passthrough/vtd/iommu.c @@ -1473,6 +1473,11 @@ int intel_iommu_map_page( pte_present = dma_pte_present(*pte); dma_set_pte_addr(*pte, (paddr_t)mfn << PAGE_SHIFT_4K); dma_set_pte_prot(*pte, DMA_PTE_READ | DMA_PTE_WRITE); + + /* Set the SNP on leaf page table if Snoop Control available */ + if ( iommu_snoop ) + dma_set_pte_snp(*pte); + iommu_flush_cache_entry(pte); spin_unlock(&hd->mapping_lock); unmap_vtd_domain_page(page); @@ -1746,12 +1751,29 @@ int intel_vtd_setup(void) if ( init_vtd_hw() ) goto error; + /* Giving that all devices within guest use same io page table, + * enable snoop control only if all VT-d engines support it. + */ + if ( iommu_snoop ) + { + for_each_drhd_unit ( drhd ) + { + iommu = drhd->iommu; + if ( !ecap_snp_ctl(iommu->ecap) ) { + iommu_snoop = 0; + break; + } + } + } + + printk("Intel VT-d snoop control %sabled\n", iommu_snoop ? "en" : "dis"); return 0; error: for_each_drhd_unit ( drhd ) iommu_free(drhd); vtd_enabled = 0; + iommu_snoop = 0; return -ENOMEM; } diff --git a/drivers/passthrough/vtd/iommu.h b/drivers/passthrough/vtd/iommu.h index 5037e57..d5582ba 100644 --- a/drivers/passthrough/vtd/iommu.h +++ b/drivers/passthrough/vtd/iommu.h @@ -104,6 +104,7 @@ #define ecap_ext_intr(e) ((e >> 4) & 0x1) #define ecap_cache_hints(e) ((e >> 5) & 0x1) #define ecap_pass_thru(e) ((e >> 6) & 0x1) +#define ecap_snp_ctl(e) ((e >> 7) & 0x1) /* IOTLB_REG */ #define DMA_TLB_FLUSH_GRANU_OFFSET 60 @@ -260,10 +261,14 @@ struct dma_pte { }; #define DMA_PTE_READ (1) #define DMA_PTE_WRITE (2) +#define DMA_PTE_SNP (1 << 11) + #define dma_clear_pte(p) do {(p).val = 0;} while(0) #define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while(0) #define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while(0) #define dma_set_pte_superpage(p) do {(p).val |= (1 << 7);} while(0) +#define dma_set_pte_snp(p) do {(p).val |= DMA_PTE_SNP;} while(0) + #define dma_set_pte_prot(p, prot) \ do {(p).val = ((p).val & ~3) | 
((prot) & 3); } while (0) #define dma_pte_addr(p) ((p).val & PAGE_MASK_4K) diff --git a/include/asm-x86/hvm/vmx/vmx.h b/include/asm-x86/hvm/vmx/vmx.h index aa30520..c40f092 100644 --- a/include/asm-x86/hvm/vmx/vmx.h +++ b/include/asm-x86/hvm/vmx/vmx.h @@ -35,7 +35,8 @@ typedef union { u64 r : 1, w : 1, x : 1, - emt : 4, + emt : 3, + igmt : 1, sp_avail : 1, avail1 : 4, mfn : 45, diff --git a/include/xen/iommu.h b/include/xen/iommu.h index 440d9b6..19f21c5 100644 --- a/include/xen/iommu.h +++ b/include/xen/iommu.h @@ -31,6 +31,7 @@ extern int iommu_enabled; extern int iommu_pv_enabled; extern int force_iommu; extern int iommu_passthrough; +extern int iommu_snoop; #define domain_hvm_iommu(d) (&d->arch.hvm_domain.hvm_iommu)