From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 30 Aug 2010 16:46:42 -0400
Subject: [xen] emulate task switching
Message-id: <1283186802-21009-7-git-send-email-pbonzini@redhat.com>
Patchwork-id: 27940
O-Subject: [RHEL5.6 XEN PATCH 6/6] emulate task switching
Bugzilla: 625903

Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=625903

Upstream status: http://xenbits.xensource.com/xen-unstable.hg/rev/15895
    http://xenbits.xensource.com/xen-unstable.hg/rev/15958
    http://xenbits.xensource.com/xen-unstable.hg/rev/15966

Brew build: https://brewweb.devel.redhat.com/taskinfo?taskID=2719051

This patch emulates hardware task switching.  It is long but boring. :)

The check at the beginning of vmx_vmexit_handler is needed because, after
exiting the VM with EXIT_REASON_TASK_SWITCH, the processor leaves the
IDT vectoring info field marked "valid" so that the VMM can look up an
error code and push it on the stack.  However, the exception itself should
not be injected, so the field must not be copied into the VM entry
interruption fields.  Without this check, the processor would try to
reinject the exception that led to the task switch and would trigger a
#GP after finding a busy TSS.
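
As a standalone illustration (not part of the patch), the snippet below
mirrors how the new EXIT_REASON_TASK_SWITCH handler decodes its inputs:
bits 31:30 of the exit qualification select the task-switch source, and an
error code is only picked up when the IDT vectoring info is valid and flags
a delivered error code.  The mask values follow the Intel SDM layout of the
IDT-vectoring information field; the sample input values are made up for
illustration.

/* Standalone sketch: decode the VMX task-switch exit the same way the
 * vmx_vmexit_handler hunk below does.  Sample inputs are hypothetical. */
#include <stdint.h>
#include <stdio.h>

#define INTR_INFO_VALID_MASK        0x80000000u  /* bit 31: info valid */
#define INTR_INFO_DELIVER_CODE_MASK 0x00000800u  /* bit 11: error code delivered */

enum hvm_task_switch_reason { TSW_jmp, TSW_iret, TSW_call_or_int };

int main(void)
{
    /* Bits 31:30 of the exit qualification give the task-switch source:
     * 0 = CALL, 1 = IRET, 2 = JMP, 3 = gate in the IDT. */
    static const enum hvm_task_switch_reason reasons[] = {
        TSW_call_or_int, TSW_iret, TSW_jmp, TSW_call_or_int
    };

    uint64_t exit_qualification = (1ull << 30) | 0x0028; /* IRET, TSS selector 0x28 */
    uint32_t idtv_info = INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK | 0x0d;
    uint32_t idtv_error_code = 0x18;  /* would come from IDT_VECTORING_ERROR_CODE */
    int32_t errcode = -1;             /* -1 means "no error code to push" */

    if ( (idtv_info & INTR_INFO_VALID_MASK) &&
         (idtv_info & INTR_INFO_DELIVER_CODE_MASK) )
        errcode = idtv_error_code;

    printf("tss_sel=%#x reason=%d errcode=%d\n",
           (unsigned int)(uint16_t)exit_qualification,
           (int)reasons[(exit_qualification >> 30) & 3],
           errcode);
    return 0;
}

For the IRET sample above this prints reason 1 (TSW_iret) together with the
error code that hvm_task_switch() will push for the incoming task.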
---
 arch/x86/hvm/hvm.c        |  323 +++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/hvm/svm/svm.c    |   24 +++-
 arch/x86/hvm/vmx/vmx.c    |   30 ++++--
 include/asm-x86/hvm/hvm.h |    5 +
 4 files changed, 372 insertions(+), 10 deletions(-)

Signed-off-by: Jarod Wilson <jarod@redhat.com>

diff --git a/arch/x86/hvm/hvm.c b/arch/x86/hvm/hvm.c
index 6f5a3d6..a3ccaba 100644
--- a/arch/x86/hvm/hvm.c
+++ b/arch/x86/hvm/hvm.c
@@ -659,6 +659,329 @@ int hvm_virtual_to_linear_addr(
     return 0;
 }
 
+static void *hvm_map(unsigned long va, int size)
+{
+    unsigned long gfn, mfn;
+
+    if ( ((va & ~PAGE_MASK) + size) > PAGE_SIZE )
+    {
+        hvm_inject_exception(TRAP_page_fault, PFEC_write_access,
+                             (va + PAGE_SIZE - 1) & PAGE_MASK);
+        return NULL;
+    }
+
+    gfn = paging_gva_to_gfn(current, va);
+    mfn = mfn_x(gfn_to_mfn_current(gfn));
+
+    ASSERT(mfn_valid(mfn));
+
+    paging_mark_dirty(current->domain, mfn);
+
+    return (char *)map_domain_page(mfn) + (va & ~PAGE_MASK);
+}
+
+static void hvm_unmap(void *p)
+{
+    if ( p )
+        unmap_domain_page(p);
+}
+
+static int hvm_load_segment_selector(
+    struct vcpu *v, enum x86_segment seg, uint16_t sel)
+{
+    struct segment_register desctab, cs, segr;
+    struct desc_struct *pdesc, desc;
+    u8 dpl, rpl, cpl;
+    int fault_type = TRAP_invalid_tss;
+
+    /* NULL selector? */
+    if ( (sel & 0xfffc) == 0 )
+    {
+        if ( (seg == x86_seg_cs) || (seg == x86_seg_ss) )
+            goto fail;
+        memset(&segr, 0, sizeof(segr));
+        hvm_set_segment_register(v, seg, &segr);
+        return 0;
+    }
+
+    /* LDT descriptor must be in the GDT. */
+    if ( (seg == x86_seg_ldtr) && (sel & 4) )
+        goto fail;
+
+    hvm_get_segment_register(v, x86_seg_cs, &cs);
+    hvm_get_segment_register(
+        v, (sel & 4) ? x86_seg_ldtr : x86_seg_gdtr, &desctab);
+
+    /* Check against descriptor table limit. */
+    if ( ((sel & 0xfff8) + 7) > desctab.limit )
+        goto fail;
+
+    pdesc = hvm_map(desctab.base + (sel & 0xfff8), 8);
+    if ( pdesc == NULL )
+        goto hvm_map_fail;
+
+    do {
+        desc = *pdesc;
+
+        /* Segment present in memory? */
+        if ( !(desc.b & (1u<<15)) )
+        {
+            fault_type = TRAP_no_segment;
+            goto unmap_and_fail;
+        }
+
+        /* LDT descriptor is a system segment. All others are code/data. */
+        if ( (desc.b & (1u<<12)) == ((seg == x86_seg_ldtr) << 12) )
+            goto unmap_and_fail;
+
+        dpl = (desc.b >> 13) & 3;
+        rpl = sel & 3;
+        cpl = cs.sel & 3;
+
+        switch ( seg )
+        {
+        case x86_seg_cs:
+            /* Code segment? */
+            if ( !(desc.b & (1u<<11)) )
+                goto unmap_and_fail;
+            /* Non-conforming segment: check DPL against RPL. */
+            if ( ((desc.b & (6u<<9)) != 6) && (dpl != rpl) )
+                goto unmap_and_fail;
+            break;
+        case x86_seg_ss:
+            /* Writable data segment? */
+            if ( (desc.b & (5u<<9)) != (1u<<9) )
+                goto unmap_and_fail;
+            if ( (dpl != cpl) || (dpl != rpl) )
+                goto unmap_and_fail;
+            break;
+        case x86_seg_ldtr:
+            /* LDT system segment? */
+            if ( (desc.b & (15u<<8)) != (2u<<8) )
+                goto unmap_and_fail;
+            goto skip_accessed_flag;
+        default:
+            /* Readable code or data segment? */
+            if ( (desc.b & (5u<<9)) == (4u<<9) )
+                goto unmap_and_fail;
+            /* Non-conforming segment: check DPL against RPL and CPL. */
+            if ( ((desc.b & (6u<<9)) != 6) && ((dpl < cpl) || (dpl < rpl)) )
+                goto unmap_and_fail;
+            break;
+        }
+    } while ( !(desc.b & 0x100) && /* Ensure Accessed flag is set */
+              (cmpxchg(&pdesc->b, desc.b, desc.b | 0x100) != desc.b) );
+
+    /* Force the Accessed flag in our local copy. */
+    desc.b |= 0x100;
+
+ skip_accessed_flag:
+    hvm_unmap(pdesc);
+
+    segr.base = (((desc.b <<  0) & 0xff000000u) |
+                 ((desc.b << 16) & 0x00ff0000u) |
+                 ((desc.a >> 16) & 0x0000ffffu));
+    segr.attr.bytes = (((desc.b >>  8) & 0x00ffu) |
+                       ((desc.b >> 12) & 0x0f00u));
+    segr.limit = (desc.b & 0x000f0000u) | (desc.a & 0x0000ffffu);
+    if ( segr.attr.fields.g )
+        segr.limit = (segr.limit << 12) | 0xfffu;
+    segr.sel = sel;
+    hvm_set_segment_register(v, seg, &segr);
+
+    return 0;
+
+ unmap_and_fail:
+    hvm_unmap(pdesc);
+ fail:
+    hvm_inject_exception(fault_type, sel & 0xfffc, 0);
+ hvm_map_fail:
+    return 1;
+}
+
+void hvm_task_switch(
+    uint16_t tss_sel, enum hvm_task_switch_reason taskswitch_reason,
+    int32_t errcode)
+{
+    struct vcpu *v = current;
+    struct cpu_user_regs *regs = guest_cpu_user_regs();
+    struct segment_register gdt, tr, prev_tr, segr;
+    struct desc_struct *optss_desc = NULL, *nptss_desc = NULL, tss_desc;
+    unsigned long eflags;
+    int exn_raised;
+    struct {
+        u16 back_link,__blh;
+        u32 esp0;
+        u16 ss0, _0;
+        u32 esp1;
+        u16 ss1, _1;
+        u32 esp2;
+        u16 ss2, _2;
+        u32 cr3, eip, eflags, eax, ecx, edx, ebx, esp, ebp, esi, edi;
+        u16 es, _3, cs, _4, ss, _5, ds, _6, fs, _7, gs, _8, ldt, _9;
+        u16 trace, iomap;
+    } *ptss, tss;
+
+    hvm_get_segment_register(v, x86_seg_gdtr, &gdt);
+    hvm_get_segment_register(v, x86_seg_tr, &prev_tr);
+
+    if ( ((tss_sel & 0xfff8) + 7) > gdt.limit )
+    {
+        hvm_inject_exception((taskswitch_reason == TSW_iret) ?
+                             TRAP_invalid_tss : TRAP_gp_fault,
+                             tss_sel & 0xfff8, 0);
+        goto out;
+    }
+
+    optss_desc = hvm_map(gdt.base + (prev_tr.sel & 0xfff8), 8);
+    if ( optss_desc == NULL )
+        goto out;
+
+    nptss_desc = hvm_map(gdt.base + (tss_sel & 0xfff8), 8);
+    if ( nptss_desc == NULL )
+        goto out;
+
+    tss_desc = *nptss_desc;
+    tr.sel = tss_sel;
+    tr.base = (((tss_desc.b <<  0) & 0xff000000u) |
+               ((tss_desc.b << 16) & 0x00ff0000u) |
+               ((tss_desc.a >> 16) & 0x0000ffffu));
+    tr.attr.bytes = (((tss_desc.b >>  8) & 0x00ffu) |
+                     ((tss_desc.b >> 12) & 0x0f00u));
+    tr.limit = (tss_desc.b & 0x000f0000u) | (tss_desc.a & 0x0000ffffu);
+    if ( tr.attr.fields.g )
+        tr.limit = (tr.limit << 12) | 0xfffu;
+
+    if ( !tr.attr.fields.p )
+    {
+        hvm_inject_exception(TRAP_no_segment, tss_sel & 0xfff8, 0);
+        goto out;
+    }
+
+    if ( tr.attr.fields.type != ((taskswitch_reason == TSW_iret) ? 0xb : 0x9) )
+    {
+        hvm_inject_exception(
+            (taskswitch_reason == TSW_iret) ? TRAP_invalid_tss : TRAP_gp_fault,
+            tss_sel & 0xfff8, 0);
+        goto out;
+    }
+
+    if ( !tr.attr.fields.g && (tr.limit < (sizeof(tss)-1)) )
+    {
+        hvm_inject_exception(TRAP_invalid_tss, tss_sel & 0xfff8, 0);
+        goto out;
+    }
+
+    hvm_store_cpu_guest_regs(v, regs, NULL);
+
+    ptss = hvm_map(prev_tr.base, sizeof(tss));
+    if ( ptss == NULL )
+        goto out;
+
+    eflags = regs->eflags;
+    if ( taskswitch_reason == TSW_iret )
+        eflags &= ~X86_EFLAGS_NT;
+
+    ptss->cr3    = hvm_get_guest_ctrl_reg(v, 3);
+    ptss->eip    = regs->eip;
+    ptss->eflags = eflags;
+    ptss->eax    = regs->eax;
+    ptss->ecx    = regs->ecx;
+    ptss->edx    = regs->edx;
+    ptss->ebx    = regs->ebx;
+    ptss->esp    = regs->esp;
+    ptss->ebp    = regs->ebp;
+    ptss->esi    = regs->esi;
+    ptss->edi    = regs->edi;
+
+    hvm_get_segment_register(v, x86_seg_es, &segr);
+    ptss->es = segr.sel;
+    hvm_get_segment_register(v, x86_seg_cs, &segr);
+    ptss->cs = segr.sel;
+    hvm_get_segment_register(v, x86_seg_ss, &segr);
+    ptss->ss = segr.sel;
+    hvm_get_segment_register(v, x86_seg_ds, &segr);
+    ptss->ds = segr.sel;
+    hvm_get_segment_register(v, x86_seg_fs, &segr);
+    ptss->fs = segr.sel;
+    hvm_get_segment_register(v, x86_seg_gs, &segr);
+    ptss->gs = segr.sel;
+    hvm_get_segment_register(v, x86_seg_ldtr, &segr);
+    ptss->ldt = segr.sel;
+
+    hvm_unmap(ptss);
+
+    ptss = hvm_map(tr.base, sizeof(tss));
+    if ( ptss == NULL )
+        goto out;
+
+    if ( hvm_set_cr3(ptss->cr3) )
+        goto out;
+
+    regs->eip    = ptss->eip;
+    regs->eflags = ptss->eflags | 2;
+    regs->eax    = ptss->eax;
+    regs->ecx    = ptss->ecx;
+    regs->edx    = ptss->edx;
+    regs->ebx    = ptss->ebx;
+    regs->esp    = ptss->esp;
+    regs->ebp    = ptss->ebp;
+    regs->esi    = ptss->esi;
+    regs->edi    = ptss->edi;
+
+    if ( (taskswitch_reason == TSW_call_or_int) )
+    {
+        regs->eflags |= X86_EFLAGS_NT;
+        ptss->back_link = prev_tr.sel;
+    }
+
+    exn_raised = 0;
+    if ( hvm_load_segment_selector(v, x86_seg_es, ptss->es) ||
+         hvm_load_segment_selector(v, x86_seg_cs, ptss->cs) ||
+         hvm_load_segment_selector(v, x86_seg_ss, ptss->ss) ||
+         hvm_load_segment_selector(v, x86_seg_ds, ptss->ds) ||
+         hvm_load_segment_selector(v, x86_seg_fs, ptss->fs) ||
+         hvm_load_segment_selector(v, x86_seg_gs, ptss->gs) ||
+         hvm_load_segment_selector(v, x86_seg_ldtr, ptss->ldt) )
+        exn_raised = 1;
+
+    if ( (ptss->trace & 1) && !exn_raised )
+        hvm_inject_exception(TRAP_debug, tss_sel & 0xfff8, 0);
+
+    hvm_unmap(ptss);
+
+    tr.attr.fields.type = 0xb; /* busy 32-bit tss */
+    hvm_set_segment_register(v, x86_seg_tr, &tr);
+
+    hvm_stts(v);
+
+    if ( (taskswitch_reason == TSW_iret) ||
+         (taskswitch_reason == TSW_jmp) )
+        clear_bit(41, optss_desc); /* clear B flag of old task */
+
+    if ( taskswitch_reason != TSW_iret )
+        set_bit(41, nptss_desc); /* set B flag of new task */
+
+    if ( errcode >= 0 )
+    {
+        struct segment_register reg;
+        unsigned long linear_addr;
+        regs->esp -= 4;
+        hvm_get_segment_register(current, x86_seg_ss, &reg);
+        /* Todo: do not ignore access faults here. */
+        if ( hvm_virtual_to_linear_addr(x86_seg_ss, &reg, regs->esp,
+                                        4, hvm_access_write, 32,
+                                        &linear_addr) )
+            hvm_copy_to_guest_virt(linear_addr, &errcode, 4);
+    }
+
+    hvm_load_cpu_guest_regs(v, regs);
+
+ out:
+    hvm_unmap(optss_desc);
+    hvm_unmap(nptss_desc);
+}
+
 /*
  * __hvm_copy():
  *  @buf  = hypervisor buffer
diff --git a/arch/x86/hvm/svm/svm.c b/arch/x86/hvm/svm/svm.c
index 2573983..d73a785 100644
--- a/arch/x86/hvm/svm/svm.c
+++ b/arch/x86/hvm/svm/svm.c
@@ -696,6 +696,8 @@ static void svm_get_segment_register(struct vcpu *v, enum x86_segment seg,
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
 
+    ASSERT(v == current);
+
     switch ( seg )
     {
     case x86_seg_cs:
@@ -742,10 +744,13 @@ static void svm_set_segment_register(struct vcpu *v, enum x86_segment seg,
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
 
+    ASSERT(v == current);
+
     switch ( seg )
     {
     case x86_seg_cs:
         memcpy(&vmcb->cs, reg, sizeof(*reg));
+        guest_cpu_user_regs()->cs = reg->sel;
         break;
     case x86_seg_ds:
         memcpy(&vmcb->ds, reg, sizeof(*reg));
@@ -765,6 +770,7 @@ static void svm_set_segment_register(struct vcpu *v, enum x86_segment seg,
         break;
     case x86_seg_ss:
         memcpy(&vmcb->ss, reg, sizeof(*reg));
+        guest_cpu_user_regs()->ss = reg->sel;
         break;
     case x86_seg_tr:
         svm_sync_vmcb(v);
@@ -2538,12 +2544,20 @@ asmlinkage void svm_vmexit_handler(struct cpu_user_regs *regs)
         svm_vmexit_do_invd(v);
         break;
 
-    case VMEXIT_GDTR_WRITE:
-        printk("WRITE to GDTR\n");
+    case VMEXIT_TASK_SWITCH: {
+        enum hvm_task_switch_reason reason;
+        int32_t errcode = -1;
+        if ( (vmcb->exitinfo2 >> 36) & 1 )
+            reason = TSW_iret;
+        else if ( (vmcb->exitinfo2 >> 38) & 1 )
+            reason = TSW_jmp;
+        else
+            reason = TSW_call_or_int;
+        if ( (vmcb->exitinfo2 >> 44) & 1 )
+            errcode = (uint32_t)vmcb->exitinfo2;
+        hvm_task_switch((uint16_t)vmcb->exitinfo1, reason, errcode);
         break;
-
-    case VMEXIT_TASK_SWITCH:
-        goto exit_and_crash;
+    }
 
     case VMEXIT_CPUID:
         svm_vmexit_do_cpuid(vmcb, regs);
diff --git a/arch/x86/hvm/vmx/vmx.c b/arch/x86/hvm/vmx/vmx.c
index c7110f4..a0f4db8 100644
--- a/arch/x86/hvm/vmx/vmx.c
+++ b/arch/x86/hvm/vmx/vmx.c
@@ -1087,7 +1087,7 @@ static unsigned long vmx_get_segment_base(struct vcpu *v, enum x86_segment seg)
 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
                                      struct segment_register *reg)
 {
-    u16 attr = 0;
+    uint32_t attr = 0;
 
     ASSERT(v == current);
 
@@ -1159,13 +1159,17 @@ static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
 static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
                                      struct segment_register *reg)
 {
-    u16 attr;
+    uint32_t attr;
 
     ASSERT(v == current);
 
     attr = reg->attr.bytes;
     attr = ((attr & 0xf00) << 4) | (attr & 0xff);
 
+    /* Not-present must mean unusable. */
+    if ( !reg->attr.fields.p )
+        attr |= (1u << 16);
+
     switch ( seg )
     {
     case x86_seg_cs:
@@ -1173,6 +1177,7 @@ static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
         __vmwrite(GUEST_CS_LIMIT, reg->limit);
         __vmwrite(GUEST_CS_BASE, reg->base);
         __vmwrite(GUEST_CS_AR_BYTES, attr);
+        guest_cpu_user_regs()->cs = reg->sel;
         break;
     case x86_seg_ds:
         __vmwrite(GUEST_DS_SELECTOR, reg->sel);
@@ -1203,6 +1208,7 @@ static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
         __vmwrite(GUEST_SS_LIMIT, reg->limit);
         __vmwrite(GUEST_SS_BASE, reg->base);
         __vmwrite(GUEST_SS_AR_BYTES, attr);
+        guest_cpu_user_regs()->ss = reg->sel;
         break;
     case x86_seg_tr:
         __vmwrite(GUEST_TR_SELECTOR, reg->sel);
@@ -3242,7 +3248,8 @@ asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
 
     /* Event delivery caused this intercept? Queue for redelivery. */
     idtv_info = __vmread(IDT_VECTORING_INFO_FIELD);
-    if ( unlikely(idtv_info & INTR_INFO_VALID_MASK) )
+    if ( unlikely(idtv_info & INTR_INFO_VALID_MASK) &&
+         (exit_reason != EXIT_REASON_TASK_SWITCH) )
     {
         if ( vmx_event_needs_reinjection((idtv_info>>8)&7, idtv_info&0xff) )
         {
@@ -3360,8 +3367,21 @@ asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
         __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
                   v->arch.hvm_vmx.exec_control);
         break;
-    case EXIT_REASON_TASK_SWITCH:
-        goto exit_and_crash;
+    case EXIT_REASON_TASK_SWITCH: {
+        const enum hvm_task_switch_reason reasons[] = {
+            TSW_call_or_int, TSW_iret, TSW_jmp, TSW_call_or_int };
+        int32_t errcode = -1;
+        unsigned int idtv_info;
+        exit_qualification = __vmread(EXIT_QUALIFICATION);
+        idtv_info = __vmread(IDT_VECTORING_INFO_FIELD);
+        if ( (idtv_info & INTR_INFO_VALID_MASK) &&
+             (idtv_info & INTR_INFO_DELIVER_CODE_MASK) )
+            errcode = __vmread(IDT_VECTORING_ERROR_CODE);
+        hvm_task_switch((uint16_t)exit_qualification,
+                        reasons[(exit_qualification >> 30) & 3],
+                        errcode);
+        break;
+    }
     case EXIT_REASON_CPUID:
         inst_len = __get_instruction_length(); /* Safe: CPUID */
         __update_guest_eip(inst_len);
diff --git a/include/asm-x86/hvm/hvm.h b/include/asm-x86/hvm/hvm.h
index f845e1a..629c6e1 100644
--- a/include/asm-x86/hvm/hvm.h
+++ b/include/asm-x86/hvm/hvm.h
@@ -364,6 +364,11 @@ static inline int hvm_event_pending(struct vcpu *v)
 
 #define HVM_IDENT_PT_PAGE 0xE8000
 
+enum hvm_task_switch_reason { TSW_jmp, TSW_iret, TSW_call_or_int };
+void hvm_task_switch(
+    uint16_t tss_sel, enum hvm_task_switch_reason taskswitch_reason,
+    int32_t errcode);
+
 enum hvm_access_type {
     hvm_access_insn_fetch, hvm_access_read, hvm_access_write
 };