Xen PV DoS vulnerability with SYSENTER

The SYSENTER instruction can be used by PV guests to accelerate system call processing. This instruction, however, leaves the EFLAGS register mostly unmodified - in particular, the NT flag doesn’t get cleared. If the hypervisor subsequently uses IRET to return to the guest (which it will always do if the guest is a 32-bit one), that instruction will cause a #GP fault to be raised, but the recovery code in the hypervisor will again try to use IRET without intermediately clearing the NT flag. The #GP fault raised on this second IRET is a fatal event, causing the hypervisor to crash.

Logic error and improper error handling (the NT flag is not cleared)



x86: clear EFLAGS.NT in SYSENTER entry path

… as it causes problems if we happen to exit back via IRET: In the course of trying to handle the fault, the hypervisor creates a stack frame by hand, and uses PUSHFQ to set the respective EFLAGS field, but expects to be able to IRET through that stack frame to the second portion of the fixup code (which causes a #GP due to the stored EFLAGS having NT set).

And even if this worked (e.g. if we cleared NT in that path), it would then (through the failsafe callback) cause a #GP in the guest with the SYSENTER handler’s first instruction as the source, which in turn would allow guest user mode code to crash the guest kernel.

Inject a #GP on the fake (NULL) address of the SYSENTER instruction instead, just like in the case where the guest kernel didn’t register a corresponding entry point.

On 32-bit we also need to make sure we clear SYSENTER_CS for all CPUs (neither #RESET nor #INIT guarantees this).

--- a/xen/arch/x86/acpi/suspend.c
+++ b/xen/arch/x86/acpi/suspend.c
@@ -81,8 +81,12 @@ void restore_rest_processor_state(void)
 #else /* !defined(CONFIG_X86_64) */
-    if ( supervisor_mode_kernel && cpu_has_sep )
-        wrmsr(MSR_IA32_SYSENTER_ESP, &this_cpu(init_tss).esp1, 0);
+    if ( cpu_has_sep )
+    {
+        wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
+        if ( supervisor_mode_kernel )
+            wrmsr(MSR_IA32_SYSENTER_ESP, &this_cpu(init_tss).esp1, 0);
+    }
     /* Maybe load the debug registers. */
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -655,8 +655,11 @@ void __cpuinit cpu_init(void)
 #if defined(CONFIG_X86_32)
    t->ss0  = __HYPERVISOR_DS;
    t->esp0 = get_stack_bottom();
-   if ( supervisor_mode_kernel && cpu_has_sep )
+   if ( cpu_has_sep ) {
+       wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
+       if ( supervisor_mode_kernel )
        wrmsr(MSR_IA32_SYSENTER_ESP, &t->esp1, 0);
+   }
 #elif defined(CONFIG_X86_64)
    /* Bottom-of-stack must be 16-byte aligned! */
    BUG_ON((get_stack_bottom() & 15) != 0);
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -284,7 +284,14 @@ sysenter_eflags_saved:
         cmpb  $0,VCPU_sysenter_disables_events(%rbx)
         movq  VCPU_sysenter_addr(%rbx),%rax
         setne %cl
+        testl $X86_EFLAGS_NT,UREGS_eflags(%rsp)
         leaq  VCPU_trap_bounce(%rbx),%rdx
+UNLIKELY_START(nz, sysenter_nt_set)
+        pushfq
+        andl  $~X86_EFLAGS_NT,(%rsp)
+        popfq
+        xorl  %eax,%eax
         testq %rax,%rax
         leal  (,%rcx,TBF_INTERRUPT),%ecx
 UNLIKELY_START(z, sysenter_gpf)


Malicious or buggy unprivileged user space can cause the entire host to crash.