[PATCH 00/10] target/i386/tcg: fixes for seg_helper.c

Posted by Paolo Bonzini 5 months, 3 weeks ago
This includes bugfixes:
- allowing IRET from user mode to user mode with SMAP (do not use implicit
  kernel accesses, which break if the stack is in userspace)

- use DPL-level accesses for interrupts and call gates

- various fixes for task switching

And two related cleanups: computing MMU index once for far calls and returns
(including task switches), and using X86Access for TSS access.
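
The gist of the MMU-index cleanup, as a minimal standalone sketch (the names
below are illustrative stand-ins, not the actual QEMU code): pick the index
once, from the privilege level that should be charged for the access (the
gate DPL for interrupts and call gates), and reuse it for every push and pop
of the transfer.

    #include <stdio.h>

    enum { MMU_USER_IDX, MMU_KSMAP_IDX, MMU_KNOSMAP_IDX }; /* stand-ins */

    static int mmu_index_from_pl(unsigned pl, int smap_enabled)
    {
        if (pl == 3)
            return MMU_USER_IDX;
        return smap_enabled ? MMU_KSMAP_IDX : MMU_KNOSMAP_IDX;
    }

    int main(void)
    {
        int mmu_index = mmu_index_from_pl(0 /* gate DPL */, 1 /* SMAP on */);
        /* every push of the interrupt frame then uses mmu_index,
         * instead of an implicit "kernel" access */
        printf("mmu_index = %d\n", mmu_index);
        return 0;
    }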

Tested with a really ugly patch to kvm-unit-tests, included after signature.

Paolo Bonzini (7):
  target/i386/tcg: Allow IRET from user mode to user mode with SMAP
  target/i386/tcg: use PUSHL/PUSHW for error code
  target/i386/tcg: Compute MMU index once
  target/i386/tcg: Use DPL-level accesses for interrupts and call gates
  target/i386/tcg: check for correct busy state before switching to a
    new task
  target/i386/tcg: use X86Access for TSS access
  target/i386/tcg: save current task state before loading new one

Richard Henderson (3):
  target/i386/tcg: Remove SEG_ADDL
  target/i386/tcg: Reorg push/pop within seg_helper.c
  target/i386/tcg: Introduce x86_mmu_index_{kernel_,}pl

 target/i386/cpu.h            |  11 +-
 target/i386/cpu.c            |  27 +-
 target/i386/tcg/seg_helper.c | 606 +++++++++++++++++++----------------
 3 files changed, 354 insertions(+), 290 deletions(-)

-- 
2.45.2

diff --git a/lib/x86/usermode.c b/lib/x86/usermode.c
index c3ec0ad7..0bf40c6d 100644
--- a/lib/x86/usermode.c
+++ b/lib/x86/usermode.c
@@ -5,13 +5,15 @@
 #include "x86/desc.h"
 #include "x86/isr.h"
 #include "alloc.h"
+#include "alloc_page.h"
 #include "setjmp.h"
 #include "usermode.h"
 
 #include "libcflat.h"
 #include <stdint.h>
 
-#define USERMODE_STACK_SIZE	0x2000
+#define USERMODE_STACK_ORDER	1 /* 8k */
+#define USERMODE_STACK_SIZE	(1 << (12 + USERMODE_STACK_ORDER))
 #define RET_TO_KERNEL_IRQ	0x20
 
 static jmp_buf jmpbuf;
@@ -37,9 +39,14 @@ uint64_t run_in_user(usermode_func func, unsigned int fault_vector,
 {
 	extern char ret_to_kernel;
 	volatile uint64_t rax = 0;
-	static unsigned char user_stack[USERMODE_STACK_SIZE];
+	static unsigned char *user_stack;
 	handler old_ex;
 
+	if (!user_stack) {
+		user_stack = alloc_pages(USERMODE_STACK_ORDER);
+		printf("%p\n", user_stack);
+	}
+
 	*raised_vector = 0;
 	set_idt_entry(RET_TO_KERNEL_IRQ, &ret_to_kernel, 3);
 	old_ex = handle_exception(fault_vector,
@@ -51,6 +58,8 @@ uint64_t run_in_user(usermode_func func, unsigned int fault_vector,
 		return 0;
 	}
 
+	memcpy(user_stack + USERMODE_STACK_SIZE - 8, &func, 8);
+
 	asm volatile (
 			/* Prepare kernel SP for exception handlers */
 			"mov %%rsp, %[rsp0]\n\t"
@@ -63,12 +72,13 @@ uint64_t run_in_user(usermode_func func, unsigned int fault_vector,
 			"pushq %[user_stack_top]\n\t"
 			"pushfq\n\t"
 			"pushq %[user_cs]\n\t"
-			"lea user_mode(%%rip), %%rax\n\t"
+			"lea user_mode+0x800000(%%rip), %%rax\n\t" // smap.flat places usermode addresses at 8MB-16MB
 			"pushq %%rax\n\t"
 			"iretq\n"
 
 			"user_mode:\n\t"
 			/* Back up volatile registers before invoking func */
+			"pop %%rax\n\t"
 			"push %%rcx\n\t"
 			"push %%rdx\n\t"
 			"push %%rdi\n\t"
@@ -78,11 +88,12 @@ uint64_t run_in_user(usermode_func func, unsigned int fault_vector,
 			"push %%r10\n\t"
 			"push %%r11\n\t"
 			/* Call user mode function */
+			"add $0x800000,%%rbp\n\t"
 			"mov %[arg1], %%rdi\n\t"
 			"mov %[arg2], %%rsi\n\t"
 			"mov %[arg3], %%rdx\n\t"
 			"mov %[arg4], %%rcx\n\t"
-			"call *%[func]\n\t"
+			"call *%%rax\n\t"
 			/* Restore registers */
 			"pop %%r11\n\t"
 			"pop %%r10\n\t"
@@ -112,12 +123,11 @@ uint64_t run_in_user(usermode_func func, unsigned int fault_vector,
 			[arg2]"m"(arg2),
 			[arg3]"m"(arg3),
 			[arg4]"m"(arg4),
-			[func]"m"(func),
 			[user_ds]"i"(USER_DS),
 			[user_cs]"i"(USER_CS),
 			[kernel_ds]"rm"(KERNEL_DS),
 			[user_stack_top]"r"(user_stack +
-					sizeof(user_stack)),
+					USERMODE_STACK_SIZE - 8),
 			[kernel_entry_vector]"i"(RET_TO_KERNEL_IRQ));
 
 	handle_exception(fault_vector, old_ex);
diff --git a/x86/smap.c b/x86/smap.c
index 9a823a55..65119442 100644
--- a/x86/smap.c
+++ b/x86/smap.c
@@ -2,6 +2,7 @@
 #include <alloc_page.h>
 #include "x86/desc.h"
 #include "x86/processor.h"
+#include "x86/usermode.h"
 #include "x86/vm.h"
 
 volatile int pf_count = 0;
@@ -89,6 +90,31 @@ static void check_smap_nowp(void)
 	write_cr3(read_cr3());
 }
 
+#ifdef __x86_64__
+static void iret(void)
+{
+	asm volatile(
+	    "mov %%rsp, %%rcx;"
+	    "movl %%ss, %%ebx; pushq %%rbx; pushq %%rcx;"
+	    "pushf;"
+	    "movl %%cs, %%ebx; pushq %%rbx; "
+	    "lea 1f(%%rip), %%rbx; pushq %%rbx; iretq; 1:"
+
+		: : : "ebx", "ecx", "cc"); /* RPL=0 */
+}
+
+static void test_user_iret(void)
+{
+	bool raised_vector;
+	uintptr_t user_iret = (uintptr_t)iret + USER_BASE;
+
+	run_in_user((usermode_func)user_iret, PF_VECTOR, 0, 0, 0, 0,
+		    &raised_vector);
+
+	report(!raised_vector, "No #PF on CPL=3 DPL=3 iret");
+}
+#endif
+
 int main(int ac, char **av)
 {
 	unsigned long i;
@@ -196,7 +222,9 @@ int main(int ac, char **av)
 
 	check_smap_nowp();
 
-	// TODO: implicit kernel access from ring 3 (e.g. int)
+#ifdef __x86_64__
+	test_user_iret();
+#endif
 
 	return report_summary();
 }
Re: [PATCH 00/10] target/i386/tcg: fixes for seg_helper.c
Posted by Robert Henry 5 months, 2 weeks ago
I have only skimmed the diffs.  Your knowledge of the deep semantics,
gained by close differential reading of the Intel and AMD docs, is truly
amazing.  Many thanks for pushing this through!

I have 2 nits, perhaps stylistic only.

For code like "sp -= 2" or "sp += 2" followed or preceded by a write to the
stack pointer of a uint16_t variable 'x',  would it be better/more robust
to rewrite as: "sp -= sizeof(x)"  ?

There are a lot of masks constructed using -1.  I think it would be clearer
to use 0xffffffff (for 32-bit masks) as that reminds the reader that this
is a bit mask.  But it seems that using -1 is how the original code was
written.

Re: [PATCH 00/10] target/i386/tcg: fixes for seg_helper.c
Posted by Paolo Bonzini 5 months, 2 weeks ago
On Wed, Jul 10, 2024 at 23:01 Robert Henry <rrh.henry@gmail.com> wrote:

> I have only skimmed the diffs.  Your knowledge of the deep semantics,
> gained by close differential reading of intel and amd docs, is truly
> amazing.  Many thanks for pushing this through!
>

Thanks for bringing this to our attention too; apart from fixing the practical
bug, hopefully the more precise implementation will help future readers.

I tried to acknowledge your contribution in the commit messages.

> I have 2 nits, perhaps stylistic only.
>
> For code like "sp -= 2" or "sp += 2" followed or preceded by a write to
> the stack pointer of a uint16_t variable 'x',  would it be better/more
> robust to rewrite as: "sp -= sizeof(x)"  ?
>

I think that's intentional because the value subtracted is related to the
"stw" or "stl" in the store (likewise for incrementing after a load) more
than to the size of x.
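
With patch 4 the pairing becomes explicit in the new helpers anyway, e.g. the
16-bit push (quoted from the patch, comment added):

    static void pushw(StackAccess *sa, uint16_t val)
    {
        sa->sp -= 2;    /* matches the stw width of the store below */
        cpu_stw_kernel_ra(sa->env, sa->ss_base + (sa->sp & sa->sp_mask),
                          val, sa->ra);
    }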

> There are a lot of masks constructed using -1.  I think it would be clearer
> to use 0xffffffff (for 32-bit masks) as that reminds the reader that this
> is a bit mask.  But it seems that using -1 is how the original code was
> written.
>

-1 is used for 64-bit masks only. They get unwieldy quickly. :)
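
For example, in the new code the stack mask ends up as one of:

    sa.sp_mask = 0xffff;      /* 16-bit stack segment */
    sa.sp_mask = 0xffffffff;  /* 32-bit stack segment */
    sa.sp_mask = -1;          /* 64-bit: no masking at all */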

Paolo


[PATCH 01/10] target/i386/tcg: Remove SEG_ADDL
Posted by Paolo Bonzini 5 months, 3 weeks ago
From: Richard Henderson <richard.henderson@linaro.org>

This truncation is now handled by MMU_*32_IDX.  The introduction of
MMU_*32_IDX in fact applied correct 32-bit wraparound to 16-bit accesses
with a high segment base (e.g.  big real mode or vm86 mode), which did
not use SEG_ADDL.
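
As a standalone illustration (not QEMU code) of the wraparound in question:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* A segment base near 4G combined with a masked 16-bit SP: the
         * linear address must wrap at 32 bits.  SEG_ADDL used to do this
         * with an explicit cast; MMU_*32_IDX now truncates in the MMU. */
        uint64_t ssp = 0xffffff00, sp = 0x0200, sp_mask = 0xffff;
        printf("untruncated: 0x%llx\n", (unsigned long long)(ssp + (sp & sp_mask)));
        printf("wrapped:     0x%x\n", (uint32_t)(ssp + (sp & sp_mask)));
        return 0;
    }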

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Link: https://lore.kernel.org/r/20240617161210.4639-3-richard.henderson@linaro.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/seg_helper.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c
index aee3d19f29b..19d6b41a589 100644
--- a/target/i386/tcg/seg_helper.c
+++ b/target/i386/tcg/seg_helper.c
@@ -579,10 +579,6 @@ int exception_has_error_code(int intno)
     } while (0)
 #endif
 
-/* in 64-bit machines, this can overflow. So this segment addition macro
- * can be used to trim the value to 32-bit whenever needed */
-#define SEG_ADDL(ssp, sp, sp_mask) ((uint32_t)((ssp) + (sp & (sp_mask))))
-
 /* XXX: add a is_user flag to have proper security support */
 #define PUSHW_RA(ssp, sp, sp_mask, val, ra)                      \
     {                                                            \
@@ -593,7 +589,7 @@ int exception_has_error_code(int intno)
 #define PUSHL_RA(ssp, sp, sp_mask, val, ra)                             \
     {                                                                   \
         sp -= 4;                                                        \
-        cpu_stl_kernel_ra(env, SEG_ADDL(ssp, sp, sp_mask), (uint32_t)(val), ra); \
+        cpu_stl_kernel_ra(env, (ssp) + (sp & (sp_mask)), (val), ra); \
     }
 
 #define POPW_RA(ssp, sp, sp_mask, val, ra)                       \
@@ -604,7 +600,7 @@ int exception_has_error_code(int intno)
 
 #define POPL_RA(ssp, sp, sp_mask, val, ra)                              \
     {                                                                   \
-        val = (uint32_t)cpu_ldl_kernel_ra(env, SEG_ADDL(ssp, sp, sp_mask), ra); \
+        val = (uint32_t)cpu_ldl_kernel_ra(env, (ssp) + (sp & (sp_mask)), ra); \
         sp += 4;                                                        \
     }
 
-- 
2.45.2
[PATCH 02/10] target/i386/tcg: Allow IRET from user mode to user mode with SMAP
Posted by Paolo Bonzini 5 months, 3 weeks ago
This fixes a bug wherein i386/tcg assumed an interrupt return using
the IRET instruction was always returning from kernel mode to either
kernel mode or user mode. This assumption is violated when IRET is used
as a clever way to restore thread state, as for example in the dotnet
runtime. There, IRET returns from user mode to user mode.

The fix is to make stack accesses from IRET and RETF, as well as accesses
to the parameters in a call gate, normal data accesses using the current
CPL.  The bug manifested itself as a page fault in the guest Linux kernel,
because SMAP blocked the implicit kernel-mode access to the user stack.

This bug appears to have been in QEMU since the beginning.

Analyzed-by: Robert R. Henry <rrh.henry@gmail.com>
Co-developed-by: Robert R. Henry <rrh.henry@gmail.com>
Signed-off-by: Robert R. Henry <rrh.henry@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/seg_helper.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c
index 19d6b41a589..4977a5f5b3a 100644
--- a/target/i386/tcg/seg_helper.c
+++ b/target/i386/tcg/seg_helper.c
@@ -594,13 +594,13 @@ int exception_has_error_code(int intno)
 
 #define POPW_RA(ssp, sp, sp_mask, val, ra)                       \
     {                                                            \
-        val = cpu_lduw_kernel_ra(env, (ssp) + (sp & (sp_mask)), ra); \
+        val = cpu_lduw_data_ra(env, (ssp) + (sp & (sp_mask)), ra); \
         sp += 2;                                                 \
     }
 
 #define POPL_RA(ssp, sp, sp_mask, val, ra)                              \
     {                                                                   \
-        val = (uint32_t)cpu_ldl_kernel_ra(env, (ssp) + (sp & (sp_mask)), ra); \
+        val = (uint32_t)cpu_ldl_data_ra(env, (ssp) + (sp & (sp_mask)), ra); \
         sp += 4;                                                        \
     }
 
@@ -847,7 +847,7 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int,
 
 #define POPQ_RA(sp, val, ra)                    \
     {                                           \
-        val = cpu_ldq_kernel_ra(env, sp, ra);   \
+        val = cpu_ldq_data_ra(env, sp, ra);   \
         sp += 8;                                \
     }
 
@@ -1797,18 +1797,18 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip,
                 PUSHL_RA(ssp, sp, sp_mask, env->segs[R_SS].selector, GETPC());
                 PUSHL_RA(ssp, sp, sp_mask, env->regs[R_ESP], GETPC());
                 for (i = param_count - 1; i >= 0; i--) {
-                    val = cpu_ldl_kernel_ra(env, old_ssp +
-                                            ((env->regs[R_ESP] + i * 4) &
-                                             old_sp_mask), GETPC());
+                    val = cpu_ldl_data_ra(env,
+					  old_ssp + ((env->regs[R_ESP] + i * 4) & old_sp_mask),
+					  GETPC());
                     PUSHL_RA(ssp, sp, sp_mask, val, GETPC());
                 }
             } else {
                 PUSHW_RA(ssp, sp, sp_mask, env->segs[R_SS].selector, GETPC());
                 PUSHW_RA(ssp, sp, sp_mask, env->regs[R_ESP], GETPC());
                 for (i = param_count - 1; i >= 0; i--) {
-                    val = cpu_lduw_kernel_ra(env, old_ssp +
-                                             ((env->regs[R_ESP] + i * 2) &
-                                              old_sp_mask), GETPC());
+                    val = cpu_lduw_data_ra(env,
+					   old_ssp + ((env->regs[R_ESP] + i * 2) & old_sp_mask),
+					   GETPC());
                     PUSHW_RA(ssp, sp, sp_mask, val, GETPC());
                 }
             }
-- 
2.45.2
Re: [PATCH 02/10] target/i386/tcg: Allow IRET from user mode to user mode with SMAP
Posted by Richard Henderson 5 months, 2 weeks ago
On 7/9/24 23:29, Paolo Bonzini wrote:
> This fixes a bug wherein i386/tcg assumed an interrupt return using
> the IRET instruction was always returning from kernel mode to either
> kernel mode or user mode. This assumption is violated when IRET is used
> as a clever way to restore thread state, as for example in the dotnet
> runtime. There, IRET returns from user mode to user mode.
> 
> The fix is to make stack accesses from IRET and RETF, as well as accesses
> to the parameters in a call gate, normal data accesses using the current
> CPL.  The bug manifested itself as a page fault in the guest Linux kernel,
> because SMAP blocked the implicit kernel-mode access to the user stack.
> 
> This bug appears to have been in QEMU since the beginning.
> 
> Analyzed-by: Robert R. Henry<rrh.henry@gmail.com>
> Co-developed-by: Robert R. Henry<rrh.henry@gmail.com>
> Signed-off-by: Robert R. Henry<rrh.henry@gmail.com>
> Signed-off-by: Paolo Bonzini<pbonzini@redhat.com>
> ---
>   target/i386/tcg/seg_helper.c | 18 +++++++++---------
>   1 file changed, 9 insertions(+), 9 deletions(-)

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>


r~
[PATCH 03/10] target/i386/tcg: use PUSHL/PUSHW for error code
Posted by Paolo Bonzini 5 months, 3 weeks ago
Do not pre-decrement esp, let the macros subtract the appropriate
operand size.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/seg_helper.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c
index 4977a5f5b3a..0653bc10936 100644
--- a/target/i386/tcg/seg_helper.c
+++ b/target/i386/tcg/seg_helper.c
@@ -670,22 +670,20 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int,
         }
         shift = switch_tss(env, intno * 8, e1, e2, SWITCH_TSS_CALL, old_eip);
         if (has_error_code) {
-            uint32_t mask;
-
             /* push the error code */
             if (env->segs[R_SS].flags & DESC_B_MASK) {
-                mask = 0xffffffff;
+                sp_mask = 0xffffffff;
             } else {
-                mask = 0xffff;
+                sp_mask = 0xffff;
             }
-            esp = (env->regs[R_ESP] - (2 << shift)) & mask;
-            ssp = env->segs[R_SS].base + esp;
+            esp = env->regs[R_ESP];
+            ssp = env->segs[R_SS].base;
             if (shift) {
-                cpu_stl_kernel(env, ssp, error_code);
+                PUSHL(ssp, esp, sp_mask, error_code);
             } else {
-                cpu_stw_kernel(env, ssp, error_code);
+                PUSHW(ssp, esp, sp_mask, error_code);
             }
-            SET_ESP(esp, mask);
+            SET_ESP(esp, sp_mask);
         }
         return;
     }
-- 
2.45.2
Re: [PATCH 03/10] target/i386/tcg: use PUSHL/PUSHW for error code
Posted by Richard Henderson 5 months, 2 weeks ago
On 7/9/24 23:29, Paolo Bonzini wrote:
> Do not pre-decrement esp, let the macros subtract the appropriate
> operand size.
> 
> Signed-off-by: Paolo Bonzini<pbonzini@redhat.com>
> ---
>   target/i386/tcg/seg_helper.c | 16 +++++++---------
>   1 file changed, 7 insertions(+), 9 deletions(-)

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>

r~
[PATCH 04/10] target/i386/tcg: Reorg push/pop within seg_helper.c
Posted by Paolo Bonzini 5 months, 3 weeks ago
From: Richard Henderson <richard.henderson@linaro.org>

Interrupts and call gates should use accesses with the DPL as
the privilege level.  While computing the applicable MMU index
is easy, the harder part is how to plumb it through the code.

One possibility could be to add a single argument to the PUSH* macros
for the privilege level, but this is repetitive and risks confusion
between the involved privilege levels.

Another possibility is to pass both CPL and DPL, and adjusting both
PUSH* and POP* to use specific privilege levels (instead of using
cpu_{ld,st}*_data). This makes the code more symmetric.

However, a more complicated but much nicer approach is to use a structure
to contain the stack parameters, env, unwind return address, and rewrite
the macros into functions.  The struct provides an easy home for the MMU
index as well.
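
The resulting call pattern, condensed from the hunks below:

    StackAccess sa;

    sa.env = env;
    sa.ra = GETPC();
    sa.sp = env->regs[R_ESP];
    sa.sp_mask = get_sp_mask(env->segs[R_SS].flags);
    sa.ss_base = env->segs[R_SS].base;

    pushl(&sa, env->segs[R_CS].selector);
    pushl(&sa, next_eip);

    SET_ESP(sa.sp, sa.sp_mask);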

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Link: https://lore.kernel.org/r/20240617161210.4639-4-richard.henderson@linaro.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/seg_helper.c | 439 +++++++++++++++++++----------------
 1 file changed, 238 insertions(+), 201 deletions(-)

diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c
index 0653bc10936..6b3de7a2be4 100644
--- a/target/i386/tcg/seg_helper.c
+++ b/target/i386/tcg/seg_helper.c
@@ -579,35 +579,47 @@ int exception_has_error_code(int intno)
     } while (0)
 #endif
 
-/* XXX: add a is_user flag to have proper security support */
-#define PUSHW_RA(ssp, sp, sp_mask, val, ra)                      \
-    {                                                            \
-        sp -= 2;                                                 \
-        cpu_stw_kernel_ra(env, (ssp) + (sp & (sp_mask)), (val), ra); \
-    }
+/* XXX: use mmu_index to have proper DPL support */
+typedef struct StackAccess
+{
+    CPUX86State *env;
+    uintptr_t ra;
+    target_ulong ss_base;
+    target_ulong sp;
+    target_ulong sp_mask;
+} StackAccess;
 
-#define PUSHL_RA(ssp, sp, sp_mask, val, ra)                             \
-    {                                                                   \
-        sp -= 4;                                                        \
-        cpu_stl_kernel_ra(env, (ssp) + (sp & (sp_mask)), (val), ra); \
-    }
+static void pushw(StackAccess *sa, uint16_t val)
+{
+    sa->sp -= 2;
+    cpu_stw_kernel_ra(sa->env, sa->ss_base + (sa->sp & sa->sp_mask),
+                      val, sa->ra);
+}
 
-#define POPW_RA(ssp, sp, sp_mask, val, ra)                       \
-    {                                                            \
-        val = cpu_lduw_data_ra(env, (ssp) + (sp & (sp_mask)), ra); \
-        sp += 2;                                                 \
-    }
+static void pushl(StackAccess *sa, uint32_t val)
+{
+    sa->sp -= 4;
+    cpu_stl_kernel_ra(sa->env, sa->ss_base + (sa->sp & sa->sp_mask),
+                      val, sa->ra);
+}
 
-#define POPL_RA(ssp, sp, sp_mask, val, ra)                              \
-    {                                                                   \
-        val = (uint32_t)cpu_ldl_data_ra(env, (ssp) + (sp & (sp_mask)), ra); \
-        sp += 4;                                                        \
-    }
+static uint16_t popw(StackAccess *sa)
+{
+    uint16_t ret = cpu_lduw_data_ra(sa->env,
+                                    sa->ss_base + (sa->sp & sa->sp_mask),
+                                    sa->ra);
+    sa->sp += 2;
+    return ret;
+}
 
-#define PUSHW(ssp, sp, sp_mask, val) PUSHW_RA(ssp, sp, sp_mask, val, 0)
-#define PUSHL(ssp, sp, sp_mask, val) PUSHL_RA(ssp, sp, sp_mask, val, 0)
-#define POPW(ssp, sp, sp_mask, val) POPW_RA(ssp, sp, sp_mask, val, 0)
-#define POPL(ssp, sp, sp_mask, val) POPL_RA(ssp, sp, sp_mask, val, 0)
+static uint32_t popl(StackAccess *sa)
+{
+    uint32_t ret = cpu_ldl_data_ra(sa->env,
+                                   sa->ss_base + (sa->sp & sa->sp_mask),
+                                   sa->ra);
+    sa->sp += 4;
+    return ret;
+}
 
 /* protected mode interrupt */
 static void do_interrupt_protected(CPUX86State *env, int intno, int is_int,
@@ -615,12 +627,13 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int,
                                    int is_hw)
 {
     SegmentCache *dt;
-    target_ulong ptr, ssp;
+    target_ulong ptr;
     int type, dpl, selector, ss_dpl, cpl;
     int has_error_code, new_stack, shift;
-    uint32_t e1, e2, offset, ss = 0, esp, ss_e1 = 0, ss_e2 = 0;
-    uint32_t old_eip, sp_mask, eflags;
+    uint32_t e1, e2, offset, ss = 0, ss_e1 = 0, ss_e2 = 0;
+    uint32_t old_eip, eflags;
     int vm86 = env->eflags & VM_MASK;
+    StackAccess sa;
     bool set_rf;
 
     has_error_code = 0;
@@ -662,6 +675,9 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int,
         raise_exception_err(env, EXCP0D_GPF, intno * 8 + 2);
     }
 
+    sa.env = env;
+    sa.ra = 0;
+
     if (type == 5) {
         /* task gate */
         /* must do that check here to return the correct error code */
@@ -672,18 +688,18 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int,
         if (has_error_code) {
             /* push the error code */
             if (env->segs[R_SS].flags & DESC_B_MASK) {
-                sp_mask = 0xffffffff;
+                sa.sp_mask = 0xffffffff;
             } else {
-                sp_mask = 0xffff;
+                sa.sp_mask = 0xffff;
             }
-            esp = env->regs[R_ESP];
-            ssp = env->segs[R_SS].base;
+            sa.sp = env->regs[R_ESP];
+            sa.ss_base = env->segs[R_SS].base;
             if (shift) {
-                PUSHL(ssp, esp, sp_mask, error_code);
+                pushl(&sa, error_code);
             } else {
-                PUSHW(ssp, esp, sp_mask, error_code);
+                pushw(&sa, error_code);
             }
-            SET_ESP(esp, sp_mask);
+            SET_ESP(sa.sp, sa.sp_mask);
         }
         return;
     }
@@ -717,6 +733,7 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int,
     }
     if (dpl < cpl) {
         /* to inner privilege */
+        uint32_t esp;
         get_ss_esp_from_tss(env, &ss, &esp, dpl, 0);
         if ((ss & 0xfffc) == 0) {
             raise_exception_err(env, EXCP0A_TSS, ss & 0xfffc);
@@ -740,17 +757,18 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int,
             raise_exception_err(env, EXCP0A_TSS, ss & 0xfffc);
         }
         new_stack = 1;
-        sp_mask = get_sp_mask(ss_e2);
-        ssp = get_seg_base(ss_e1, ss_e2);
+        sa.sp = esp;
+        sa.sp_mask = get_sp_mask(ss_e2);
+        sa.ss_base = get_seg_base(ss_e1, ss_e2);
     } else  {
         /* to same privilege */
         if (vm86) {
             raise_exception_err(env, EXCP0D_GPF, selector & 0xfffc);
         }
         new_stack = 0;
-        sp_mask = get_sp_mask(env->segs[R_SS].flags);
-        ssp = env->segs[R_SS].base;
-        esp = env->regs[R_ESP];
+        sa.sp = env->regs[R_ESP];
+        sa.sp_mask = get_sp_mask(env->segs[R_SS].flags);
+        sa.ss_base = env->segs[R_SS].base;
     }
 
     shift = type >> 3;
@@ -775,36 +793,36 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int,
     if (shift == 1) {
         if (new_stack) {
             if (vm86) {
-                PUSHL(ssp, esp, sp_mask, env->segs[R_GS].selector);
-                PUSHL(ssp, esp, sp_mask, env->segs[R_FS].selector);
-                PUSHL(ssp, esp, sp_mask, env->segs[R_DS].selector);
-                PUSHL(ssp, esp, sp_mask, env->segs[R_ES].selector);
+                pushl(&sa, env->segs[R_GS].selector);
+                pushl(&sa, env->segs[R_FS].selector);
+                pushl(&sa, env->segs[R_DS].selector);
+                pushl(&sa, env->segs[R_ES].selector);
             }
-            PUSHL(ssp, esp, sp_mask, env->segs[R_SS].selector);
-            PUSHL(ssp, esp, sp_mask, env->regs[R_ESP]);
+            pushl(&sa, env->segs[R_SS].selector);
+            pushl(&sa, env->regs[R_ESP]);
         }
-        PUSHL(ssp, esp, sp_mask, eflags);
-        PUSHL(ssp, esp, sp_mask, env->segs[R_CS].selector);
-        PUSHL(ssp, esp, sp_mask, old_eip);
+        pushl(&sa, eflags);
+        pushl(&sa, env->segs[R_CS].selector);
+        pushl(&sa, old_eip);
         if (has_error_code) {
-            PUSHL(ssp, esp, sp_mask, error_code);
+            pushl(&sa, error_code);
         }
     } else {
         if (new_stack) {
             if (vm86) {
-                PUSHW(ssp, esp, sp_mask, env->segs[R_GS].selector);
-                PUSHW(ssp, esp, sp_mask, env->segs[R_FS].selector);
-                PUSHW(ssp, esp, sp_mask, env->segs[R_DS].selector);
-                PUSHW(ssp, esp, sp_mask, env->segs[R_ES].selector);
+                pushw(&sa, env->segs[R_GS].selector);
+                pushw(&sa, env->segs[R_FS].selector);
+                pushw(&sa, env->segs[R_DS].selector);
+                pushw(&sa, env->segs[R_ES].selector);
             }
-            PUSHW(ssp, esp, sp_mask, env->segs[R_SS].selector);
-            PUSHW(ssp, esp, sp_mask, env->regs[R_ESP]);
+            pushw(&sa, env->segs[R_SS].selector);
+            pushw(&sa, env->regs[R_ESP]);
         }
-        PUSHW(ssp, esp, sp_mask, eflags);
-        PUSHW(ssp, esp, sp_mask, env->segs[R_CS].selector);
-        PUSHW(ssp, esp, sp_mask, old_eip);
+        pushw(&sa, eflags);
+        pushw(&sa, env->segs[R_CS].selector);
+        pushw(&sa, old_eip);
         if (has_error_code) {
-            PUSHW(ssp, esp, sp_mask, error_code);
+            pushw(&sa, error_code);
         }
     }
 
@@ -822,10 +840,10 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int,
             cpu_x86_load_seg_cache(env, R_GS, 0, 0, 0, 0);
         }
         ss = (ss & ~3) | dpl;
-        cpu_x86_load_seg_cache(env, R_SS, ss,
-                               ssp, get_seg_limit(ss_e1, ss_e2), ss_e2);
+        cpu_x86_load_seg_cache(env, R_SS, ss, sa.ss_base,
+                               get_seg_limit(ss_e1, ss_e2), ss_e2);
     }
-    SET_ESP(esp, sp_mask);
+    SET_ESP(sa.sp, sa.sp_mask);
 
     selector = (selector & ~3) | dpl;
     cpu_x86_load_seg_cache(env, R_CS, selector,
@@ -837,20 +855,18 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int,
 
 #ifdef TARGET_X86_64
 
-#define PUSHQ_RA(sp, val, ra)                   \
-    {                                           \
-        sp -= 8;                                \
-        cpu_stq_kernel_ra(env, sp, (val), ra);  \
-    }
+static void pushq(StackAccess *sa, uint64_t val)
+{
+    sa->sp -= 8;
+    cpu_stq_kernel_ra(sa->env, sa->sp, val, sa->ra);
+}
 
-#define POPQ_RA(sp, val, ra)                    \
-    {                                           \
-        val = cpu_ldq_data_ra(env, sp, ra);   \
-        sp += 8;                                \
-    }
-
-#define PUSHQ(sp, val) PUSHQ_RA(sp, val, 0)
-#define POPQ(sp, val) POPQ_RA(sp, val, 0)
+static uint64_t popq(StackAccess *sa)
+{
+    uint64_t ret = cpu_ldq_data_ra(sa->env, sa->sp, sa->ra);
+    sa->sp += 8;
+    return ret;
+}
 
 static inline target_ulong get_rsp_from_tss(CPUX86State *env, int level)
 {
@@ -893,8 +909,9 @@ static void do_interrupt64(CPUX86State *env, int intno, int is_int,
     int type, dpl, selector, cpl, ist;
     int has_error_code, new_stack;
     uint32_t e1, e2, e3, ss, eflags;
-    target_ulong old_eip, esp, offset;
+    target_ulong old_eip, offset;
     bool set_rf;
+    StackAccess sa;
 
     has_error_code = 0;
     if (!is_int && !is_hw) {
@@ -962,10 +979,15 @@ static void do_interrupt64(CPUX86State *env, int intno, int is_int,
     if (e2 & DESC_C_MASK) {
         dpl = cpl;
     }
+
+    sa.env = env;
+    sa.ra = 0;
+    sa.sp_mask = -1;
+    sa.ss_base = 0;
     if (dpl < cpl || ist != 0) {
         /* to inner privilege */
         new_stack = 1;
-        esp = get_rsp_from_tss(env, ist != 0 ? ist + 3 : dpl);
+        sa.sp = get_rsp_from_tss(env, ist != 0 ? ist + 3 : dpl);
         ss = 0;
     } else {
         /* to same privilege */
@@ -973,9 +995,9 @@ static void do_interrupt64(CPUX86State *env, int intno, int is_int,
             raise_exception_err(env, EXCP0D_GPF, selector & 0xfffc);
         }
         new_stack = 0;
-        esp = env->regs[R_ESP];
+        sa.sp = env->regs[R_ESP];
     }
-    esp &= ~0xfLL; /* align stack */
+    sa.sp &= ~0xfLL; /* align stack */
 
     /* See do_interrupt_protected.  */
     eflags = cpu_compute_eflags(env);
@@ -983,13 +1005,13 @@ static void do_interrupt64(CPUX86State *env, int intno, int is_int,
         eflags |= RF_MASK;
     }
 
-    PUSHQ(esp, env->segs[R_SS].selector);
-    PUSHQ(esp, env->regs[R_ESP]);
-    PUSHQ(esp, eflags);
-    PUSHQ(esp, env->segs[R_CS].selector);
-    PUSHQ(esp, old_eip);
+    pushq(&sa, env->segs[R_SS].selector);
+    pushq(&sa, env->regs[R_ESP]);
+    pushq(&sa, eflags);
+    pushq(&sa, env->segs[R_CS].selector);
+    pushq(&sa, old_eip);
     if (has_error_code) {
-        PUSHQ(esp, error_code);
+        pushq(&sa, error_code);
     }
 
     /* interrupt gate clear IF mask */
@@ -1002,7 +1024,7 @@ static void do_interrupt64(CPUX86State *env, int intno, int is_int,
         ss = 0 | dpl;
         cpu_x86_load_seg_cache(env, R_SS, ss, 0, 0, dpl << DESC_DPL_SHIFT);
     }
-    env->regs[R_ESP] = esp;
+    env->regs[R_ESP] = sa.sp;
 
     selector = (selector & ~3) | dpl;
     cpu_x86_load_seg_cache(env, R_CS, selector,
@@ -1074,10 +1096,11 @@ static void do_interrupt_real(CPUX86State *env, int intno, int is_int,
                               int error_code, unsigned int next_eip)
 {
     SegmentCache *dt;
-    target_ulong ptr, ssp;
+    target_ulong ptr;
     int selector;
-    uint32_t offset, esp;
+    uint32_t offset;
     uint32_t old_cs, old_eip;
+    StackAccess sa;
 
     /* real mode (simpler!) */
     dt = &env->idt;
@@ -1087,8 +1110,13 @@ static void do_interrupt_real(CPUX86State *env, int intno, int is_int,
     ptr = dt->base + intno * 4;
     offset = cpu_lduw_kernel(env, ptr);
     selector = cpu_lduw_kernel(env, ptr + 2);
-    esp = env->regs[R_ESP];
-    ssp = env->segs[R_SS].base;
+
+    sa.env = env;
+    sa.ra = 0;
+    sa.sp = env->regs[R_ESP];
+    sa.sp_mask = 0xffff;
+    sa.ss_base = env->segs[R_SS].base;
+
     if (is_int) {
         old_eip = next_eip;
     } else {
@@ -1096,12 +1124,12 @@ static void do_interrupt_real(CPUX86State *env, int intno, int is_int,
     }
     old_cs = env->segs[R_CS].selector;
     /* XXX: use SS segment size? */
-    PUSHW(ssp, esp, 0xffff, cpu_compute_eflags(env));
-    PUSHW(ssp, esp, 0xffff, old_cs);
-    PUSHW(ssp, esp, 0xffff, old_eip);
+    pushw(&sa, cpu_compute_eflags(env));
+    pushw(&sa, old_cs);
+    pushw(&sa, old_eip);
 
     /* update processor state */
-    env->regs[R_ESP] = (env->regs[R_ESP] & ~0xffff) | (esp & 0xffff);
+    SET_ESP(sa.sp, sa.sp_mask);
     env->eip = offset;
     env->segs[R_CS].selector = selector;
     env->segs[R_CS].base = (selector << 4);
@@ -1544,21 +1572,23 @@ void helper_ljmp_protected(CPUX86State *env, int new_cs, target_ulong new_eip,
 void helper_lcall_real(CPUX86State *env, uint32_t new_cs, uint32_t new_eip,
                        int shift, uint32_t next_eip)
 {
-    uint32_t esp, esp_mask;
-    target_ulong ssp;
+    StackAccess sa;
+
+    sa.env = env;
+    sa.ra = GETPC();
+    sa.sp = env->regs[R_ESP];
+    sa.sp_mask = get_sp_mask(env->segs[R_SS].flags);
+    sa.ss_base = env->segs[R_SS].base;
 
-    esp = env->regs[R_ESP];
-    esp_mask = get_sp_mask(env->segs[R_SS].flags);
-    ssp = env->segs[R_SS].base;
     if (shift) {
-        PUSHL_RA(ssp, esp, esp_mask, env->segs[R_CS].selector, GETPC());
-        PUSHL_RA(ssp, esp, esp_mask, next_eip, GETPC());
+        pushl(&sa, env->segs[R_CS].selector);
+        pushl(&sa, next_eip);
     } else {
-        PUSHW_RA(ssp, esp, esp_mask, env->segs[R_CS].selector, GETPC());
-        PUSHW_RA(ssp, esp, esp_mask, next_eip, GETPC());
+        pushw(&sa, env->segs[R_CS].selector);
+        pushw(&sa, next_eip);
     }
 
-    SET_ESP(esp, esp_mask);
+    SET_ESP(sa.sp, sa.sp_mask);
     env->eip = new_eip;
     env->segs[R_CS].selector = new_cs;
     env->segs[R_CS].base = (new_cs << 4);
@@ -1570,9 +1600,10 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip,
 {
     int new_stack, i;
     uint32_t e1, e2, cpl, dpl, rpl, selector, param_count;
-    uint32_t ss = 0, ss_e1 = 0, ss_e2 = 0, type, ss_dpl, sp_mask;
+    uint32_t ss = 0, ss_e1 = 0, ss_e2 = 0, type, ss_dpl;
     uint32_t val, limit, old_sp_mask;
-    target_ulong ssp, old_ssp, offset, sp;
+    target_ulong old_ssp, offset;
+    StackAccess sa;
 
     LOG_PCALL("lcall %04x:" TARGET_FMT_lx " s=%d\n", new_cs, new_eip, shift);
     LOG_PCALL_STATE(env_cpu(env));
@@ -1584,6 +1615,10 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip,
     }
     cpl = env->hflags & HF_CPL_MASK;
     LOG_PCALL("desc=%08x:%08x\n", e1, e2);
+
+    sa.env = env;
+    sa.ra = GETPC();
+
     if (e2 & DESC_S_MASK) {
         if (!(e2 & DESC_CS_MASK)) {
             raise_exception_err_ra(env, EXCP0D_GPF, new_cs & 0xfffc, GETPC());
@@ -1611,14 +1646,14 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip,
 #ifdef TARGET_X86_64
         /* XXX: check 16/32 bit cases in long mode */
         if (shift == 2) {
-            target_ulong rsp;
-
             /* 64 bit case */
-            rsp = env->regs[R_ESP];
-            PUSHQ_RA(rsp, env->segs[R_CS].selector, GETPC());
-            PUSHQ_RA(rsp, next_eip, GETPC());
+            sa.sp = env->regs[R_ESP];
+            sa.sp_mask = -1;
+            sa.ss_base = 0;
+            pushq(&sa, env->segs[R_CS].selector);
+            pushq(&sa, next_eip);
             /* from this point, not restartable */
-            env->regs[R_ESP] = rsp;
+            env->regs[R_ESP] = sa.sp;
             cpu_x86_load_seg_cache(env, R_CS, (new_cs & 0xfffc) | cpl,
                                    get_seg_base(e1, e2),
                                    get_seg_limit(e1, e2), e2);
@@ -1626,15 +1661,15 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip,
         } else
 #endif
         {
-            sp = env->regs[R_ESP];
-            sp_mask = get_sp_mask(env->segs[R_SS].flags);
-            ssp = env->segs[R_SS].base;
+            sa.sp = env->regs[R_ESP];
+            sa.sp_mask = get_sp_mask(env->segs[R_SS].flags);
+            sa.ss_base = env->segs[R_SS].base;
             if (shift) {
-                PUSHL_RA(ssp, sp, sp_mask, env->segs[R_CS].selector, GETPC());
-                PUSHL_RA(ssp, sp, sp_mask, next_eip, GETPC());
+                pushl(&sa, env->segs[R_CS].selector);
+                pushl(&sa, next_eip);
             } else {
-                PUSHW_RA(ssp, sp, sp_mask, env->segs[R_CS].selector, GETPC());
-                PUSHW_RA(ssp, sp, sp_mask, next_eip, GETPC());
+                pushw(&sa, env->segs[R_CS].selector);
+                pushw(&sa, next_eip);
             }
 
             limit = get_seg_limit(e1, e2);
@@ -1642,7 +1677,7 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip,
                 raise_exception_err_ra(env, EXCP0D_GPF, new_cs & 0xfffc, GETPC());
             }
             /* from this point, not restartable */
-            SET_ESP(sp, sp_mask);
+            SET_ESP(sa.sp, sa.sp_mask);
             cpu_x86_load_seg_cache(env, R_CS, (new_cs & 0xfffc) | cpl,
                                    get_seg_base(e1, e2), limit, e2);
             env->eip = new_eip;
@@ -1737,13 +1772,13 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip,
             /* to inner privilege */
 #ifdef TARGET_X86_64
             if (shift == 2) {
-                sp = get_rsp_from_tss(env, dpl);
                 ss = dpl;  /* SS = NULL selector with RPL = new CPL */
                 new_stack = 1;
-                sp_mask = 0;
-                ssp = 0;  /* SS base is always zero in IA-32e mode */
+                sa.sp = get_rsp_from_tss(env, dpl);
+                sa.sp_mask = -1;
+                sa.ss_base = 0;  /* SS base is always zero in IA-32e mode */
                 LOG_PCALL("new ss:rsp=%04x:%016llx env->regs[R_ESP]="
-                          TARGET_FMT_lx "\n", ss, sp, env->regs[R_ESP]);
+                          TARGET_FMT_lx "\n", ss, sa.sp, env->regs[R_ESP]);
             } else
 #endif
             {
@@ -1752,7 +1787,6 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip,
                 LOG_PCALL("new ss:esp=%04x:%08x param_count=%d env->regs[R_ESP]="
                           TARGET_FMT_lx "\n", ss, sp32, param_count,
                           env->regs[R_ESP]);
-                sp = sp32;
                 if ((ss & 0xfffc) == 0) {
                     raise_exception_err_ra(env, EXCP0A_TSS, ss & 0xfffc, GETPC());
                 }
@@ -1775,63 +1809,64 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip,
                     raise_exception_err_ra(env, EXCP0A_TSS, ss & 0xfffc, GETPC());
                 }
 
-                sp_mask = get_sp_mask(ss_e2);
-                ssp = get_seg_base(ss_e1, ss_e2);
+                sa.sp = sp32;
+                sa.sp_mask = get_sp_mask(ss_e2);
+                sa.ss_base = get_seg_base(ss_e1, ss_e2);
             }
 
             /* push_size = ((param_count * 2) + 8) << shift; */
-
             old_sp_mask = get_sp_mask(env->segs[R_SS].flags);
             old_ssp = env->segs[R_SS].base;
+
 #ifdef TARGET_X86_64
             if (shift == 2) {
                 /* XXX: verify if new stack address is canonical */
-                PUSHQ_RA(sp, env->segs[R_SS].selector, GETPC());
-                PUSHQ_RA(sp, env->regs[R_ESP], GETPC());
+                pushq(&sa, env->segs[R_SS].selector);
+                pushq(&sa, env->regs[R_ESP]);
                 /* parameters aren't supported for 64-bit call gates */
             } else
 #endif
             if (shift == 1) {
-                PUSHL_RA(ssp, sp, sp_mask, env->segs[R_SS].selector, GETPC());
-                PUSHL_RA(ssp, sp, sp_mask, env->regs[R_ESP], GETPC());
+                pushl(&sa, env->segs[R_SS].selector);
+                pushl(&sa, env->regs[R_ESP]);
                 for (i = param_count - 1; i >= 0; i--) {
                     val = cpu_ldl_data_ra(env,
-					  old_ssp + ((env->regs[R_ESP] + i * 4) & old_sp_mask),
-					  GETPC());
-                    PUSHL_RA(ssp, sp, sp_mask, val, GETPC());
+                                          old_ssp + ((env->regs[R_ESP] + i * 4) & old_sp_mask),
+                                          GETPC());
+                    pushl(&sa, val);
                 }
             } else {
-                PUSHW_RA(ssp, sp, sp_mask, env->segs[R_SS].selector, GETPC());
-                PUSHW_RA(ssp, sp, sp_mask, env->regs[R_ESP], GETPC());
+                pushw(&sa, env->segs[R_SS].selector);
+                pushw(&sa, env->regs[R_ESP]);
                 for (i = param_count - 1; i >= 0; i--) {
                     val = cpu_lduw_data_ra(env,
-					   old_ssp + ((env->regs[R_ESP] + i * 2) & old_sp_mask),
-					   GETPC());
-                    PUSHW_RA(ssp, sp, sp_mask, val, GETPC());
+                                           old_ssp + ((env->regs[R_ESP] + i * 2) & old_sp_mask),
+                                           GETPC());
+                    pushw(&sa, val);
                 }
             }
             new_stack = 1;
         } else {
             /* to same privilege */
-            sp = env->regs[R_ESP];
-            sp_mask = get_sp_mask(env->segs[R_SS].flags);
-            ssp = env->segs[R_SS].base;
+            sa.sp = env->regs[R_ESP];
+            sa.sp_mask = get_sp_mask(env->segs[R_SS].flags);
+            sa.ss_base = env->segs[R_SS].base;
             /* push_size = (4 << shift); */
             new_stack = 0;
         }
 
 #ifdef TARGET_X86_64
         if (shift == 2) {
-            PUSHQ_RA(sp, env->segs[R_CS].selector, GETPC());
-            PUSHQ_RA(sp, next_eip, GETPC());
+            pushq(&sa, env->segs[R_CS].selector);
+            pushq(&sa, next_eip);
         } else
 #endif
         if (shift == 1) {
-            PUSHL_RA(ssp, sp, sp_mask, env->segs[R_CS].selector, GETPC());
-            PUSHL_RA(ssp, sp, sp_mask, next_eip, GETPC());
+            pushl(&sa, env->segs[R_CS].selector);
+            pushl(&sa, next_eip);
         } else {
-            PUSHW_RA(ssp, sp, sp_mask, env->segs[R_CS].selector, GETPC());
-            PUSHW_RA(ssp, sp, sp_mask, next_eip, GETPC());
+            pushw(&sa, env->segs[R_CS].selector);
+            pushw(&sa, next_eip);
         }
 
         /* from this point, not restartable */
@@ -1845,7 +1880,7 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip,
             {
                 ss = (ss & ~3) | dpl;
                 cpu_x86_load_seg_cache(env, R_SS, ss,
-                                       ssp,
+                                       sa.ss_base,
                                        get_seg_limit(ss_e1, ss_e2),
                                        ss_e2);
             }
@@ -1856,7 +1891,7 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip,
                        get_seg_base(e1, e2),
                        get_seg_limit(e1, e2),
                        e2);
-        SET_ESP(sp, sp_mask);
+        SET_ESP(sa.sp, sa.sp_mask);
         env->eip = offset;
     }
 }
@@ -1864,26 +1899,28 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip,
 /* real and vm86 mode iret */
 void helper_iret_real(CPUX86State *env, int shift)
 {
-    uint32_t sp, new_cs, new_eip, new_eflags, sp_mask;
-    target_ulong ssp;
+    uint32_t new_cs, new_eip, new_eflags;
     int eflags_mask;
+    StackAccess sa;
+
+    sa.env = env;
+    sa.ra = GETPC();
+    sa.sp_mask = 0xffff; /* XXXX: use SS segment size? */
+    sa.sp = env->regs[R_ESP];
+    sa.ss_base = env->segs[R_SS].base;
 
-    sp_mask = 0xffff; /* XXXX: use SS segment size? */
-    sp = env->regs[R_ESP];
-    ssp = env->segs[R_SS].base;
     if (shift == 1) {
         /* 32 bits */
-        POPL_RA(ssp, sp, sp_mask, new_eip, GETPC());
-        POPL_RA(ssp, sp, sp_mask, new_cs, GETPC());
-        new_cs &= 0xffff;
-        POPL_RA(ssp, sp, sp_mask, new_eflags, GETPC());
+        new_eip = popl(&sa);
+        new_cs = popl(&sa) & 0xffff;
+        new_eflags = popl(&sa);
     } else {
         /* 16 bits */
-        POPW_RA(ssp, sp, sp_mask, new_eip, GETPC());
-        POPW_RA(ssp, sp, sp_mask, new_cs, GETPC());
-        POPW_RA(ssp, sp, sp_mask, new_eflags, GETPC());
+        new_eip = popw(&sa);
+        new_cs = popw(&sa);
+        new_eflags = popw(&sa);
     }
-    env->regs[R_ESP] = (env->regs[R_ESP] & ~sp_mask) | (sp & sp_mask);
+    SET_ESP(sa.sp, sa.sp_mask);
     env->segs[R_CS].selector = new_cs;
     env->segs[R_CS].base = (new_cs << 4);
     env->eip = new_eip;
@@ -1936,47 +1973,49 @@ static inline void helper_ret_protected(CPUX86State *env, int shift,
     uint32_t new_es, new_ds, new_fs, new_gs;
     uint32_t e1, e2, ss_e1, ss_e2;
     int cpl, dpl, rpl, eflags_mask, iopl;
-    target_ulong ssp, sp, new_eip, new_esp, sp_mask;
+    target_ulong new_eip, new_esp;
+    StackAccess sa;
+
+    sa.env = env;
+    sa.ra = retaddr;
 
 #ifdef TARGET_X86_64
     if (shift == 2) {
-        sp_mask = -1;
+        sa.sp_mask = -1;
     } else
 #endif
     {
-        sp_mask = get_sp_mask(env->segs[R_SS].flags);
+        sa.sp_mask = get_sp_mask(env->segs[R_SS].flags);
     }
-    sp = env->regs[R_ESP];
-    ssp = env->segs[R_SS].base;
+    sa.sp = env->regs[R_ESP];
+    sa.ss_base = env->segs[R_SS].base;
     new_eflags = 0; /* avoid warning */
 #ifdef TARGET_X86_64
     if (shift == 2) {
-        POPQ_RA(sp, new_eip, retaddr);
-        POPQ_RA(sp, new_cs, retaddr);
-        new_cs &= 0xffff;
+        new_eip = popq(&sa);
+        new_cs = popq(&sa) & 0xffff;
         if (is_iret) {
-            POPQ_RA(sp, new_eflags, retaddr);
+            new_eflags = popq(&sa);
         }
     } else
 #endif
     {
         if (shift == 1) {
             /* 32 bits */
-            POPL_RA(ssp, sp, sp_mask, new_eip, retaddr);
-            POPL_RA(ssp, sp, sp_mask, new_cs, retaddr);
-            new_cs &= 0xffff;
+            new_eip = popl(&sa);
+            new_cs = popl(&sa) & 0xffff;
             if (is_iret) {
-                POPL_RA(ssp, sp, sp_mask, new_eflags, retaddr);
+                new_eflags = popl(&sa);
                 if (new_eflags & VM_MASK) {
                     goto return_to_vm86;
                 }
             }
         } else {
             /* 16 bits */
-            POPW_RA(ssp, sp, sp_mask, new_eip, retaddr);
-            POPW_RA(ssp, sp, sp_mask, new_cs, retaddr);
+            new_eip = popw(&sa);
+            new_cs = popw(&sa);
             if (is_iret) {
-                POPW_RA(ssp, sp, sp_mask, new_eflags, retaddr);
+                new_eflags = popw(&sa);
             }
         }
     }
@@ -2012,7 +2051,7 @@ static inline void helper_ret_protected(CPUX86State *env, int shift,
         raise_exception_err_ra(env, EXCP0B_NOSEG, new_cs & 0xfffc, retaddr);
     }
 
-    sp += addend;
+    sa.sp += addend;
     if (rpl == cpl && (!(env->hflags & HF_CS64_MASK) ||
                        ((env->hflags & HF_CS64_MASK) && !is_iret))) {
         /* return to same privilege level */
@@ -2024,21 +2063,19 @@ static inline void helper_ret_protected(CPUX86State *env, int shift,
         /* return to different privilege level */
 #ifdef TARGET_X86_64
         if (shift == 2) {
-            POPQ_RA(sp, new_esp, retaddr);
-            POPQ_RA(sp, new_ss, retaddr);
-            new_ss &= 0xffff;
+            new_esp = popq(&sa);
+            new_ss = popq(&sa) & 0xffff;
         } else
 #endif
         {
             if (shift == 1) {
                 /* 32 bits */
-                POPL_RA(ssp, sp, sp_mask, new_esp, retaddr);
-                POPL_RA(ssp, sp, sp_mask, new_ss, retaddr);
-                new_ss &= 0xffff;
+                new_esp = popl(&sa);
+                new_ss = popl(&sa) & 0xffff;
             } else {
                 /* 16 bits */
-                POPW_RA(ssp, sp, sp_mask, new_esp, retaddr);
-                POPW_RA(ssp, sp, sp_mask, new_ss, retaddr);
+                new_esp = popw(&sa);
+                new_ss = popw(&sa);
             }
         }
         LOG_PCALL("new ss:esp=%04x:" TARGET_FMT_lx "\n",
@@ -2088,14 +2125,14 @@ static inline void helper_ret_protected(CPUX86State *env, int shift,
                        get_seg_base(e1, e2),
                        get_seg_limit(e1, e2),
                        e2);
-        sp = new_esp;
+        sa.sp = new_esp;
 #ifdef TARGET_X86_64
         if (env->hflags & HF_CS64_MASK) {
-            sp_mask = -1;
+            sa.sp_mask = -1;
         } else
 #endif
         {
-            sp_mask = get_sp_mask(ss_e2);
+            sa.sp_mask = get_sp_mask(ss_e2);
         }
 
         /* validate data segments */
@@ -2104,9 +2141,9 @@ static inline void helper_ret_protected(CPUX86State *env, int shift,
         validate_seg(env, R_FS, rpl);
         validate_seg(env, R_GS, rpl);
 
-        sp += addend;
+        sa.sp += addend;
     }
-    SET_ESP(sp, sp_mask);
+    SET_ESP(sa.sp, sa.sp_mask);
     env->eip = new_eip;
     if (is_iret) {
         /* NOTE: 'cpl' is the _old_ CPL */
@@ -2126,12 +2163,12 @@ static inline void helper_ret_protected(CPUX86State *env, int shift,
     return;
 
  return_to_vm86:
-    POPL_RA(ssp, sp, sp_mask, new_esp, retaddr);
-    POPL_RA(ssp, sp, sp_mask, new_ss, retaddr);
-    POPL_RA(ssp, sp, sp_mask, new_es, retaddr);
-    POPL_RA(ssp, sp, sp_mask, new_ds, retaddr);
-    POPL_RA(ssp, sp, sp_mask, new_fs, retaddr);
-    POPL_RA(ssp, sp, sp_mask, new_gs, retaddr);
+    new_esp = popl(&sa);
+    new_ss = popl(&sa);
+    new_es = popl(&sa);
+    new_ds = popl(&sa);
+    new_fs = popl(&sa);
+    new_gs = popl(&sa);
 
     /* modify processor state */
     cpu_load_eflags(env, new_eflags, TF_MASK | AC_MASK | ID_MASK |
-- 
2.45.2
[PATCH 05/10] target/i386/tcg: Introduce x86_mmu_index_{kernel_,}pl
Posted by Paolo Bonzini 5 months, 3 weeks ago
From: Richard Henderson <richard.henderson@linaro.org>

Disconnect mmu index computation from the current pl
as stored in env->hflags.
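
For illustration, a minimal usage sketch (the helper name is the one added
below; "dpl" stands in for a value read from a gate descriptor and is not
part of this patch):

    int idx_cur  = x86_mmu_index_pl(env, env->hflags & HF_CPL_MASK); /* same as x86_cpu_mmu_index() */
    int idx_gate = x86_mmu_index_pl(env, dpl);  /* index for an arbitrary privilege level */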

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Link: https://lore.kernel.org/r/20240617161210.4639-2-richard.henderson@linaro.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/cpu.h | 11 ++---------
 target/i386/cpu.c | 27 ++++++++++++++++++++++++---
 2 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index c43ac01c794..1e121acef54 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -2445,15 +2445,8 @@ static inline bool is_mmu_index_32(int mmu_index)
     return mmu_index & 1;
 }
 
-static inline int cpu_mmu_index_kernel(CPUX86State *env)
-{
-    int mmu_index_32 = (env->hflags & HF_LMA_MASK) ? 0 : 1;
-    int mmu_index_base =
-        !(env->hflags & HF_SMAP_MASK) ? MMU_KNOSMAP64_IDX :
-        ((env->hflags & HF_CPL_MASK) < 3 && (env->eflags & AC_MASK)) ? MMU_KNOSMAP64_IDX : MMU_KSMAP64_IDX;
-
-    return mmu_index_base + mmu_index_32;
-}
+int x86_mmu_index_pl(CPUX86State *env, unsigned pl);
+int cpu_mmu_index_kernel(CPUX86State *env);
 
 #define CC_DST  (env->cc_dst)
 #define CC_SRC  (env->cc_src)
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index c05765eeafc..4688d140c2d 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -8122,18 +8122,39 @@ static bool x86_cpu_has_work(CPUState *cs)
     return x86_cpu_pending_interrupt(cs, cs->interrupt_request) != 0;
 }
 
-static int x86_cpu_mmu_index(CPUState *cs, bool ifetch)
+int x86_mmu_index_pl(CPUX86State *env, unsigned pl)
 {
-    CPUX86State *env = cpu_env(cs);
     int mmu_index_32 = (env->hflags & HF_CS64_MASK) ? 0 : 1;
     int mmu_index_base =
-        (env->hflags & HF_CPL_MASK) == 3 ? MMU_USER64_IDX :
+        pl == 3 ? MMU_USER64_IDX :
         !(env->hflags & HF_SMAP_MASK) ? MMU_KNOSMAP64_IDX :
         (env->eflags & AC_MASK) ? MMU_KNOSMAP64_IDX : MMU_KSMAP64_IDX;
 
     return mmu_index_base + mmu_index_32;
 }
 
+static int x86_cpu_mmu_index(CPUState *cs, bool ifetch)
+{
+    CPUX86State *env = cpu_env(cs);
+    return x86_mmu_index_pl(env, env->hflags & HF_CPL_MASK);
+}
+
+static int x86_mmu_index_kernel_pl(CPUX86State *env, unsigned pl)
+{
+    int mmu_index_32 = (env->hflags & HF_LMA_MASK) ? 0 : 1;
+    int mmu_index_base =
+        !(env->hflags & HF_SMAP_MASK) ? MMU_KNOSMAP64_IDX :
+        (pl < 3 && (env->eflags & AC_MASK)
+         ? MMU_KNOSMAP64_IDX : MMU_KSMAP64_IDX);
+
+    return mmu_index_base + mmu_index_32;
+}
+
+int cpu_mmu_index_kernel(CPUX86State *env)
+{
+    return x86_mmu_index_kernel_pl(env, env->hflags & HF_CPL_MASK);
+}
+
 static void x86_disas_set_info(CPUState *cs, disassemble_info *info)
 {
     X86CPU *cpu = X86_CPU(cs);
-- 
2.45.2
[PATCH 06/10] target/i386/tcg: Compute MMU index once
Posted by Paolo Bonzini 5 months, 3 weeks ago
Add the MMU index to the StackAccess struct, so that it can be cached
or (in the next patch) computed from information that is not in
CPUX86State.
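
For illustration, the intended call pattern looks roughly like this (a
sketch based on the helper_lcall_real hunk below, not new code):

    StackAccess sa;

    sa.env = env;
    sa.ra = GETPC();
    sa.mmu_index = cpu_mmu_index_kernel(env);   /* computed once per helper */
    sa.sp = env->regs[R_ESP];
    sa.sp_mask = get_sp_mask(env->segs[R_SS].flags);
    sa.ss_base = env->segs[R_SS].base;

    pushl(&sa, env->segs[R_CS].selector);       /* every push/pop reuses sa.mmu_index */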

Co-developed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/seg_helper.c | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c
index 6b3de7a2be4..07e3667639a 100644
--- a/target/i386/tcg/seg_helper.c
+++ b/target/i386/tcg/seg_helper.c
@@ -587,36 +587,37 @@ typedef struct StackAccess
     target_ulong ss_base;
     target_ulong sp;
     target_ulong sp_mask;
+    int mmu_index;
 } StackAccess;
 
 static void pushw(StackAccess *sa, uint16_t val)
 {
     sa->sp -= 2;
-    cpu_stw_kernel_ra(sa->env, sa->ss_base + (sa->sp & sa->sp_mask),
-                      val, sa->ra);
+    cpu_stw_mmuidx_ra(sa->env, sa->ss_base + (sa->sp & sa->sp_mask),
+                      val, sa->mmu_index, sa->ra);
 }
 
 static void pushl(StackAccess *sa, uint32_t val)
 {
     sa->sp -= 4;
-    cpu_stl_kernel_ra(sa->env, sa->ss_base + (sa->sp & sa->sp_mask),
-                      val, sa->ra);
+    cpu_stl_mmuidx_ra(sa->env, sa->ss_base + (sa->sp & sa->sp_mask),
+                      val, sa->mmu_index, sa->ra);
 }
 
 static uint16_t popw(StackAccess *sa)
 {
-    uint16_t ret = cpu_lduw_data_ra(sa->env,
-                                    sa->ss_base + (sa->sp & sa->sp_mask),
-                                    sa->ra);
+    uint16_t ret = cpu_lduw_mmuidx_ra(sa->env,
+                                      sa->ss_base + (sa->sp & sa->sp_mask),
+                                      sa->mmu_index, sa->ra);
     sa->sp += 2;
     return ret;
 }
 
 static uint32_t popl(StackAccess *sa)
 {
-    uint32_t ret = cpu_ldl_data_ra(sa->env,
-                                   sa->ss_base + (sa->sp & sa->sp_mask),
-                                   sa->ra);
+    uint32_t ret = cpu_ldl_mmuidx_ra(sa->env,
+                                     sa->ss_base + (sa->sp & sa->sp_mask),
+                                     sa->mmu_index, sa->ra);
     sa->sp += 4;
     return ret;
 }
@@ -677,6 +678,7 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int,
 
     sa.env = env;
     sa.ra = 0;
+    sa.mmu_index = cpu_mmu_index_kernel(env);
 
     if (type == 5) {
         /* task gate */
@@ -858,12 +860,12 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int,
 static void pushq(StackAccess *sa, uint64_t val)
 {
     sa->sp -= 8;
-    cpu_stq_kernel_ra(sa->env, sa->sp, val, sa->ra);
+    cpu_stq_mmuidx_ra(sa->env, sa->sp, val, sa->mmu_index, sa->ra);
 }
 
 static uint64_t popq(StackAccess *sa)
 {
-    uint64_t ret = cpu_ldq_data_ra(sa->env, sa->sp, sa->ra);
+    uint64_t ret = cpu_ldq_mmuidx_ra(sa->env, sa->sp, sa->mmu_index, sa->ra);
     sa->sp += 8;
     return ret;
 }
@@ -982,6 +984,7 @@ static void do_interrupt64(CPUX86State *env, int intno, int is_int,
 
     sa.env = env;
     sa.ra = 0;
+    sa.mmu_index = cpu_mmu_index_kernel(env);
     sa.sp_mask = -1;
     sa.ss_base = 0;
     if (dpl < cpl || ist != 0) {
@@ -1116,6 +1119,7 @@ static void do_interrupt_real(CPUX86State *env, int intno, int is_int,
     sa.sp = env->regs[R_ESP];
     sa.sp_mask = 0xffff;
     sa.ss_base = env->segs[R_SS].base;
+    sa.mmu_index = cpu_mmu_index_kernel(env);
 
     if (is_int) {
         old_eip = next_eip;
@@ -1579,6 +1583,7 @@ void helper_lcall_real(CPUX86State *env, uint32_t new_cs, uint32_t new_eip,
     sa.sp = env->regs[R_ESP];
     sa.sp_mask = get_sp_mask(env->segs[R_SS].flags);
     sa.ss_base = env->segs[R_SS].base;
+    sa.mmu_index = cpu_mmu_index_kernel(env);
 
     if (shift) {
         pushl(&sa, env->segs[R_CS].selector);
@@ -1618,6 +1623,7 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip,
 
     sa.env = env;
     sa.ra = GETPC();
+    sa.mmu_index = cpu_mmu_index_kernel(env);
 
     if (e2 & DESC_S_MASK) {
         if (!(e2 & DESC_CS_MASK)) {
@@ -1905,6 +1911,7 @@ void helper_iret_real(CPUX86State *env, int shift)
 
     sa.env = env;
     sa.ra = GETPC();
+    sa.mmu_index = x86_mmu_index_pl(env, 0);
     sa.sp_mask = 0xffff; /* XXXX: use SS segment size? */
     sa.sp = env->regs[R_ESP];
     sa.ss_base = env->segs[R_SS].base;
@@ -1976,8 +1983,11 @@ static inline void helper_ret_protected(CPUX86State *env, int shift,
     target_ulong new_eip, new_esp;
     StackAccess sa;
 
+    cpl = env->hflags & HF_CPL_MASK;
+
     sa.env = env;
     sa.ra = retaddr;
+    sa.mmu_index = x86_mmu_index_pl(env, cpl);
 
 #ifdef TARGET_X86_64
     if (shift == 2) {
@@ -2032,7 +2042,6 @@ static inline void helper_ret_protected(CPUX86State *env, int shift,
         !(e2 & DESC_CS_MASK)) {
         raise_exception_err_ra(env, EXCP0D_GPF, new_cs & 0xfffc, retaddr);
     }
-    cpl = env->hflags & HF_CPL_MASK;
     rpl = new_cs & 3;
     if (rpl < cpl) {
         raise_exception_err_ra(env, EXCP0D_GPF, new_cs & 0xfffc, retaddr);
-- 
2.45.2
Re: [PATCH 06/10] target/i386/tcg: Compute MMU index once
Posted by Richard Henderson 5 months, 2 weeks ago
On 7/9/24 23:29, Paolo Bonzini wrote:
> Add the MMU index to the StackAccess struct, so that it can be cached
> or (in the next patch) computed from information that is not in
> CPUX86State.
> 
> Co-developed-by: Richard Henderson <richard.henderson@linaro.org>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>


r~
[PATCH 07/10] target/i386/tcg: Use DPL-level accesses for interrupts and call gates
Posted by Paolo Bonzini 5 months, 3 weeks ago
This fixes a bug wherein i386/tcg assumed that an interrupt, or a far
transfer using the CALL or JMP instructions through a call gate, was always
going from kernel or user mode to kernel mode. This assumption is violated
if the gate has a DPL greater than 0.

In addition, the stack accesses should count as explicit, not implicit
("kernel" in QEMU code), so that SMAP is not applied if DPL=3.

Analyzed-by: Robert R. Henry <rrh.henry@gmail.com>
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/249
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/seg_helper.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c
index 07e3667639a..1430f477c43 100644
--- a/target/i386/tcg/seg_helper.c
+++ b/target/i386/tcg/seg_helper.c
@@ -678,7 +678,7 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int,
 
     sa.env = env;
     sa.ra = 0;
-    sa.mmu_index = cpu_mmu_index_kernel(env);
+    sa.mmu_index = x86_mmu_index_pl(env, dpl);
 
     if (type == 5) {
         /* task gate */
@@ -984,7 +984,7 @@ static void do_interrupt64(CPUX86State *env, int intno, int is_int,
 
     sa.env = env;
     sa.ra = 0;
-    sa.mmu_index = cpu_mmu_index_kernel(env);
+    sa.mmu_index = x86_mmu_index_pl(env, dpl);
     sa.sp_mask = -1;
     sa.ss_base = 0;
     if (dpl < cpl || ist != 0) {
@@ -1119,7 +1119,7 @@ static void do_interrupt_real(CPUX86State *env, int intno, int is_int,
     sa.sp = env->regs[R_ESP];
     sa.sp_mask = 0xffff;
     sa.ss_base = env->segs[R_SS].base;
-    sa.mmu_index = cpu_mmu_index_kernel(env);
+    sa.mmu_index = x86_mmu_index_pl(env, 0);
 
     if (is_int) {
         old_eip = next_eip;
@@ -1583,7 +1583,7 @@ void helper_lcall_real(CPUX86State *env, uint32_t new_cs, uint32_t new_eip,
     sa.sp = env->regs[R_ESP];
     sa.sp_mask = get_sp_mask(env->segs[R_SS].flags);
     sa.ss_base = env->segs[R_SS].base;
-    sa.mmu_index = cpu_mmu_index_kernel(env);
+    sa.mmu_index = x86_mmu_index_pl(env, 0);
 
     if (shift) {
         pushl(&sa, env->segs[R_CS].selector);
@@ -1619,17 +1619,17 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip,
         raise_exception_err_ra(env, EXCP0D_GPF, new_cs & 0xfffc, GETPC());
     }
     cpl = env->hflags & HF_CPL_MASK;
+    dpl = (e2 >> DESC_DPL_SHIFT) & 3;
     LOG_PCALL("desc=%08x:%08x\n", e1, e2);
 
     sa.env = env;
     sa.ra = GETPC();
-    sa.mmu_index = cpu_mmu_index_kernel(env);
+    sa.mmu_index = x86_mmu_index_pl(env, dpl);
 
     if (e2 & DESC_S_MASK) {
         if (!(e2 & DESC_CS_MASK)) {
             raise_exception_err_ra(env, EXCP0D_GPF, new_cs & 0xfffc, GETPC());
         }
-        dpl = (e2 >> DESC_DPL_SHIFT) & 3;
         if (e2 & DESC_C_MASK) {
             /* conforming code segment */
             if (dpl > cpl) {
@@ -1691,7 +1691,6 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip,
     } else {
         /* check gate type */
         type = (e2 >> DESC_TYPE_SHIFT) & 0x1f;
-        dpl = (e2 >> DESC_DPL_SHIFT) & 3;
         rpl = new_cs & 3;
 
 #ifdef TARGET_X86_64
-- 
2.45.2
Re: [PATCH 07/10] target/i386/tcg: Use DPL-level accesses for interrupts and call gates
Posted by Richard Henderson 5 months, 2 weeks ago
On 7/9/24 23:29, Paolo Bonzini wrote:
> This fixes a bug wherein i386/tcg assumed an interrupt return using
> the CALL or JMP instructions were always going from kernel or user mode to
> kernel mode, when using a call gate. This assumption is violated if
> the call gate has a DPL that is greater than 0.
> 
> In addition, the stack accesses should count as explicit, not implicit
> ("kernel" in QEMU code), so that SMAP is not applied if DPL=3.
> 
> Analyzed-by: Robert R. Henry<rrh.henry@gmail.com>
> Resolves:https://gitlab.com/qemu-project/qemu/-/issues/249
> Signed-off-by: Paolo Bonzini<pbonzini@redhat.com>
> ---
>   target/i386/tcg/seg_helper.c | 13 ++++++-------
>   1 file changed, 6 insertions(+), 7 deletions(-)

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>


r~
[PATCH 08/10] target/i386/tcg: check for correct busy state before switching to a new task
Posted by Paolo Bonzini 5 months, 3 weeks ago
This step is listed in the Intel manual: "Checks that the new task is available
(call, jump, exception, or interrupt) or busy (IRET return)".

The AMD manual lists the same operation under the "Preventing recursion"
paragraph of "12.3.4 Nesting Tasks", though it is not clear if the processor
checks the busy bit in the IRET case.
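
The check amounts to the following (a restatement of the hunk below, with
the rule spelled out as comments):

    /* CALL/JMP/exception/interrupt: the new TSS must be available (busy bit clear);
     * IRET: the new TSS must be busy (busy bit set); otherwise raise #TS.  */
    if (!!(e2 & DESC_TSS_BUSY_MASK) != (source == SWITCH_TSS_IRET)) {
        raise_exception_err_ra(env, EXCP0A_TSS, tss_selector & 0xfffc, retaddr);
    }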

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/seg_helper.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c
index 1430f477c43..25af9d4a4ec 100644
--- a/target/i386/tcg/seg_helper.c
+++ b/target/i386/tcg/seg_helper.c
@@ -306,6 +306,11 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector,
         old_tss_limit_max = 43;
     }
 
+    /* new TSS must be busy iff the source is an IRET instruction  */
+    if (!!(e2 & DESC_TSS_BUSY_MASK) != (source == SWITCH_TSS_IRET)) {
+        raise_exception_err_ra(env, EXCP0A_TSS, tss_selector & 0xfffc, retaddr);
+    }
+
     /* read all the registers from the new TSS */
     if (type & 8) {
         /* 32 bit */
-- 
2.45.2
Re: [PATCH 08/10] target/i386/tcg: check for correct busy state before switching to a new task
Posted by Richard Henderson 5 months, 2 weeks ago
On 7/9/24 23:29, Paolo Bonzini wrote:
> This step is listed in the Intel manual: "Checks that the new task is available
> (call, jump, exception, or interrupt) or busy (IRET return)".
> 
> The AMD manual lists the same operation under the "Preventing recursion"
> paragraph of "12.3.4 Nesting Tasks", though it is not clear if the processor
> checks the busy bit in the IRET case.
> 
> Signed-off-by: Paolo Bonzini<pbonzini@redhat.com>
> ---
>   target/i386/tcg/seg_helper.c | 5 +++++
>   1 file changed, 5 insertions(+)

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>

r~
[PATCH 09/10] target/i386/tcg: use X86Access for TSS access
Posted by Paolo Bonzini 5 months, 3 weeks ago
This takes care of probing the vaddr range in advance, and is also faster
because it avoids repeated TLB lookups.  It also matches the Intel manual
better, as it says "Checks that the current (old) TSS, new TSS, and all
segment descriptors used in the task switch are paged into system memory";
note however that it's not clear how the processor checks for segment
descriptors, and this check is not included in the AMD manual.
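
In outline, the X86Access pattern is (sketch; any fault is raised by
access_prepare_mmu before the task switch starts mutating state):

    X86Access old, new;

    access_prepare_mmu(&old, env, env->tr.base, old_tss_limit_max,
                       MMU_DATA_STORE, cpu_mmu_index_kernel(env), retaddr);
    access_prepare_mmu(&new, env, tss_base, tss_limit,
                       MMU_DATA_LOAD, cpu_mmu_index_kernel(env), retaddr);

    new_eip = access_ldl(&new, tss_base + 0x20);      /* no further TLB fault possible */
    access_stl(&old, env->tr.base + 0x20, next_eip);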

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/seg_helper.c | 101 ++++++++++++++++++-----------------
 1 file changed, 51 insertions(+), 50 deletions(-)

diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c
index 25af9d4a4ec..77f2c65c3cf 100644
--- a/target/i386/tcg/seg_helper.c
+++ b/target/i386/tcg/seg_helper.c
@@ -27,6 +27,7 @@
 #include "exec/log.h"
 #include "helper-tcg.h"
 #include "seg_helper.h"
+#include "access.h"
 
 int get_pg_mode(CPUX86State *env)
 {
@@ -250,7 +251,7 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector,
                          uint32_t e1, uint32_t e2, int source,
                          uint32_t next_eip, uintptr_t retaddr)
 {
-    int tss_limit, tss_limit_max, type, old_tss_limit_max, old_type, v1, v2, i;
+    int tss_limit, tss_limit_max, type, old_tss_limit_max, old_type, i;
     target_ulong tss_base;
     uint32_t new_regs[8], new_segs[6];
     uint32_t new_eflags, new_eip, new_cr3, new_ldt, new_trap;
@@ -258,6 +259,7 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector,
     SegmentCache *dt;
     int index;
     target_ulong ptr;
+    X86Access old, new;
 
     type = (e2 >> DESC_TYPE_SHIFT) & 0xf;
     LOG_PCALL("switch_tss: sel=0x%04x type=%d src=%d\n", tss_selector, type,
@@ -311,35 +313,44 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector,
         raise_exception_err_ra(env, EXCP0A_TSS, tss_selector & 0xfffc, retaddr);
     }
 
+    /* X86Access avoids memory exceptions during the task switch */
+    access_prepare_mmu(&old, env, env->tr.base, old_tss_limit_max,
+		       MMU_DATA_STORE, cpu_mmu_index_kernel(env), retaddr);
+
+    if (source == SWITCH_TSS_CALL) {
+        /* Probe for future write of parent task */
+        probe_access(env, tss_base, 2, MMU_DATA_STORE,
+		     cpu_mmu_index_kernel(env), retaddr);
+    }
+    access_prepare_mmu(&new, env, tss_base, tss_limit,
+		       MMU_DATA_LOAD, cpu_mmu_index_kernel(env), retaddr);
+
     /* read all the registers from the new TSS */
     if (type & 8) {
         /* 32 bit */
-        new_cr3 = cpu_ldl_kernel_ra(env, tss_base + 0x1c, retaddr);
-        new_eip = cpu_ldl_kernel_ra(env, tss_base + 0x20, retaddr);
-        new_eflags = cpu_ldl_kernel_ra(env, tss_base + 0x24, retaddr);
+        new_cr3 = access_ldl(&new, tss_base + 0x1c);
+        new_eip = access_ldl(&new, tss_base + 0x20);
+        new_eflags = access_ldl(&new, tss_base + 0x24);
         for (i = 0; i < 8; i++) {
-            new_regs[i] = cpu_ldl_kernel_ra(env, tss_base + (0x28 + i * 4),
-                                            retaddr);
+            new_regs[i] = access_ldl(&new, tss_base + (0x28 + i * 4));
         }
         for (i = 0; i < 6; i++) {
-            new_segs[i] = cpu_lduw_kernel_ra(env, tss_base + (0x48 + i * 4),
-                                             retaddr);
+            new_segs[i] = access_ldw(&new, tss_base + (0x48 + i * 4));
         }
-        new_ldt = cpu_lduw_kernel_ra(env, tss_base + 0x60, retaddr);
-        new_trap = cpu_ldl_kernel_ra(env, tss_base + 0x64, retaddr);
+        new_ldt = access_ldw(&new, tss_base + 0x60);
+        new_trap = access_ldl(&new, tss_base + 0x64);
     } else {
         /* 16 bit */
         new_cr3 = 0;
-        new_eip = cpu_lduw_kernel_ra(env, tss_base + 0x0e, retaddr);
-        new_eflags = cpu_lduw_kernel_ra(env, tss_base + 0x10, retaddr);
+        new_eip = access_ldw(&new, tss_base + 0x0e);
+        new_eflags = access_ldw(&new, tss_base + 0x10);
         for (i = 0; i < 8; i++) {
-            new_regs[i] = cpu_lduw_kernel_ra(env, tss_base + (0x12 + i * 2), retaddr);
+            new_regs[i] = access_ldw(&new, tss_base + (0x12 + i * 2));
         }
         for (i = 0; i < 4; i++) {
-            new_segs[i] = cpu_lduw_kernel_ra(env, tss_base + (0x22 + i * 2),
-                                             retaddr);
+            new_segs[i] = access_ldw(&new, tss_base + (0x22 + i * 2));
         }
-        new_ldt = cpu_lduw_kernel_ra(env, tss_base + 0x2a, retaddr);
+        new_ldt = access_ldw(&new, tss_base + 0x2a);
         new_segs[R_FS] = 0;
         new_segs[R_GS] = 0;
         new_trap = 0;
@@ -349,16 +360,6 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector,
      chapters 12.2.5 and 13.2.4 on how to implement TSS Trap bit */
     (void)new_trap;
 
-    /* NOTE: we must avoid memory exceptions during the task switch,
-       so we make dummy accesses before */
-    /* XXX: it can still fail in some cases, so a bigger hack is
-       necessary to valid the TLB after having done the accesses */
-
-    v1 = cpu_ldub_kernel_ra(env, env->tr.base, retaddr);
-    v2 = cpu_ldub_kernel_ra(env, env->tr.base + old_tss_limit_max, retaddr);
-    cpu_stb_kernel_ra(env, env->tr.base, v1, retaddr);
-    cpu_stb_kernel_ra(env, env->tr.base + old_tss_limit_max, v2, retaddr);
-
     /* clear busy bit (it is restartable) */
     if (source == SWITCH_TSS_JMP || source == SWITCH_TSS_IRET) {
         tss_set_busy(env, env->tr.selector, 0, retaddr);
@@ -371,35 +372,35 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector,
     /* save the current state in the old TSS */
     if (old_type & 8) {
         /* 32 bit */
-        cpu_stl_kernel_ra(env, env->tr.base + 0x20, next_eip, retaddr);
-        cpu_stl_kernel_ra(env, env->tr.base + 0x24, old_eflags, retaddr);
-        cpu_stl_kernel_ra(env, env->tr.base + (0x28 + 0 * 4), env->regs[R_EAX], retaddr);
-        cpu_stl_kernel_ra(env, env->tr.base + (0x28 + 1 * 4), env->regs[R_ECX], retaddr);
-        cpu_stl_kernel_ra(env, env->tr.base + (0x28 + 2 * 4), env->regs[R_EDX], retaddr);
-        cpu_stl_kernel_ra(env, env->tr.base + (0x28 + 3 * 4), env->regs[R_EBX], retaddr);
-        cpu_stl_kernel_ra(env, env->tr.base + (0x28 + 4 * 4), env->regs[R_ESP], retaddr);
-        cpu_stl_kernel_ra(env, env->tr.base + (0x28 + 5 * 4), env->regs[R_EBP], retaddr);
-        cpu_stl_kernel_ra(env, env->tr.base + (0x28 + 6 * 4), env->regs[R_ESI], retaddr);
-        cpu_stl_kernel_ra(env, env->tr.base + (0x28 + 7 * 4), env->regs[R_EDI], retaddr);
+        access_stl(&old, env->tr.base + 0x20, next_eip);
+        access_stl(&old, env->tr.base + 0x24, old_eflags);
+        access_stl(&old, env->tr.base + (0x28 + 0 * 4), env->regs[R_EAX]);
+        access_stl(&old, env->tr.base + (0x28 + 1 * 4), env->regs[R_ECX]);
+        access_stl(&old, env->tr.base + (0x28 + 2 * 4), env->regs[R_EDX]);
+        access_stl(&old, env->tr.base + (0x28 + 3 * 4), env->regs[R_EBX]);
+        access_stl(&old, env->tr.base + (0x28 + 4 * 4), env->regs[R_ESP]);
+        access_stl(&old, env->tr.base + (0x28 + 5 * 4), env->regs[R_EBP]);
+        access_stl(&old, env->tr.base + (0x28 + 6 * 4), env->regs[R_ESI]);
+        access_stl(&old, env->tr.base + (0x28 + 7 * 4), env->regs[R_EDI]);
         for (i = 0; i < 6; i++) {
-            cpu_stw_kernel_ra(env, env->tr.base + (0x48 + i * 4),
-                              env->segs[i].selector, retaddr);
+            access_stw(&old, env->tr.base + (0x48 + i * 4),
+                       env->segs[i].selector);
         }
     } else {
         /* 16 bit */
-        cpu_stw_kernel_ra(env, env->tr.base + 0x0e, next_eip, retaddr);
-        cpu_stw_kernel_ra(env, env->tr.base + 0x10, old_eflags, retaddr);
-        cpu_stw_kernel_ra(env, env->tr.base + (0x12 + 0 * 2), env->regs[R_EAX], retaddr);
-        cpu_stw_kernel_ra(env, env->tr.base + (0x12 + 1 * 2), env->regs[R_ECX], retaddr);
-        cpu_stw_kernel_ra(env, env->tr.base + (0x12 + 2 * 2), env->regs[R_EDX], retaddr);
-        cpu_stw_kernel_ra(env, env->tr.base + (0x12 + 3 * 2), env->regs[R_EBX], retaddr);
-        cpu_stw_kernel_ra(env, env->tr.base + (0x12 + 4 * 2), env->regs[R_ESP], retaddr);
-        cpu_stw_kernel_ra(env, env->tr.base + (0x12 + 5 * 2), env->regs[R_EBP], retaddr);
-        cpu_stw_kernel_ra(env, env->tr.base + (0x12 + 6 * 2), env->regs[R_ESI], retaddr);
-        cpu_stw_kernel_ra(env, env->tr.base + (0x12 + 7 * 2), env->regs[R_EDI], retaddr);
+        access_stw(&old, env->tr.base + 0x0e, next_eip);
+        access_stw(&old, env->tr.base + 0x10, old_eflags);
+        access_stw(&old, env->tr.base + (0x12 + 0 * 2), env->regs[R_EAX]);
+        access_stw(&old, env->tr.base + (0x12 + 1 * 2), env->regs[R_ECX]);
+        access_stw(&old, env->tr.base + (0x12 + 2 * 2), env->regs[R_EDX]);
+        access_stw(&old, env->tr.base + (0x12 + 3 * 2), env->regs[R_EBX]);
+        access_stw(&old, env->tr.base + (0x12 + 4 * 2), env->regs[R_ESP]);
+        access_stw(&old, env->tr.base + (0x12 + 5 * 2), env->regs[R_EBP]);
+        access_stw(&old, env->tr.base + (0x12 + 6 * 2), env->regs[R_ESI]);
+        access_stw(&old, env->tr.base + (0x12 + 7 * 2), env->regs[R_EDI]);
         for (i = 0; i < 4; i++) {
-            cpu_stw_kernel_ra(env, env->tr.base + (0x22 + i * 2),
-                              env->segs[i].selector, retaddr);
+            access_stw(&old, env->tr.base + (0x22 + i * 2),
+                       env->segs[i].selector);
         }
     }
 
-- 
2.45.2
Re: [PATCH 09/10] target/i386/tcg: use X86Access for TSS access
Posted by Richard Henderson 5 months, 2 weeks ago
On 7/9/24 23:29, Paolo Bonzini wrote:
> This takes care of probing the vaddr range in advance, and is also faster
> because it avoids repeated TLB lookups.  It also matches the Intel manual
> better, as it says "Checks that the current (old) TSS, new TSS, and all
> segment descriptors used in the task switch are paged into system memory";
> note however that it's not clear how the processor checks for segment
> descriptors, and this check is not included in the AMD manual.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>   target/i386/tcg/seg_helper.c | 101 ++++++++++++++++++-----------------
>   1 file changed, 51 insertions(+), 50 deletions(-)
> 
> diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c
> index 25af9d4a4ec..77f2c65c3cf 100644
> --- a/target/i386/tcg/seg_helper.c
> +++ b/target/i386/tcg/seg_helper.c
> @@ -27,6 +27,7 @@
>   #include "exec/log.h"
>   #include "helper-tcg.h"
>   #include "seg_helper.h"
> +#include "access.h"
>   
>   int get_pg_mode(CPUX86State *env)
>   {
> @@ -250,7 +251,7 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector,
>                            uint32_t e1, uint32_t e2, int source,
>                            uint32_t next_eip, uintptr_t retaddr)
>   {
> -    int tss_limit, tss_limit_max, type, old_tss_limit_max, old_type, v1, v2, i;
> +    int tss_limit, tss_limit_max, type, old_tss_limit_max, old_type, i;
>       target_ulong tss_base;
>       uint32_t new_regs[8], new_segs[6];
>       uint32_t new_eflags, new_eip, new_cr3, new_ldt, new_trap;
> @@ -258,6 +259,7 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector,
>       SegmentCache *dt;
>       int index;
>       target_ulong ptr;
> +    X86Access old, new;
>   
>       type = (e2 >> DESC_TYPE_SHIFT) & 0xf;
>       LOG_PCALL("switch_tss: sel=0x%04x type=%d src=%d\n", tss_selector, type,
> @@ -311,35 +313,44 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector,
>           raise_exception_err_ra(env, EXCP0A_TSS, tss_selector & 0xfffc, retaddr);
>       }
>   
> +    /* X86Access avoids memory exceptions during the task switch */
> +    access_prepare_mmu(&old, env, env->tr.base, old_tss_limit_max,
> +		       MMU_DATA_STORE, cpu_mmu_index_kernel(env), retaddr);
> +
> +    if (source == SWITCH_TSS_CALL) {
> +        /* Probe for future write of parent task */
> +        probe_access(env, tss_base, 2, MMU_DATA_STORE,
> +		     cpu_mmu_index_kernel(env), retaddr);
> +    }
> +    access_prepare_mmu(&new, env, tss_base, tss_limit,
> +		       MMU_DATA_LOAD, cpu_mmu_index_kernel(env), retaddr);

You're computing cpu_mmu_index_kernel 3 times.

This appears to be conservative in that you're requiring only 2 bytes (a minimum) of the 
0x68-byte TSS to be writable.  Is it legal to place the TSS at offset 0xffe of page 0, with 
the balance on page 1, with page 0 writable and page 1 read-only?  Otherwise I would think 
you could just check the entire TSS for writability.

Anyway, after the MMU_DATA_STORE probe, you have proved that 'X86Access new' contains an 
address range that may be stored.  So you can change the SWITCH_TSS_CALL store below to 
access_stw() too.

> @@ -349,16 +360,6 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector,
>        chapters 12.2.5 and 13.2.4 on how to implement TSS Trap bit */
>       (void)new_trap;
>   
> -    /* NOTE: we must avoid memory exceptions during the task switch,
> -       so we make dummy accesses before */
> -    /* XXX: it can still fail in some cases, so a bigger hack is
> -       necessary to valid the TLB after having done the accesses */
> -
> -    v1 = cpu_ldub_kernel_ra(env, env->tr.base, retaddr);
> -    v2 = cpu_ldub_kernel_ra(env, env->tr.base + old_tss_limit_max, retaddr);
> -    cpu_stb_kernel_ra(env, env->tr.base, v1, retaddr);
> -    cpu_stb_kernel_ra(env, env->tr.base + old_tss_limit_max, v2, retaddr);

OMG.

Looks like a fantastic cleanup overall.


r~
Re: [PATCH 09/10] target/i386/tcg: use X86Access for TSS access
Posted by Paolo Bonzini 5 months, 2 weeks ago
Il mer 10 lug 2024, 18:47 Richard Henderson <richard.henderson@linaro.org>
ha scritto:

> On 7/9/24 23:29, Paolo Bonzini wrote:
> > This takes care of probing the vaddr range in advance, and is also faster
> > because it avoids repeated TLB lookups.  It also matches the Intel manual
> > better, as it says "Checks that the current (old) TSS, new TSS, and all
> > segment descriptors used in the task switch are paged into system
> memory";
> > note however that it's not clear how the processor checks for segment
> > descriptors, and this check is not included in the AMD manual.
> >
> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> > ---
> >   target/i386/tcg/seg_helper.c | 101 ++++++++++++++++++-----------------
> >   1 file changed, 51 insertions(+), 50 deletions(-)
> >
> > diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c
> > index 25af9d4a4ec..77f2c65c3cf 100644
> > --- a/target/i386/tcg/seg_helper.c
> > +++ b/target/i386/tcg/seg_helper.c
> > @@ -311,35 +313,44 @@ static int switch_tss_ra(CPUX86State *env, int
> tss_selector,
> >           raise_exception_err_ra(env, EXCP0A_TSS, tss_selector & 0xfffc,
> retaddr);
> >       }
> >
> > +    /* X86Access avoids memory exceptions during the task switch */
> > +    access_prepare_mmu(&old, env, env->tr.base, old_tss_limit_max,
> > +                    MMU_DATA_STORE, cpu_mmu_index_kernel(env), retaddr);
> > +
> > +    if (source == SWITCH_TSS_CALL) {
> > +        /* Probe for future write of parent task */
> > +        probe_access(env, tss_base, 2, MMU_DATA_STORE,
> > +                  cpu_mmu_index_kernel(env), retaddr);
> > +    }
> > +    access_prepare_mmu(&new, env, tss_base, tss_limit,
> > +                    MMU_DATA_LOAD, cpu_mmu_index_kernel(env), retaddr);
>
> You're computing cpu_mmu_index_kernel 3 times.
>

Oh, indeed. Better than 30. :)

This appears to be conservative in that you're requiring only 2 bytes (a
> minimum) of 0x68
> to be writable.  Is it legal to place the TSS at offset 0xffe of page 0,
> with the balance
> on page 1, with page 0 writable and page 1 read-only?


Yes, paging is totally optional here. The only field that is written is the
link.
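
That is, the only store into the new TSS is the back-link word at its very
start (as the squash further down makes explicit):

    access_stw(&new, tss_base, env->tr.selector);   /* previous-task link */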

Otherwise I would think you could
> just check the entire TSS for writability.
>
> Anyway, after the MMU_DATA_STORE probe, you have proved that 'X86Access
> new' contains an
> address range that may be stored.  So you can change the SWITCH_TSS_CALL
> store below to
> access_stw() too.
>

Nice.

> -    /* NOTE: we must avoid memory exceptions during the task switch,
> > -       so we make dummy accesses before */
> > -    /* XXX: it can still fail in some cases, so a bigger hack is
> > -       necessary to valid the TLB after having done the accesses */
> > -
> > -    v1 = cpu_ldub_kernel_ra(env, env->tr.base, retaddr);
> > -    v2 = cpu_ldub_kernel_ra(env, env->tr.base + old_tss_limit_max,
> retaddr);
> > -    cpu_stb_kernel_ra(env, env->tr.base, v1, retaddr);
> > -    cpu_stb_kernel_ra(env, env->tr.base + old_tss_limit_max, v2,
> retaddr);
>
> OMG.
>

Haha, yeah X86Access is perfect here.

Paolo

Looks like a fantastic cleanup overall.
>
>
> r~
>
>
Re: [PATCH 09/10] target/i386/tcg: use X86Access for TSS access
Posted by Paolo Bonzini 5 months, 2 weeks ago
On 7/10/24 20:40, Paolo Bonzini wrote:
> 
> 
> Il mer 10 lug 2024, 18:47 Richard Henderson 
> <richard.henderson@linaro.org <mailto:richard.henderson@linaro.org>> ha 
> scritto:
> 
>     On 7/9/24 23:29, Paolo Bonzini wrote:
>      > This takes care of probing the vaddr range in advance, and is
>     also faster
>      > because it avoids repeated TLB lookups.  It also matches the
>     Intel manual
>      > better, as it says "Checks that the current (old) TSS, new TSS,
>     and all
>      > segment descriptors used in the task switch are paged into system
>     memory";
>      > note however that it's not clear how the processor checks for segment
>      > descriptors, and this check is not included in the AMD manual.
>      >
>      > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com
>     <mailto:pbonzini@redhat.com>>
>      > ---
>      >   target/i386/tcg/seg_helper.c | 101
>     ++++++++++++++++++-----------------
>      >   1 file changed, 51 insertions(+), 50 deletions(-)
>      >
>      > diff --git a/target/i386/tcg/seg_helper.c
>     b/target/i386/tcg/seg_helper.c
>      > index 25af9d4a4ec..77f2c65c3cf 100644
>      > --- a/target/i386/tcg/seg_helper.c
>      > +++ b/target/i386/tcg/seg_helper.c
>      > @@ -311,35 +313,44 @@ static int switch_tss_ra(CPUX86State *env,
>     int tss_selector,
>      >           raise_exception_err_ra(env, EXCP0A_TSS, tss_selector &
>     0xfffc, retaddr);
>      >       }
>      >
>      > +    /* X86Access avoids memory exceptions during the task switch */
>      > +    access_prepare_mmu(&old, env, env->tr.base, old_tss_limit_max,
>      > +                    MMU_DATA_STORE, cpu_mmu_index_kernel(env),
>     retaddr);
>      > +
>      > +    if (source == SWITCH_TSS_CALL) {
>      > +        /* Probe for future write of parent task */
>      > +        probe_access(env, tss_base, 2, MMU_DATA_STORE,
>      > +                  cpu_mmu_index_kernel(env), retaddr);
>      > +    }
>      > +    access_prepare_mmu(&new, env, tss_base, tss_limit,
>      > +                    MMU_DATA_LOAD, cpu_mmu_index_kernel(env),
>     retaddr);
> 
>     You're computing cpu_mmu_index_kernel 3 times.

Squashing this in (easier to review than the whole thing):

diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c
index 4123ff1245e..4edfd26135f 100644
--- a/target/i386/tcg/seg_helper.c
+++ b/target/i386/tcg/seg_helper.c
@@ -321,7 +321,7 @@ static void switch_tss_ra(CPUX86State *env, int tss_selector,
      uint32_t new_eflags, new_eip, new_cr3, new_ldt, new_trap;
      uint32_t old_eflags, eflags_mask;
      SegmentCache *dt;
-    int index;
+    int mmu_index, index;
      target_ulong ptr;
      X86Access old, new;
  
@@ -378,16 +378,17 @@ static void switch_tss_ra(CPUX86State *env, int tss_selector,
      }
  
      /* X86Access avoids memory exceptions during the task switch */
+    mmu_index = cpu_mmu_index_kernel(env);
      access_prepare_mmu(&old, env, env->tr.base, old_tss_limit_max,
-		       MMU_DATA_STORE, cpu_mmu_index_kernel(env), retaddr);
+		       MMU_DATA_STORE, mmu_index, retaddr);
  
      if (source == SWITCH_TSS_CALL) {
          /* Probe for future write of parent task */
          probe_access(env, tss_base, 2, MMU_DATA_STORE,
-		     cpu_mmu_index_kernel(env), retaddr);
+		     mmu_index, retaddr);
      }
      access_prepare_mmu(&new, env, tss_base, tss_limit,
-		       MMU_DATA_LOAD, cpu_mmu_index_kernel(env), retaddr);
+		       MMU_DATA_LOAD, mmu_index, retaddr);
  
      /* read all the registers from the new TSS */
      if (type & 8) {
@@ -468,7 +469,11 @@ static void switch_tss_ra(CPUX86State *env, int tss_selector,
         context */
  
      if (source == SWITCH_TSS_CALL) {
-        cpu_stw_kernel_ra(env, tss_base, env->tr.selector, retaddr);
+        /*
+         * Thanks to the probe_access above, we know the first two
+         * bytes addressed by &new are writable too.
+         */
+        access_stw(&new, tss_base, env->tr.selector);
          new_eflags |= NT_MASK;
      }
  
Paolo


Re: [PATCH 09/10] target/i386/tcg: use X86Access for TSS access
Posted by Richard Henderson 5 months, 2 weeks ago
On 7/10/24 23:28, Paolo Bonzini wrote:
> On 7/10/24 20:40, Paolo Bonzini wrote:
>>
>>
>> Il mer 10 lug 2024, 18:47 Richard Henderson <richard.henderson@linaro.org 
>> <mailto:richard.henderson@linaro.org>> ha scritto:
>>
>>     On 7/9/24 23:29, Paolo Bonzini wrote:
>>      > This takes care of probing the vaddr range in advance, and is
>>     also faster
>>      > because it avoids repeated TLB lookups.  It also matches the
>>     Intel manual
>>      > better, as it says "Checks that the current (old) TSS, new TSS,
>>     and all
>>      > segment descriptors used in the task switch are paged into system
>>     memory";
>>      > note however that it's not clear how the processor checks for segment
>>      > descriptors, and this check is not included in the AMD manual.
>>      >
>>      > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com
>>     <mailto:pbonzini@redhat.com>>
>>      > ---
>>      >   target/i386/tcg/seg_helper.c | 101
>>     ++++++++++++++++++-----------------
>>      >   1 file changed, 51 insertions(+), 50 deletions(-)
>>      >
>>      > diff --git a/target/i386/tcg/seg_helper.c
>>     b/target/i386/tcg/seg_helper.c
>>      > index 25af9d4a4ec..77f2c65c3cf 100644
>>      > --- a/target/i386/tcg/seg_helper.c
>>      > +++ b/target/i386/tcg/seg_helper.c
>>      > @@ -311,35 +313,44 @@ static int switch_tss_ra(CPUX86State *env,
>>     int tss_selector,
>>      >           raise_exception_err_ra(env, EXCP0A_TSS, tss_selector &
>>     0xfffc, retaddr);
>>      >       }
>>      >
>>      > +    /* X86Access avoids memory exceptions during the task switch */
>>      > +    access_prepare_mmu(&old, env, env->tr.base, old_tss_limit_max,
>>      > +                    MMU_DATA_STORE, cpu_mmu_index_kernel(env),
>>     retaddr);
>>      > +
>>      > +    if (source == SWITCH_TSS_CALL) {
>>      > +        /* Probe for future write of parent task */
>>      > +        probe_access(env, tss_base, 2, MMU_DATA_STORE,
>>      > +                  cpu_mmu_index_kernel(env), retaddr);
>>      > +    }
>>      > +    access_prepare_mmu(&new, env, tss_base, tss_limit,
>>      > +                    MMU_DATA_LOAD, cpu_mmu_index_kernel(env),
>>     retaddr);
>>
>>     You're computing cpu_mmu_index_kernel 3 times.
> 
> Squashing this in (easier to review than the whole thing):

Excellent, thanks!


r~

> 
> diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c
> index 4123ff1245e..4edfd26135f 100644
> --- a/target/i386/tcg/seg_helper.c
> +++ b/target/i386/tcg/seg_helper.c
> @@ -321,7 +321,7 @@ static void switch_tss_ra(CPUX86State *env, int tss_selector,
>       uint32_t new_eflags, new_eip, new_cr3, new_ldt, new_trap;
>       uint32_t old_eflags, eflags_mask;
>       SegmentCache *dt;
> -    int index;
> +    int mmu_index, index;
>       target_ulong ptr;
>       X86Access old, new;
> 
> @@ -378,16 +378,17 @@ static void switch_tss_ra(CPUX86State *env, int tss_selector,
>       }
> 
>       /* X86Access avoids memory exceptions during the task switch */
> +    mmu_index = cpu_mmu_index_kernel(env);
>       access_prepare_mmu(&old, env, env->tr.base, old_tss_limit_max,
> -               MMU_DATA_STORE, cpu_mmu_index_kernel(env), retaddr);
> +               MMU_DATA_STORE, mmu_index, retaddr);
> 
>       if (source == SWITCH_TSS_CALL) {
>           /* Probe for future write of parent task */
>           probe_access(env, tss_base, 2, MMU_DATA_STORE,
> -             cpu_mmu_index_kernel(env), retaddr);
> +             mmu_index, retaddr);
>       }
>       access_prepare_mmu(&new, env, tss_base, tss_limit,
> -               MMU_DATA_LOAD, cpu_mmu_index_kernel(env), retaddr);
> +               MMU_DATA_LOAD, mmu_index, retaddr);
> 
>       /* read all the registers from the new TSS */
>       if (type & 8) {
> @@ -468,7 +469,11 @@ static void switch_tss_ra(CPUX86State *env, int tss_selector,
>          context */
> 
>       if (source == SWITCH_TSS_CALL) {
> -        cpu_stw_kernel_ra(env, tss_base, env->tr.selector, retaddr);
> +        /*
> +         * Thanks to the probe_access above, we know the first two
> +         * bytes addressed by &new are writable too.
> +         */
> +        access_stw(&new, tss_base, env->tr.selector);
>           new_eflags |= NT_MASK;
>       }
> 
> Paolo
> 


[PATCH 10/10] target/i386/tcg: save current task state before loading new one
Posted by Paolo Bonzini 5 months, 3 weeks ago
This is how the steps are ordered in the manual.  EFLAGS.NT is
overwritten after the fact in the saved image.
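
In outline, the new ordering in switch_tss_ra() is (sketch; the IRET fixup
is copied from the hunk below):

    /* 1. compute old_eflags and store the outgoing task's EIP, EFLAGS,
     *    general registers and segment selectors into the old TSS;
     * 2. read the incoming task's state from the new TSS;
     * 3. update the busy bits;
     * 4. for IRET only, clear NT and patch the EFLAGS word that was
     *    already written in step 1:
     */
    if (source == SWITCH_TSS_IRET) {
        old_eflags &= ~NT_MASK;
        if (old_type & 8) {
            access_stl(&old, env->tr.base + 0x24, old_eflags);
        } else {
            access_stw(&old, env->tr.base + 0x10, old_eflags);
        }
    }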

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/seg_helper.c | 85 +++++++++++++++++++-----------------
 1 file changed, 45 insertions(+), 40 deletions(-)

diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c
index 77f2c65c3cf..7663e653e84 100644
--- a/target/i386/tcg/seg_helper.c
+++ b/target/i386/tcg/seg_helper.c
@@ -325,6 +325,42 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector,
     access_prepare_mmu(&new, env, tss_base, tss_limit,
 		       MMU_DATA_LOAD, cpu_mmu_index_kernel(env), retaddr);
 
+    /* save the current state in the old TSS */
+    old_eflags = cpu_compute_eflags(env);
+    if (old_type & 8) {
+        /* 32 bit */
+        access_stl(&old, env->tr.base + 0x20, next_eip);
+        access_stl(&old, env->tr.base + 0x24, old_eflags);
+        access_stl(&old, env->tr.base + (0x28 + 0 * 4), env->regs[R_EAX]);
+        access_stl(&old, env->tr.base + (0x28 + 1 * 4), env->regs[R_ECX]);
+        access_stl(&old, env->tr.base + (0x28 + 2 * 4), env->regs[R_EDX]);
+        access_stl(&old, env->tr.base + (0x28 + 3 * 4), env->regs[R_EBX]);
+        access_stl(&old, env->tr.base + (0x28 + 4 * 4), env->regs[R_ESP]);
+        access_stl(&old, env->tr.base + (0x28 + 5 * 4), env->regs[R_EBP]);
+        access_stl(&old, env->tr.base + (0x28 + 6 * 4), env->regs[R_ESI]);
+        access_stl(&old, env->tr.base + (0x28 + 7 * 4), env->regs[R_EDI]);
+        for (i = 0; i < 6; i++) {
+            access_stw(&old, env->tr.base + (0x48 + i * 4),
+                       env->segs[i].selector);
+        }
+    } else {
+        /* 16 bit */
+        access_stw(&old, env->tr.base + 0x0e, next_eip);
+        access_stw(&old, env->tr.base + 0x10, old_eflags);
+        access_stw(&old, env->tr.base + (0x12 + 0 * 2), env->regs[R_EAX]);
+        access_stw(&old, env->tr.base + (0x12 + 1 * 2), env->regs[R_ECX]);
+        access_stw(&old, env->tr.base + (0x12 + 2 * 2), env->regs[R_EDX]);
+        access_stw(&old, env->tr.base + (0x12 + 3 * 2), env->regs[R_EBX]);
+        access_stw(&old, env->tr.base + (0x12 + 4 * 2), env->regs[R_ESP]);
+        access_stw(&old, env->tr.base + (0x12 + 5 * 2), env->regs[R_EBP]);
+        access_stw(&old, env->tr.base + (0x12 + 6 * 2), env->regs[R_ESI]);
+        access_stw(&old, env->tr.base + (0x12 + 7 * 2), env->regs[R_EDI]);
+        for (i = 0; i < 4; i++) {
+            access_stw(&old, env->tr.base + (0x22 + i * 2),
+                       env->segs[i].selector);
+        }
+    }
+
     /* read all the registers from the new TSS */
     if (type & 8) {
         /* 32 bit */
@@ -364,49 +400,16 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector,
     if (source == SWITCH_TSS_JMP || source == SWITCH_TSS_IRET) {
         tss_set_busy(env, env->tr.selector, 0, retaddr);
     }
-    old_eflags = cpu_compute_eflags(env);
+
     if (source == SWITCH_TSS_IRET) {
         old_eflags &= ~NT_MASK;
+        if (old_type & 8) {
+            access_stl(&old, env->tr.base + 0x24, old_eflags);
+        } else {
+            access_stw(&old, env->tr.base + 0x10, old_eflags);
+	}
     }
 
-    /* save the current state in the old TSS */
-    if (old_type & 8) {
-        /* 32 bit */
-        access_stl(&old, env->tr.base + 0x20, next_eip);
-        access_stl(&old, env->tr.base + 0x24, old_eflags);
-        access_stl(&old, env->tr.base + (0x28 + 0 * 4), env->regs[R_EAX]);
-        access_stl(&old, env->tr.base + (0x28 + 1 * 4), env->regs[R_ECX]);
-        access_stl(&old, env->tr.base + (0x28 + 2 * 4), env->regs[R_EDX]);
-        access_stl(&old, env->tr.base + (0x28 + 3 * 4), env->regs[R_EBX]);
-        access_stl(&old, env->tr.base + (0x28 + 4 * 4), env->regs[R_ESP]);
-        access_stl(&old, env->tr.base + (0x28 + 5 * 4), env->regs[R_EBP]);
-        access_stl(&old, env->tr.base + (0x28 + 6 * 4), env->regs[R_ESI]);
-        access_stl(&old, env->tr.base + (0x28 + 7 * 4), env->regs[R_EDI]);
-        for (i = 0; i < 6; i++) {
-            access_stw(&old, env->tr.base + (0x48 + i * 4),
-                       env->segs[i].selector);
-        }
-    } else {
-        /* 16 bit */
-        access_stw(&old, env->tr.base + 0x0e, next_eip);
-        access_stw(&old, env->tr.base + 0x10, old_eflags);
-        access_stw(&old, env->tr.base + (0x12 + 0 * 2), env->regs[R_EAX]);
-        access_stw(&old, env->tr.base + (0x12 + 1 * 2), env->regs[R_ECX]);
-        access_stw(&old, env->tr.base + (0x12 + 2 * 2), env->regs[R_EDX]);
-        access_stw(&old, env->tr.base + (0x12 + 3 * 2), env->regs[R_EBX]);
-        access_stw(&old, env->tr.base + (0x12 + 4 * 2), env->regs[R_ESP]);
-        access_stw(&old, env->tr.base + (0x12 + 5 * 2), env->regs[R_EBP]);
-        access_stw(&old, env->tr.base + (0x12 + 6 * 2), env->regs[R_ESI]);
-        access_stw(&old, env->tr.base + (0x12 + 7 * 2), env->regs[R_EDI]);
-        for (i = 0; i < 4; i++) {
-            access_stw(&old, env->tr.base + (0x22 + i * 2),
-                       env->segs[i].selector);
-        }
-    }
-
-    /* now if an exception occurs, it will occurs in the next task
-       context */
-
     if (source == SWITCH_TSS_CALL) {
         cpu_stw_kernel_ra(env, tss_base, env->tr.selector, retaddr);
         new_eflags |= NT_MASK;
@@ -418,7 +421,9 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector,
     }
 
     /* set the new CPU state */
-    /* from this point, any exception which occurs can give problems */
+
+    /* now if an exception occurs, it will occur in the next task context */
+
     env->cr[0] |= CR0_TS_MASK;
     env->hflags |= HF_TS_MASK;
     env->tr.selector = tss_selector;
-- 
2.45.2
Re: [PATCH 10/10] target/i386/tcg: save current task state before loading new one
Posted by Richard Henderson 5 months, 2 weeks ago
On 7/9/24 23:29, Paolo Bonzini wrote:
> This is how the steps are ordered in the manual.  EFLAGS.NT is
> overwritten after the fact in the saved image.
> 
> Signed-off-by: Paolo Bonzini<pbonzini@redhat.com>
> ---
>   target/i386/tcg/seg_helper.c | 85 +++++++++++++++++++-----------------
>   1 file changed, 45 insertions(+), 40 deletions(-)

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>

r~