[PATCH] tcg: fix guest memory ordering enforcement
Posted by Redha Gouicem 1 year, 12 months ago
This commit allows memory ordering enforcement to be performed more
precisely. The previous scheme with fences always inserted before the memory
access made it impossible to correctly enforce the x86 model on weakly ordered
architectures such as arm. With this change, the memory models of guests can be
defined more precisely, with a fence before and a fence after the access. This
allows for a precise mapping of the ordering that relies less on the types of
fences the host architecture provides.
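
As a rough standalone illustration (not part of the patch; the TCG_MO_* values
mirror tcg/tcg.h, and the host/guest constants assume an aarch64 host running
the x86 guest definitions added below), the masking done by tcg_gen_req_mo()
keeps exactly the ordering bits that the guest requires but the host does not
already guarantee:

    /* Illustrative sketch only: compute which ordering bits still need an
     * explicit fence once the host's implicit guarantees are subtracted. */
    #include <stdio.h>

    enum {
        TCG_MO_LD_LD = 0x01, TCG_MO_ST_LD = 0x02,
        TCG_MO_LD_ST = 0x04, TCG_MO_ST_ST = 0x08,
    };

    #define TCG_TARGET_DEFAULT_MO 0                             /* aarch64 host: weakly ordered */
    #define TCG_GUEST_MO_AFT_LD   (TCG_MO_LD_ST | TCG_MO_LD_LD) /* x86 guest, this patch */
    #define TCG_GUEST_MO_BEF_ST   (TCG_MO_ST_ST)                /* x86 guest, this patch */

    /* Same masking as tcg_gen_req_mo(): drop what the host already orders. */
    static int required_fence(int type)
    {
        return type & ~TCG_TARGET_DEFAULT_MO;
    }

    int main(void)
    {
        printf("fence after a guest load:   0x%x\n", required_fence(TCG_GUEST_MO_AFT_LD));
        printf("fence before a guest store: 0x%x\n", required_fence(TCG_GUEST_MO_BEF_ST));
        return 0;
    }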

Signed-off-by: Redha Gouicem <redha.gouicem@gmail.com>
---
 target/alpha/cpu.h  |  4 ++++
 target/arm/cpu.h    |  4 ++++
 target/avr/cpu.h    |  4 ++++
 target/hppa/cpu.h   |  4 ++++
 target/i386/cpu.h   |  4 ++++
 target/mips/cpu.h   |  4 ++++
 target/ppc/cpu.h    |  4 ++++
 target/riscv/cpu.h  |  4 ++++
 target/s390x/cpu.h  |  4 ++++
 target/xtensa/cpu.h |  4 ++++
 tcg/tcg-op.c        | 19 ++++++++++++-------
 11 files changed, 52 insertions(+), 7 deletions(-)

diff --git a/target/alpha/cpu.h b/target/alpha/cpu.h
index d0abc949a8..00220c66c8 100644
--- a/target/alpha/cpu.h
+++ b/target/alpha/cpu.h
@@ -26,6 +26,10 @@
 
 /* Alpha processors have a weak memory model */
 #define TCG_GUEST_DEFAULT_MO      (0)
+#define TCG_GUEST_MO_BEF_LD       (0)
+#define TCG_GUEST_MO_AFT_LD       (0)
+#define TCG_GUEST_MO_BEF_ST       (0)
+#define TCG_GUEST_MO_AFT_ST       (0)
 
 #define ICACHE_LINE_SIZE 32
 #define DCACHE_LINE_SIZE 32
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index db8ff04449..ec1407dc43 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -29,6 +29,10 @@
 
 /* ARM processors have a weak memory model */
 #define TCG_GUEST_DEFAULT_MO      (0)
+#define TCG_GUEST_MO_BEF_LD       (0)
+#define TCG_GUEST_MO_AFT_LD       (0)
+#define TCG_GUEST_MO_BEF_ST       (0)
+#define TCG_GUEST_MO_AFT_ST       (0)
 
 #ifdef TARGET_AARCH64
 #define KVM_HAVE_MCE_INJECTION 1
diff --git a/target/avr/cpu.h b/target/avr/cpu.h
index 55497f851d..adcab7c88b 100644
--- a/target/avr/cpu.h
+++ b/target/avr/cpu.h
@@ -33,6 +33,10 @@
 #define CPU_RESOLVING_TYPE TYPE_AVR_CPU
 
 #define TCG_GUEST_DEFAULT_MO 0
+#define TCG_GUEST_MO_BEF_LD  0
+#define TCG_GUEST_MO_AFT_LD  0
+#define TCG_GUEST_MO_BEF_ST  0
+#define TCG_GUEST_MO_AFT_ST  0
 
 /*
  * AVR has two memory spaces, data & code.
diff --git a/target/hppa/cpu.h b/target/hppa/cpu.h
index 6f3b6beecf..a1236548cf 100644
--- a/target/hppa/cpu.h
+++ b/target/hppa/cpu.h
@@ -29,6 +29,10 @@
    a weak memory model, but with TLB bits that force ordering on a per-page
    basis.  It's probably easier to fall back to a strong memory model.  */
 #define TCG_GUEST_DEFAULT_MO        TCG_MO_ALL
+#define TCG_GUEST_MO_BEF_LD       (TCG_MO_LD_LD | TCG_MO_ST_LD)
+#define TCG_GUEST_MO_AFT_LD       (0)
+#define TCG_GUEST_MO_BEF_ST       (TCG_MO_ST_ST | TCG_MO_LD_ST)
+#define TCG_GUEST_MO_AFT_ST       (0)
 
 #define MMU_KERNEL_IDX   0
 #define MMU_USER_IDX     3
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 9661f9fbd1..c6a7052d58 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -29,6 +29,10 @@
 
 /* The x86 has a strong memory model with some store-after-load re-ordering */
 #define TCG_GUEST_DEFAULT_MO      (TCG_MO_ALL & ~TCG_MO_ST_LD)
+#define TCG_GUEST_MO_BEF_LD       (0)
+#define TCG_GUEST_MO_AFT_LD       (TCG_MO_LD_ST | TCG_MO_LD_LD)
+#define TCG_GUEST_MO_BEF_ST       (TCG_MO_ST_ST)
+#define TCG_GUEST_MO_AFT_ST       (0)
 
 #define KVM_HAVE_MCE_INJECTION 1
 
diff --git a/target/mips/cpu.h b/target/mips/cpu.h
index 5335ac10a3..c2c0ca9c4a 100644
--- a/target/mips/cpu.h
+++ b/target/mips/cpu.h
@@ -8,6 +8,10 @@
 #include "mips-defs.h"
 
 #define TCG_GUEST_DEFAULT_MO (0)
+#define TCG_GUEST_MO_BEF_LD  (0)
+#define TCG_GUEST_MO_AFT_LD  (0)
+#define TCG_GUEST_MO_BEF_ST  (0)
+#define TCG_GUEST_MO_AFT_ST  (0)
 
 typedef struct CPUMIPSTLBContext CPUMIPSTLBContext;
 
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index c2b6c987c0..2763ba465a 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -27,6 +27,10 @@
 #include "qom/object.h"
 
 #define TCG_GUEST_DEFAULT_MO 0
+#define TCG_GUEST_MO_BEF_LD  0
+#define TCG_GUEST_MO_AFT_LD  0
+#define TCG_GUEST_MO_BEF_ST  0
+#define TCG_GUEST_MO_AFT_ST  0
 
 #define TARGET_PAGE_BITS_64K 16
 #define TARGET_PAGE_BITS_16M 24
diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
index 34c22d5d3b..5f3a9dd463 100644
--- a/target/riscv/cpu.h
+++ b/target/riscv/cpu.h
@@ -29,6 +29,10 @@
 #include "cpu_bits.h"
 
 #define TCG_GUEST_DEFAULT_MO 0
+#define TCG_GUEST_MO_BEF_LD  0
+#define TCG_GUEST_MO_AFT_LD  0
+#define TCG_GUEST_MO_BEF_ST  0
+#define TCG_GUEST_MO_AFT_ST  0
 
 #define TYPE_RISCV_CPU "riscv-cpu"
 
diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h
index 7d6d01325b..59684809cd 100644
--- a/target/s390x/cpu.h
+++ b/target/s390x/cpu.h
@@ -34,6 +34,10 @@
 
 /* The z/Architecture has a strong memory model with some store-after-load re-ordering */
 #define TCG_GUEST_DEFAULT_MO      (TCG_MO_ALL & ~TCG_MO_ST_LD)
+#define TCG_GUEST_MO_BEF_LD       (0)
+#define TCG_GUEST_MO_AFT_LD       (TCG_MO_LD_ST | TCG_MO_LD_LD)
+#define TCG_GUEST_MO_BEF_ST       (TCG_MO_ST_ST)
+#define TCG_GUEST_MO_AFT_ST       (0)
 
 #define TARGET_INSN_START_EXTRA_WORDS 2
 
diff --git a/target/xtensa/cpu.h b/target/xtensa/cpu.h
index d4b8268146..29ef6ec5b7 100644
--- a/target/xtensa/cpu.h
+++ b/target/xtensa/cpu.h
@@ -35,6 +35,10 @@
 
 /* Xtensa processors have a weak memory model */
 #define TCG_GUEST_DEFAULT_MO      (0)
+#define TCG_GUEST_MO_BEF_LD       (0)
+#define TCG_GUEST_MO_AFT_LD       (0)
+#define TCG_GUEST_MO_BEF_ST       (0)
+#define TCG_GUEST_MO_AFT_ST       (0)
 
 enum {
     /* Additional instructions */
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 5d48537927..d9b43bc38a 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -2834,9 +2834,6 @@ static void gen_ldst_i64(TCGOpcode opc, TCGv_i64 val, TCGv addr,
 
 static void tcg_gen_req_mo(TCGBar type)
 {
-#ifdef TCG_GUEST_DEFAULT_MO
-    type &= TCG_GUEST_DEFAULT_MO;
-#endif
     type &= ~TCG_TARGET_DEFAULT_MO;
     if (type) {
         tcg_gen_mb(type | TCG_BAR_SC);
@@ -2873,7 +2870,7 @@ void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg idx, MemOp memop)
     MemOp orig_memop;
     MemOpIdx oi;
 
-    tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD);
+    tcg_gen_req_mo(TCG_GUEST_MO_BEF_LD);
     memop = tcg_canonicalize_memop(memop, 0, 0);
     oi = make_memop_idx(memop, idx);
 
@@ -2904,6 +2901,8 @@ void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg idx, MemOp memop)
             g_assert_not_reached();
         }
     }
+
+    tcg_gen_req_mo(TCG_GUEST_MO_AFT_LD);
 }
 
 void tcg_gen_qemu_st_i32(TCGv_i32 val, TCGv addr, TCGArg idx, MemOp memop)
@@ -2911,7 +2910,7 @@ void tcg_gen_qemu_st_i32(TCGv_i32 val, TCGv addr, TCGArg idx, MemOp memop)
     TCGv_i32 swap = NULL;
     MemOpIdx oi;
 
-    tcg_gen_req_mo(TCG_MO_LD_ST | TCG_MO_ST_ST);
+    tcg_gen_req_mo(TCG_GUEST_MO_BEF_ST);
     memop = tcg_canonicalize_memop(memop, 0, 1);
     oi = make_memop_idx(memop, idx);
 
@@ -2942,6 +2941,8 @@ void tcg_gen_qemu_st_i32(TCGv_i32 val, TCGv addr, TCGArg idx, MemOp memop)
     if (swap) {
         tcg_temp_free_i32(swap);
     }
+
+    tcg_gen_req_mo(TCG_GUEST_MO_AFT_ST);
 }
 
 void tcg_gen_qemu_ld_i64(TCGv_i64 val, TCGv addr, TCGArg idx, MemOp memop)
@@ -2959,7 +2960,7 @@ void tcg_gen_qemu_ld_i64(TCGv_i64 val, TCGv addr, TCGArg idx, MemOp memop)
         return;
     }
 
-    tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD);
+    tcg_gen_req_mo(TCG_GUEST_MO_BEF_LD);
     memop = tcg_canonicalize_memop(memop, 1, 0);
     oi = make_memop_idx(memop, idx);
 
@@ -2994,6 +2995,8 @@ void tcg_gen_qemu_ld_i64(TCGv_i64 val, TCGv addr, TCGArg idx, MemOp memop)
             g_assert_not_reached();
         }
     }
+
+    tcg_gen_req_mo(TCG_GUEST_MO_AFT_LD);
 }
 
 void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg idx, MemOp memop)
@@ -3006,7 +3009,7 @@ void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg idx, MemOp memop)
         return;
     }
 
-    tcg_gen_req_mo(TCG_MO_LD_ST | TCG_MO_ST_ST);
+    tcg_gen_req_mo(TCG_GUEST_MO_BEF_ST);
     memop = tcg_canonicalize_memop(memop, 1, 1);
     oi = make_memop_idx(memop, idx);
 
@@ -3036,6 +3039,8 @@ void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg idx, MemOp memop)
     if (swap) {
         tcg_temp_free_i64(swap);
     }
+
+    tcg_gen_req_mo(TCG_GUEST_MO_AFT_ST);
 }
 
 static void tcg_gen_ext_i32(TCGv_i32 ret, TCGv_i32 val, MemOp opc)
-- 
2.36.0
Re: [PATCH] tcg: fix guest memory ordering enforcement
Posted by Richard Henderson 1 year, 12 months ago
On 4/28/22 04:32, Redha Gouicem wrote:
> This commit allows memory ordering enforcement to be performed more
> precisely. The previous scheme with fences always inserted before the memory
> access made it impossible to correctly enforce the x86 model on weakly ordered
> architectures such as arm. With this change, the memory models of guests can be
> defined more precisely, with a fence before and a fence after the access. This
> allows for a precise mapping of the ordering that relies less on the types of
> fences the host architecture provides.
> 
> Signed-off-by: Redha Gouicem <redha.gouicem@gmail.com>
> ---
>   target/alpha/cpu.h  |  4 ++++
>   target/arm/cpu.h    |  4 ++++
>   target/avr/cpu.h    |  4 ++++
>   target/hppa/cpu.h   |  4 ++++
>   target/i386/cpu.h   |  4 ++++
>   target/mips/cpu.h   |  4 ++++
>   target/ppc/cpu.h    |  4 ++++
>   target/riscv/cpu.h  |  4 ++++
>   target/s390x/cpu.h  |  4 ++++
>   target/xtensa/cpu.h |  4 ++++
>   tcg/tcg-op.c        | 19 ++++++++++++-------
>   11 files changed, 52 insertions(+), 7 deletions(-)

First of all, patches that don't compile aren't particularly helpful.  You've missed out 
on updating 10 of 20 targets.

I do not believe your assertion that the current scheme "makes it impossible to correctly 
enforce the memory model".  Please provide your rationale.

>   /* Alpha processors have a weak memory model */
>   #define TCG_GUEST_DEFAULT_MO      (0)
> +#define TCG_GUEST_MO_BEF_LD       (0)
> +#define TCG_GUEST_MO_AFT_LD       (0)
> +#define TCG_GUEST_MO_BEF_ST       (0)
> +#define TCG_GUEST_MO_AFT_ST       (0)

There's no new information here.

> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index 9661f9fbd1..c6a7052d58 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -29,6 +29,10 @@
>   
>   /* The x86 has a strong memory model with some store-after-load re-ordering */
>   #define TCG_GUEST_DEFAULT_MO      (TCG_MO_ALL & ~TCG_MO_ST_LD)
> +#define TCG_GUEST_MO_BEF_LD       (0)
> +#define TCG_GUEST_MO_AFT_LD       (TCG_MO_LD_ST | TCG_MO_LD_LD)
> +#define TCG_GUEST_MO_BEF_ST       (TCG_MO_ST_ST)
> +#define TCG_GUEST_MO_AFT_ST       (0)

Or even here -- you're making the executive decision that barriers go after loads and 
before stores.  Note that

   TCG_GUEST_MO_AFT_LD == (TCG_GUEST_DEFAULT_MO & (TCG_MO_LD_ST | TCG_MO_LD_LD))
   TCG_GUEST_MO_BEF_ST == (TCG_GUEST_DEFAULT_MO & TCG_MO_ST_ST)
   TCG_GUEST_MO_AFT_ST == (TCG_GUEST_DEFAULT_MO & TCG_MO_ST_LD)

You could just as easily make this selection in tcg-op.c alone.
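
A rough sketch of that alternative, purely hypothetical (the GUEST_MO_* macro
names below are made up for illustration and appear neither in the patch nor
in QEMU), deriving the per-access masks from TCG_GUEST_DEFAULT_MO inside
tcg/tcg-op.c so that no target needs new definitions:

    /* Hypothetical: select barrier placement locally, using only each
     * target's existing TCG_GUEST_DEFAULT_MO value. */
    #ifndef TCG_GUEST_DEFAULT_MO
    #define TCG_GUEST_DEFAULT_MO 0
    #endif

    #define GUEST_MO_BEF_LD  0
    #define GUEST_MO_AFT_LD  (TCG_GUEST_DEFAULT_MO & (TCG_MO_LD_LD | TCG_MO_LD_ST))
    #define GUEST_MO_BEF_ST  (TCG_GUEST_DEFAULT_MO & TCG_MO_ST_ST)
    #define GUEST_MO_AFT_ST  (TCG_GUEST_DEFAULT_MO & TCG_MO_ST_LD)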

I'm not sure why putting a ST_ST barrier before a store is better or worse than putting a 
ST_ST barrier after the store.  I could imagine that putting wmb barriers before the 
operation schedules better with the surrounding code, but I'd want that decision 
explicitly stated.

There *are* still missing barriers with the cpu_ld*_data* and cpu_st*_data* functions, as 
used by out-of-line helpers.  Certainly I expect those to have less impact than the 
generated code, but they are still required.
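
For context, a minimal hypothetical example of such a helper (cpu_ldl_data()
is the real out-of-line access path; the helper itself and its name are
invented for illustration):

    #include "qemu/osdep.h"
    #include "cpu.h"
    #include "exec/helper-proto.h"
    #include "exec/cpu_ldst.h"

    /* A target helper called out of line from generated code.  The access
     * goes through cpu_ldl_data() rather than a tcg_gen_qemu_ld_* expansion,
     * so none of the barriers handled in tcg/tcg-op.c are emitted around it. */
    uint32_t HELPER(example_load)(CPUArchState *env, target_ulong addr)
    {
        return cpu_ldl_data(env, addr);
    }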


r~