[PATCH RFCv1 05/10] hw/arm/virt: Add VIRT_NESTED_SMMU

Posted by Nicolin Chen 2 months, 3 weeks ago
VIRT_SMMU can be used as an emulated SMMU, i.e. iommu=smmuv3. However, the
MMIO space for VIRT_SMMU isn't large enough to accommodate multiple nested
SMMUv3 instances. Add a new VIRT_NESTED_SMMU region with its own MMIO space.

Meanwhile, a nested SMMUv3 can only work with a vfio-pci device that is
physically associated with a host-level SMMUv3 instance, meaning that such a
passthrough device must be linked to the nested SMMU corresponding to the
underlying host-level SMMUv3 instance:
      HW SMMU0 <-----------------> vSMMU0 (nested)
          ^                             ^
          |                             |
          ----> PCI dev0 & PCI dev1 <----
      HW SMMU1 <-----------------> vSMMU1 (nested)
          ^                             ^
          |                             |
          ----> PCI dev2 & PCI dev3 <----

Note that, on the other hand, an emulated device should not be associated
with a nested SMMU.

To prepare for that, create a PCIe Expander Bridge per nested SMMU to act
as a dock for vfio-pci devices. This eases the QEMU code that builds the
ID mappings in the IORT.
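
For example, a VM whose passthrough devices sit behind two different host
SMMUv3 instances could be launched roughly like this (the machine-option
spelling is assumed from the VIRT_IOMMU_NESTED_SMMUV3 enum, and the host
BDFs are made up):

    qemu-system-aarch64 -M virt,iommu=nested-smmuv3 ... \
        -device vfio-pci,host=0000:01:00.0 \
        -device vfio-pci,host=0000:81:00.0

where each vfio-pci device is expected to end up behind the
pxb_for_smmu.<n> bridge corresponding to its underlying host SMMUv3
instance.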

Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 hw/arm/virt.c         | 110 +++++++++++++++++++++++++++++++++++++-----
 include/hw/arm/virt.h |  17 +++++++
 2 files changed, 116 insertions(+), 11 deletions(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index ef59c79ca3..a54332fca8 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -55,6 +55,7 @@
 #include "qemu/bitops.h"
 #include "qemu/error-report.h"
 #include "qemu/module.h"
+#include "hw/pci/pci_bus.h"
 #include "hw/pci-host/gpex.h"
 #include "hw/virtio/virtio-pci.h"
 #include "hw/core/sysbus-fdt.h"
@@ -83,6 +84,7 @@
 #include "hw/virtio/virtio-md-pci.h"
 #include "hw/virtio/virtio-iommu.h"
 #include "hw/char/pl011.h"
+#include "qemu/config-file.h"
 #include "qemu/guest-random.h"
 
 static GlobalProperty arm_virt_compat[] = {
@@ -177,6 +179,7 @@ static const MemMapEntry base_memmap[] = {
     [VIRT_PVTIME] =             { 0x090a0000, 0x00010000 },
     [VIRT_SECURE_GPIO] =        { 0x090b0000, 0x00001000 },
     [VIRT_MMIO] =               { 0x0a000000, 0x00000200 },
+    [VIRT_NESTED_SMMU] =        { 0x0b000000, 0x01000000 },
     /* ...repeating for a total of NUM_VIRTIO_TRANSPORTS, each of that size */
     [VIRT_PLATFORM_BUS] =       { 0x0c000000, 0x02000000 },
     [VIRT_SECURE_MEM] =         { 0x0e000000, 0x01000000 },
@@ -221,7 +224,8 @@ static const int a15irqmap[] = {
     [VIRT_MMIO] = 16, /* ...to 16 + NUM_VIRTIO_TRANSPORTS - 1 */
     [VIRT_GIC_V2M] = 48, /* ...to 48 + NUM_GICV2M_SPIS - 1 */
     [VIRT_SMMU] = 74,    /* ...to 74 + NUM_SMMU_IRQS - 1 */
-    [VIRT_PLATFORM_BUS] = 112, /* ...to 112 + PLATFORM_BUS_NUM_IRQS -1 */
+    [VIRT_PLATFORM_BUS] = 112, /* ...to 112 + PLATFORM_BUS_NUM_IRQS -1 (179) */
+    [VIRT_NESTED_SMMU] = 200, /* Keep it at the end of list */
 };
 
 static void create_randomness(MachineState *ms, const char *node)
@@ -1383,21 +1387,19 @@ static void create_pcie_irq_map(const MachineState *ms,
                            0x7           /* PCI irq */);
 }
 
-static void create_smmu(const VirtMachineState *vms,
-                        PCIBus *bus)
+static DeviceState *_create_smmu(const VirtMachineState *vms, PCIBus *bus,
+                                 hwaddr base, hwaddr size, int irq,
+                                 uint32_t smmu_phandle)
 {
     char *node;
     const char compat[] = "arm,smmu-v3";
-    int irq =  vms->irqmap[VIRT_SMMU];
     int i;
-    hwaddr base = vms->memmap[VIRT_SMMU].base;
-    hwaddr size = vms->memmap[VIRT_SMMU].size;
     const char irq_names[] = "eventq\0priq\0cmdq-sync\0gerror";
     DeviceState *dev;
     MachineState *ms = MACHINE(vms);
 
-    if (!virt_has_smmuv3(vms) || !vms->iommu_phandle) {
-        return;
+    if (!virt_has_smmuv3(vms) || !smmu_phandle) {
+        return NULL;
     }
 
     dev = qdev_new(TYPE_ARM_SMMUV3);
@@ -1436,8 +1438,31 @@ static void create_smmu(const VirtMachineState *vms,
 
     qemu_fdt_setprop_cell(ms->fdt, node, "#iommu-cells", 1);
 
-    qemu_fdt_setprop_cell(ms->fdt, node, "phandle", vms->iommu_phandle);
+    qemu_fdt_setprop_cell(ms->fdt, node, "phandle", smmu_phandle);
     g_free(node);
+
+    return dev;
+}
+
+static DeviceState *create_smmu(const VirtMachineState *vms, PCIBus *bus)
+{
+    hwaddr base = vms->memmap[VIRT_SMMU].base;
+    hwaddr size = vms->memmap[VIRT_SMMU].size;
+    int irq = vms->irqmap[VIRT_SMMU];
+
+    return _create_smmu(vms, bus, base, size, irq, vms->iommu_phandle);
+}
+
+static DeviceState *create_nested_smmu(const VirtMachineState *vms, PCIBus *bus,
+                                       int i)
+{
+    hwaddr base = vms->memmap[VIRT_NESTED_SMMU].base + i * SMMU_IO_LEN;
+    int irq = vms->irqmap[VIRT_SMMU] + i * NUM_SMMU_IRQS;
+    hwaddr size = SMMU_IO_LEN;
+    DeviceState *dev;
+
+    dev = _create_smmu(vms, bus, base, size, irq, vms->nested_smmu_phandle[i]);
+    return dev;
 }
 
 static void create_virtio_iommu_dt_bindings(VirtMachineState *vms)
@@ -1466,6 +1491,48 @@ static void create_virtio_iommu_dt_bindings(VirtMachineState *vms)
                            bdf + 1, vms->iommu_phandle, bdf + 1, 0xffff - bdf);
 }
 
+/*
+ * FIXME this is used to reserve bus numbers for hotplug devices, yet it
+ * could result in a big waste of PCI bus numbers.
+ */
+#define MAX_DEV_PER_NESTED_SMMU (4)
+
+static PCIBus *create_pcie_expander_bridge(VirtMachineState *vms, uint8_t idx)
+{
+    /* Total = PXB + MAX_DEV_PER_NESTED_SMMU x (Rootport + device) */
+    const uint8_t total_bus_nums = 1 + MAX_DEV_PER_NESTED_SMMU * 2;
+    uint8_t bus_nr = PCI_BUS_MAX -
+                     (vms->num_nested_smmus - idx) * total_bus_nums;
+
+    VirtNestedSmmu *nested_smmu = find_nested_smmu_by_index(vms, idx);
+    char *name_pxb = g_strdup_printf("pxb_for_smmu.%d", idx);
+    PCIBus *bus = vms->bus;
+    DeviceState *dev;
+
+    /* Create an expander bridge */
+    dev = qdev_new("pxb-pcie");
+    if (!qdev_set_id(dev, name_pxb, &error_fatal)) {
+        return NULL;
+    }
+
+    qdev_prop_set_uint8(dev, "bus_nr", bus_nr);
+    qdev_prop_set_uint16(dev, "numa_node", idx);
+    qdev_realize_and_unref(dev, BUS(bus), &error_fatal);
+
+    /* Get the pxb bus */
+    QLIST_FOREACH(bus, &bus->child, sibling) {
+        if (pci_bus_num(bus) == bus_nr) {
+            break;
+        }
+    }
+    g_assert(bus && pci_bus_num(bus) == bus_nr);
+    nested_smmu->pci_bus = bus;
+    nested_smmu->reserved_bus_nums = total_bus_nums;
+
+    /* Return the pxb bus */
+    return bus;
+}
+
 static void create_pcie(VirtMachineState *vms)
 {
     hwaddr base_mmio = vms->memmap[VIRT_PCIE_MMIO].base;
@@ -1580,12 +1647,33 @@ static void create_pcie(VirtMachineState *vms)
     qemu_fdt_setprop_cell(ms->fdt, nodename, "#interrupt-cells", 1);
     create_pcie_irq_map(ms, vms->gic_phandle, irq, nodename);
 
-    if (vms->iommu) {
+    /* Build PCI Expander Bridge + Root Port from the top of PCI_BUS_MAX */
+    if (vms->num_nested_smmus) {
+        /* VIRT_NESTED_SMMU must hold all vSMMUs */
+        g_assert(vms->num_nested_smmus <=
+                 vms->memmap[VIRT_NESTED_SMMU].size / SMMU_IO_LEN);
+
+        vms->nested_smmu_phandle = g_new0(uint32_t, vms->num_nested_smmus);
+
+        for (i = 0; i < vms->num_nested_smmus; i++) {
+            DeviceState *smmu_dev;
+            PCIBus *pxb_bus;
+
+            pxb_bus = create_pcie_expander_bridge(vms, i);
+            g_assert(pxb_bus);
+
+            vms->nested_smmu_phandle[i] = qemu_fdt_alloc_phandle(ms->fdt);
+            smmu_dev = create_nested_smmu(vms, pxb_bus, i);
+            g_assert(smmu_dev);
+
+            qemu_fdt_setprop_cells(ms->fdt, nodename, "iommu-map", 0x0,
+                                   vms->nested_smmu_phandle[i], 0x0, 0x10000);
+        }
+    } else if (vms->iommu) {
         vms->iommu_phandle = qemu_fdt_alloc_phandle(ms->fdt);
 
         switch (vms->iommu) {
         case VIRT_IOMMU_SMMUV3:
-        case VIRT_IOMMU_NESTED_SMMUV3:
             create_smmu(vms, vms->bus);
             qemu_fdt_setprop_cells(ms->fdt, nodename, "iommu-map",
                                    0x0, vms->iommu_phandle, 0x0, 0x10000);
diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
index b35c4f62d7..0a3f1ab8b5 100644
--- a/include/hw/arm/virt.h
+++ b/include/hw/arm/virt.h
@@ -63,6 +63,7 @@ enum {
     VIRT_GIC_ITS,
     VIRT_GIC_REDIST,
     VIRT_SMMU,
+    VIRT_NESTED_SMMU,
     VIRT_UART,
     VIRT_MMIO,
     VIRT_RTC,
@@ -141,6 +142,8 @@ struct VirtMachineClass {
 typedef struct VirtNestedSmmu {
     int index;
     char *smmu_node;
+    PCIBus *pci_bus;
+    int reserved_bus_nums;
     QLIST_ENTRY(VirtNestedSmmu) next;
 } VirtNestedSmmu;
 
@@ -179,6 +182,7 @@ struct VirtMachineState {
     uint32_t gic_phandle;
     uint32_t msi_phandle;
     uint32_t iommu_phandle;
+    uint32_t *nested_smmu_phandle;
     int psci_conduit;
     hwaddr highest_gpa;
     DeviceState *gic;
@@ -229,4 +233,17 @@ static inline bool virt_has_smmuv3(const VirtMachineState *vms)
            vms->iommu == VIRT_IOMMU_NESTED_SMMUV3;
 }
 
+static inline VirtNestedSmmu *
+find_nested_smmu_by_index(VirtMachineState *vms, int index)
+{
+    VirtNestedSmmu *nested_smmu;
+
+    QLIST_FOREACH(nested_smmu, &vms->nested_smmu_list, next) {
+        if (nested_smmu->index == index) {
+            return nested_smmu;
+        }
+    }
+    return NULL;
+}
+
 #endif /* QEMU_ARM_VIRT_H */
-- 
2.43.0
Re: [PATCH RFCv1 05/10] hw/arm/virt: Add VIRT_NESTED_SMMU
Posted by Eric Auger 2 months, 1 week ago
Hi Nicolin,

On 6/26/24 02:28, Nicolin Chen wrote:
> VIRT_SMMU can be used as an emulated SMMU, i.e. iommu=smmuv3. However, the
> MMIO space for VIRT_SMMU isn't large enough to accommodate multiple nested
> SMMUv3 instances. Add a new VIRT_NESTED_SMMU region with its own MMIO space.
>
> Meanwhile, a nested SMMUv3 can only work with a vfio-pci device that is
> physically associated with a host-level SMMUv3 instance, meaning that such a
> passthrough device must be linked to the nested SMMU corresponding to the
> underlying host-level SMMUv3 instance:
>       HW SMMU0 <-----------------> vSMMU0 (nested)
>           ^                             ^
>           |                             |
>           ----> PCI dev0 & PCI dev1 <----
>       HW SMMU1 <-----------------> vSMMU1 (nested)
>           ^                             ^
>           |                             |
>           ----> PCI dev2 & PCI dev3 <----
>
> Note that, on the other hand, an emulated device should not be associated
> with a nested SMMU.
>
> To prepare for that, create a PCIe Expander Bridge per nested SMMU to act
> as a dock for vfio-pci devices. This eases the QEMU code that builds the
> ID mappings in the IORT.
>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> ---
>  hw/arm/virt.c         | 110 +++++++++++++++++++++++++++++++++++++-----
>  include/hw/arm/virt.h |  17 +++++++
>  2 files changed, 116 insertions(+), 11 deletions(-)
>
> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> index ef59c79ca3..a54332fca8 100644
> --- a/hw/arm/virt.c
> +++ b/hw/arm/virt.c
> @@ -55,6 +55,7 @@
>  #include "qemu/bitops.h"
>  #include "qemu/error-report.h"
>  #include "qemu/module.h"
> +#include "hw/pci/pci_bus.h"
>  #include "hw/pci-host/gpex.h"
>  #include "hw/virtio/virtio-pci.h"
>  #include "hw/core/sysbus-fdt.h"
> @@ -83,6 +84,7 @@
>  #include "hw/virtio/virtio-md-pci.h"
>  #include "hw/virtio/virtio-iommu.h"
>  #include "hw/char/pl011.h"
> +#include "qemu/config-file.h"
>  #include "qemu/guest-random.h"
>  
>  static GlobalProperty arm_virt_compat[] = {
> @@ -177,6 +179,7 @@ static const MemMapEntry base_memmap[] = {
>      [VIRT_PVTIME] =             { 0x090a0000, 0x00010000 },
>      [VIRT_SECURE_GPIO] =        { 0x090b0000, 0x00001000 },
>      [VIRT_MMIO] =               { 0x0a000000, 0x00000200 },
> +    [VIRT_NESTED_SMMU] =        { 0x0b000000, 0x01000000 },
>      /* ...repeating for a total of NUM_VIRTIO_TRANSPORTS, each of that size */
>      [VIRT_PLATFORM_BUS] =       { 0x0c000000, 0x02000000 },
>      [VIRT_SECURE_MEM] =         { 0x0e000000, 0x01000000 },
> @@ -221,7 +224,8 @@ static const int a15irqmap[] = {
>      [VIRT_MMIO] = 16, /* ...to 16 + NUM_VIRTIO_TRANSPORTS - 1 */
>      [VIRT_GIC_V2M] = 48, /* ...to 48 + NUM_GICV2M_SPIS - 1 */
>      [VIRT_SMMU] = 74,    /* ...to 74 + NUM_SMMU_IRQS - 1 */
> -    [VIRT_PLATFORM_BUS] = 112, /* ...to 112 + PLATFORM_BUS_NUM_IRQS -1 */
> +    [VIRT_PLATFORM_BUS] = 112, /* ...to 112 + PLATFORM_BUS_NUM_IRQS -1 (179) */
> +    [VIRT_NESTED_SMMU] = 200, /* Keep it at the end of list */
>  };
>  
>  static void create_randomness(MachineState *ms, const char *node)
> @@ -1383,21 +1387,19 @@ static void create_pcie_irq_map(const MachineState *ms,
>                             0x7           /* PCI irq */);
>  }
>  
> -static void create_smmu(const VirtMachineState *vms,
> -                        PCIBus *bus)
> +static DeviceState *_create_smmu(const VirtMachineState *vms, PCIBus *bus,
> +                                 hwaddr base, hwaddr size, int irq,
> +                                 uint32_t smmu_phandle)
>  {
>      char *node;
>      const char compat[] = "arm,smmu-v3";
> -    int irq =  vms->irqmap[VIRT_SMMU];
>      int i;
> -    hwaddr base = vms->memmap[VIRT_SMMU].base;
> -    hwaddr size = vms->memmap[VIRT_SMMU].size;
>      const char irq_names[] = "eventq\0priq\0cmdq-sync\0gerror";
>      DeviceState *dev;
>      MachineState *ms = MACHINE(vms);
>  
> -    if (!virt_has_smmuv3(vms) || !vms->iommu_phandle) {
> -        return;
> +    if (!virt_has_smmuv3(vms) || !smmu_phandle) {
> +        return NULL;
>      }
>  
>      dev = qdev_new(TYPE_ARM_SMMUV3);
> @@ -1436,8 +1438,31 @@ static void create_smmu(const VirtMachineState *vms,
>  
>      qemu_fdt_setprop_cell(ms->fdt, node, "#iommu-cells", 1);
>  
> -    qemu_fdt_setprop_cell(ms->fdt, node, "phandle", vms->iommu_phandle);
> +    qemu_fdt_setprop_cell(ms->fdt, node, "phandle", smmu_phandle);
>      g_free(node);
> +
> +    return dev;
> +}
> +
> +static DeviceState *create_smmu(const VirtMachineState *vms, PCIBus *bus)
> +{
> +    hwaddr base = vms->memmap[VIRT_SMMU].base;
> +    hwaddr size = vms->memmap[VIRT_SMMU].size;
> +    int irq = vms->irqmap[VIRT_SMMU];
> +
> +    return _create_smmu(vms, bus, base, size, irq, vms->iommu_phandle);
> +}
> +
> +static DeviceState *create_nested_smmu(const VirtMachineState *vms, PCIBus *bus,
> +                                       int i)
> +{
> +    hwaddr base = vms->memmap[VIRT_NESTED_SMMU].base + i * SMMU_IO_LEN;
> +    int irq = vms->irqmap[VIRT_SMMU] + i * NUM_SMMU_IRQS;
> +    hwaddr size = SMMU_IO_LEN;
> +    DeviceState *dev;
> +
> +    dev = _create_smmu(vms, bus, base, size, irq, vms->nested_smmu_phandle[i]);
> +    return dev;
>  }
>  
>  static void create_virtio_iommu_dt_bindings(VirtMachineState *vms)
> @@ -1466,6 +1491,48 @@ static void create_virtio_iommu_dt_bindings(VirtMachineState *vms)
>                             bdf + 1, vms->iommu_phandle, bdf + 1, 0xffff - bdf);
>  }
>  
> +/*
> + * FIXME this is used to reserve bus numbers for hotplug devices, yet it
> + * could result in a big waste of PCI bus numbers.
> + */
> +#define MAX_DEV_PER_NESTED_SMMU (4)
> +
> +static PCIBus *create_pcie_expander_bridge(VirtMachineState *vms, uint8_t idx)
> +{
> +    /* Total = PXB + MAX_DEV_PER_NESTED_SMMU x (Rootport + device) */
> +    const uint8_t total_bus_nums = 1 + MAX_DEV_PER_NESTED_SMMU * 2;
> +    uint8_t bus_nr = PCI_BUS_MAX -
> +                     (vms->num_nested_smmus - idx) * total_bus_nums;
> +
> +    VirtNestedSmmu *nested_smmu = find_nested_smmu_by_index(vms, idx);
> +    char *name_pxb = g_strdup_printf("pxb_for_smmu.%d", idx);
> +    PCIBus *bus = vms->bus;
> +    DeviceState *dev;
> +
> +    /* Create an expander bridge */
> +    dev = qdev_new("pxb-pcie");
> +    if (!qdev_set_id(dev, name_pxb, &error_fatal)) {
> +        return NULL;
> +    }
> +
> +    qdev_prop_set_uint8(dev, "bus_nr", bus_nr);
> +    qdev_prop_set_uint16(dev, "numa_node", idx);
> +    qdev_realize_and_unref(dev, BUS(bus), &error_fatal);
> +
> +    /* Get the pxb bus */
> +    QLIST_FOREACH(bus, &bus->child, sibling) {
> +        if (pci_bus_num(bus) == bus_nr) {
> +            break;
> +        }
> +    }
> +    g_assert(bus && pci_bus_num(bus) == bus_nr);
> +    nested_smmu->pci_bus = bus;
> +    nested_smmu->reserved_bus_nums = total_bus_nums;
> +
> +    /* Return the pxb bus */
> +    return bus;
> +}
> +
>  static void create_pcie(VirtMachineState *vms)
>  {
>      hwaddr base_mmio = vms->memmap[VIRT_PCIE_MMIO].base;
> @@ -1580,12 +1647,33 @@ static void create_pcie(VirtMachineState *vms)
>      qemu_fdt_setprop_cell(ms->fdt, nodename, "#interrupt-cells", 1);
>      create_pcie_irq_map(ms, vms->gic_phandle, irq, nodename);
>  
> -    if (vms->iommu) {
> +    /* Build PCI Expander Bridge + Root Port from the top of PCI_BUS_MAX */
> +    if (vms->num_nested_smmus) {
> +        /* VIRT_NESTED_SMMU must hold all vSMMUs */
> +        g_assert(vms->num_nested_smmus <=
> +                 vms->memmap[VIRT_NESTED_SMMU].size / SMMU_IO_LEN);
> +
> +        vms->nested_smmu_phandle = g_new0(uint32_t, vms->num_nested_smmus);
> +
> +        for (i = 0; i < vms->num_nested_smmus; i++) {
> +            DeviceState *smmu_dev;
> +            PCIBus *pxb_bus;
> +
> +            pxb_bus = create_pcie_expander_bridge(vms, i);
> +            g_assert(pxb_bus);
> +
> +            vms->nested_smmu_phandle[i] = qemu_fdt_alloc_phandle(ms->fdt);
> +            smmu_dev = create_nested_smmu(vms, pxb_bus, i);
> +            g_assert(smmu_dev);
> +
> +            qemu_fdt_setprop_cells(ms->fdt, nodename, "iommu-map", 0x0,
> +                                   vms->nested_smmu_phandle[i], 0x0, 0x10000);
I think libvirt is supposed to create the PCIe bus topology instead; it
should not be created by QEMU behind the scenes.
The PCIe elements you create here are not visible to libvirt, and I suspect
they may collide with elements explicitly created by libvirt at a given
PCI BDF.

I think it would make more sense to be able to attach an SMMU instance
to a given PCI root or pxb, either by adding an iommu id to a given
pxb-pcie option:

-device pxb-pcie,bus_nr=100,id=pci.12,numa_node=0,bus=pcie.0,addr=0x3,iommu=<id>

or by adding a list of pxb ids to the iommu option. It is unfortunate that
the iommu option is a machine option. The SMMU being a sysbus device, the
platform bus framework could be considered to dynamically allocate the
instances using the -device option. This has been used along with DT
generation, but with ACPI this would need to be studied. However, at the
time the SMMU was integrated, the machine option was preferred.

Maybe using the first option would allow inferring that, if there are
different iommu ids, several IOMMU instances need to be created.
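
Just to make that inference concrete, here is a sketch with entirely
hypothetical syntax (neither an arm-smmuv3 -device nor an iommu=
property on pxb-pcie exists today):

-device arm-smmuv3,id=smmu0 \
-device arm-smmuv3,id=smmu1 \
-device pxb-pcie,bus_nr=100,id=pci.12,bus=pcie.0,addr=0x3,iommu=smmu0 \
-device pxb-pcie,bus_nr=110,id=pci.13,bus=pcie.0,addr=0x4,iommu=smmu1

i.e. seeing two distinct iommu ids on the pxb-pcie devices would be
enough for the virt machine code to instantiate two vSMMU instances.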

 
Adding Andrea in cc

Thanks

Eric



> +        }
> +    } else if (vms->iommu) {
>          vms->iommu_phandle = qemu_fdt_alloc_phandle(ms->fdt);
>  
>          switch (vms->iommu) {
>          case VIRT_IOMMU_SMMUV3:
> -        case VIRT_IOMMU_NESTED_SMMUV3:
>              create_smmu(vms, vms->bus);
>              qemu_fdt_setprop_cells(ms->fdt, nodename, "iommu-map",
>                                     0x0, vms->iommu_phandle, 0x0, 0x10000);
> diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
> index b35c4f62d7..0a3f1ab8b5 100644
> --- a/include/hw/arm/virt.h
> +++ b/include/hw/arm/virt.h
> @@ -63,6 +63,7 @@ enum {
>      VIRT_GIC_ITS,
>      VIRT_GIC_REDIST,
>      VIRT_SMMU,
> +    VIRT_NESTED_SMMU,
>      VIRT_UART,
>      VIRT_MMIO,
>      VIRT_RTC,
> @@ -141,6 +142,8 @@ struct VirtMachineClass {
>  typedef struct VirtNestedSmmu {
>      int index;
>      char *smmu_node;
> +    PCIBus *pci_bus;
> +    int reserved_bus_nums;
>      QLIST_ENTRY(VirtNestedSmmu) next;
>  } VirtNestedSmmu;
>  
> @@ -179,6 +182,7 @@ struct VirtMachineState {
>      uint32_t gic_phandle;
>      uint32_t msi_phandle;
>      uint32_t iommu_phandle;
> +    uint32_t *nested_smmu_phandle;
>      int psci_conduit;
>      hwaddr highest_gpa;
>      DeviceState *gic;
> @@ -229,4 +233,17 @@ static inline bool virt_has_smmuv3(const VirtMachineState *vms)
>             vms->iommu == VIRT_IOMMU_NESTED_SMMUV3;
>  }
>  
> +static inline VirtNestedSmmu *
> +find_nested_smmu_by_index(VirtMachineState *vms, int index)
> +{
> +    VirtNestedSmmu *nested_smmu;
> +
> +    QLIST_FOREACH(nested_smmu, &vms->nested_smmu_list, next) {
> +        if (nested_smmu->index == index) {
> +            return nested_smmu;
> +        }
> +    }
> +    return NULL;
> +}
> +
>  #endif /* QEMU_ARM_VIRT_H */


Re: [PATCH RFCv1 05/10] hw/arm/virt: Add VIRT_NESTED_SMMU
Posted by Nicolin Chen 2 months, 1 week ago
On Tue, Jul 09, 2024 at 03:26:58PM +0200, Eric Auger wrote:

> > @@ -1580,12 +1647,33 @@ static void create_pcie(VirtMachineState *vms)
> >      qemu_fdt_setprop_cell(ms->fdt, nodename, "#interrupt-cells", 1);
> >      create_pcie_irq_map(ms, vms->gic_phandle, irq, nodename);
> >
> > -    if (vms->iommu) {
> > +    /* Build PCI Expander Bridge + Root Port from the top of PCI_BUS_MAX */
> > +    if (vms->num_nested_smmus) {
> > +        /* VIRT_NESTED_SMMU must hold all vSMMUs */
> > +        g_assert(vms->num_nested_smmus <=
> > +                 vms->memmap[VIRT_NESTED_SMMU].size / SMMU_IO_LEN);
> > +
> > +        vms->nested_smmu_phandle = g_new0(uint32_t, vms->num_nested_smmus);
> > +
> > +        for (i = 0; i < vms->num_nested_smmus; i++) {
> > +            DeviceState *smmu_dev;
> > +            PCIBus *pxb_bus;
> > +
> > +            pxb_bus = create_pcie_expander_bridge(vms, i);
> > +            g_assert(pxb_bus);
> > +
> > +            vms->nested_smmu_phandle[i] = qemu_fdt_alloc_phandle(ms->fdt);
> > +            smmu_dev = create_nested_smmu(vms, pxb_bus, i);
> > +            g_assert(smmu_dev);
> > +
> > +            qemu_fdt_setprop_cells(ms->fdt, nodename, "iommu-map", 0x0,
> > +                                   vms->nested_smmu_phandle[i], 0x0, 0x10000);
> I think libvirt is supposed to create the pcie bus topology instead and
> it shall not be created by qemu behind the scene.
> The pcie elements you create here are not visible to libvirt and I guess
> they may collide with elements explicitly created by libvirt at a given
> pci bdf.

Yea, the BDF conflict is a concern. So I allocated the bus numbers
from the top of the range... one of the reasons for doing this is to
ease things for users, so they don't need to deal with the over-
complicated topology. I will try libvirt and see how it goes.

> I think it would make more sense to be able to attach an smmu instance
> to a given pci root or pxb either by adding an iommu id to a given
> pxb-pcie option
> 
> -device
> pxb-pcie,bus_nr=100,id=pci.12,numa_node=0,bus=pcie.0,addr=0x3,iommu=<id>
> or
> adding a list of pxb ids to the iommu option. It is unfortunate the
> iommu option is a machine option. 

Yes. I had thought about that too, but the virt-machine code
creates all the instances at the moment...

> platform bus framework could be considered to dynamically allocate them
> using the -device option. This has been used along with dt generation
> but with ACPI this would need to be studied. However at the time the
> smmu was integrated the machine option was preferred.
> 
> Maybe using the 1st option would allow to infer that if there are
> different iommu ids this implies that several IOMMU instances need to be
> created.

Yea, I like the idea of creating an iommu instance with a "-device"
string.

One more question. Let's say we have 2 smmus/pxb-buses:
  [ pxb0] <---> vSMMU0/pSMMU0 [ devA, devB, devC ]
  [ pxb1] <---> vSMMU1/pSMMU1 [ devD, devE, devF ]
How would a user know that devA/devB should be attached to pxb0,
instead of ending up with something like devA->pxb0 and devB->pxb1?
Should QEMU just error out until the user associates them correctly?
Or should they rely on libvirt to figure that out, i.e. move the
iommu-node matching from QEMU to libvirt?

Thanks
Nicolin
Re: [PATCH RFCv1 05/10] hw/arm/virt: Add VIRT_NESTED_SMMU
Posted by Andrea Bolognani 2 months, 1 week ago
On Tue, Jul 09, 2024 at 10:59:30AM GMT, Nicolin Chen wrote:
> On Tue, Jul 09, 2024 at 03:26:58PM +0200, Eric Auger wrote:
> > > +    /* Build PCI Expander Bridge + Root Port from the top of PCI_BUS_MAX */
> > > +    if (vms->num_nested_smmus) {
> > > +        /* VIRT_NESTED_SMMU must hold all vSMMUs */
> > > +        g_assert(vms->num_nested_smmus <=
> > > +                 vms->memmap[VIRT_NESTED_SMMU].size / SMMU_IO_LEN);
> > > +
> > > +        vms->nested_smmu_phandle = g_new0(uint32_t, vms->num_nested_smmus);
> > > +
> > > +        for (i = 0; i < vms->num_nested_smmus; i++) {
> > > +            DeviceState *smmu_dev;
> > > +            PCIBus *pxb_bus;
> > > +
> > > +            pxb_bus = create_pcie_expander_bridge(vms, i);
> > > +            g_assert(pxb_bus);
> > > +
> > > +            vms->nested_smmu_phandle[i] = qemu_fdt_alloc_phandle(ms->fdt);
> > > +            smmu_dev = create_nested_smmu(vms, pxb_bus, i);
> > > +            g_assert(smmu_dev);
> > > +
> > > +            qemu_fdt_setprop_cells(ms->fdt, nodename, "iommu-map", 0x0,
> > > +                                   vms->nested_smmu_phandle[i], 0x0, 0x10000);
>
> > I think libvirt is supposed to create the pcie bus topology instead and
> > it shall not be created by qemu behind the scene.
> > The pcie elements you create here are not visible to libvirt and I guess
> > they may collide with elements explicitly created by libvirt at a given
> > pci bdf.
>
> Yea, the bdf conflict is a concern. So I allocated the bdf list
> from the top of the bus number... one of the reasons of doing
> this is to ease users so they don't need to deal with the over-
> complicated topology. I will try libvirt and see how it goes.

libvirt developer here. Eric is absolutely right that creating PCI
controllers implicitly will not fly, as libvirt's PCI address
allocation algorithm would not know about them and would produce
incorrect output as a result.

> > I think it would make more sense to be able to attach an smmu instance
> > to a given pci root or pxb either by adding an iommu id to a given
> > pxb-pcie option
> >
> > -device
> > pxb-pcie,bus_nr=100,id=pci.12,numa_node=0,bus=pcie.0,addr=0x3,iommu=<id>
> > or
> > adding a list of pxb ids to the iommu option. It is unfortunate the
> > iommu option is a machine option.
>
> Yes. I had thought about that too, but the virt-machine code
> creates all the instance at this moment...
>
> > platform bus framework could be considered to dynamically allocate them
> > using the -device option. This has been used along with dt generation
> > but with ACPI this would need to be studied. However at the time the
> > smmu was integrated the machine option was preferred.
> >
> > Maybe using the 1st option would allow to infer that if there are
> > different iommu ids this implies that several IOMMU instances need to be
> > created.
>
> Yea, I like the idea of creating iommu instance with a "-device"
> string.

This sounds like the best option to me as well. Incidentally, it's
also how the intel-iommu device works. From libvirt's test suite:

  -device '{"driver":"intel-iommu","id":"iommu0"}'

Looks like a sensible model to follow. You could then associate each
specific IOMMU instance to a PCI controller using an attribute, as
Eric suggested above.
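
Something along those lines, purely as a sketch (the driver name and the
iommu attribute are placeholders, neither exists today):

  -device '{"driver":"arm-smmuv3-nested","id":"iommu0"}' \
  -device '{"driver":"pxb-pcie","id":"pci.12","bus_nr":100,"bus":"pcie.0","addr":"0x3","iommu":"iommu0"}'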

> One more question. Let's say we have 2 smmus/pxb-buses:
>   [ pxb0] <---> vSMMU0/pSMMU0 [ devA, devB, devC ]
>   [ pxb1] <---> vSMMU1/pSMMU1 [ devD, devE, devF ]
> How would a user know that devA/devB should be attached to pxb0
> without doing like devA->pxb0 and devB->pxb1? Should QEMU just
> error out until the user associate them correctly? Or they may
> rely on libvirt to figure that out, i.e. moving the iommu node
> matching from QEMU to libvirt?

IIUC having devices associated to the "correct" SMMU is only a
requirement for optimal performance, not pure functionality, right?
In other words, getting the association wrong will make things slower
but they will keep working.

Is a 1:1 match between pSMMU and vSMMU mandatory? On a system with 4
SMMUs, could I create a VM with

  vSMMU0 <-> pSMMU0
  vSMMU1 <-> pSMMU1

and another one with

  vSMMU0 <-> pSMMU2
  vSMMU1 <-> pSMMU3

assuming of course that devices are assigned to the correct VM?

How is the association between vSMMU and pSMMU created anyway? Is
that something that the user can control, or is it done automatically
somehow?

-- 
Andrea Bolognani / Red Hat / Virtualization
Re: [PATCH RFCv1 05/10] hw/arm/virt: Add VIRT_NESTED_SMMU
Posted by Jason Gunthorpe 2 months, 1 week ago
On Thu, Jul 11, 2024 at 08:48:10AM -0700, Andrea Bolognani wrote:

> IIUC having devices associated to the "correct" SMMU is only a
> requirement for optimal performance, not pure functionality, right?
> In other words, getting the association wrong will make things slower
> but they will keep working.

As part of declaring the iommu as a device, libvirt should also specify
the iommu capabilities and features.

There are capabilities that require perfect 1:1 pIOMMU to vIOMMU
matching, or it can't work.

For instance if the pIOMMU has a direct path for invalidations then
the pIOMMU, pPCI, vIOMMU, and vPCI all must be perfectly matched
because the vIOMMU will deliver invalidations directly to a single
pIOMMU with no possibility to steer them to the correct place in
hypervisor SW.

> Is a 1:1 match between pSMMU and vSMMU mandatory? On a system with 4
> SMMUs, could I create a VMs with
> 
>   vSMMU0 <-> pSMMU0
>   vSMMU1 <-> pSMMU1
> 
> and another one with
> 
>   vSMMU0 <-> pSMMU2
>   vSMMU1 <-> pSMMU3

This is fine
 
> assuming of course that devices are assigned to the correct VM?

But devices on pSMMU0 cannot be assigned to the second VM in the above
situation.

> How is the association between vSMMU and pSMMU created anyway? Is
> that something that the user can control, or is it done automatically
> somehow?

From the kernel side, a VIOMMU will be created in IOMMUFD, taking as
input a reference to a VFIO device. The VIOMMU will be linked to that
VFIO device's pIOMMU.
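
A rough userspace sketch of that flow, based on the iommufd vIOMMU
allocation uAPI (exact struct layout and flags may differ from what this
series targets):

    #include <sys/ioctl.h>
    #include <linux/iommufd.h>

    /*
     * Allocate a vIOMMU object tied to the pIOMMU behind one VFIO device.
     * dev_id is the iommufd id of a VFIO device already bound to iommufd;
     * hwpt_id is its nesting parent (stage-2) HWPT.
     */
    static int viommu_alloc(int iommufd, __u32 dev_id, __u32 hwpt_id)
    {
        struct iommu_viommu_alloc cmd = {
            .size = sizeof(cmd),
            .type = IOMMU_VIOMMU_TYPE_ARM_SMMUV3,
            .dev_id = dev_id,    /* links the vIOMMU to this device's pIOMMU */
            .hwpt_id = hwpt_id,
        };

        if (ioctl(iommufd, IOMMU_VIOMMU_ALLOC, &cmd)) {
            return -1;
        }
        return cmd.out_viommu_id;  /* used for nested HWPTs and invalidation */
    }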

Also, creating a dummy vIOMMU without any devices, and thus no way to
reach the pIOMMU, is definitely "tricky"; I'm not sure exactly what is
possible there.

Jason