TSEG (Top of Memory Segment) is one of many regions that SMM (System Management
Mode) can occupy.  This one, however is special, because a) most of the SMM code
lives in TSEG nowadays and b) QEMU just (well, some time ago) added support for
so called 'extended' TSEG.  The difference to the TSEG implemented in real q35's
MCH (Memory Controller Hub) is that it can have any size from 1 MiB up to 65534
MiB in 1 MiB increments.  But more about that in the QEMU patch.
Signed-off-by: Martin Kletzander <mkletzan@redhat.com>
---
 docs/formatdomain.html.in           | 39 +++++++++++++++++++
 docs/schemas/domaincommon.rng       |  5 +++
 src/conf/domain_conf.c              | 60 ++++++++++++++++++++++++++++-
 src/conf/domain_conf.h              |  1 +
 tests/genericxml2xmlindata/tseg.xml | 23 +++++++++++
 tests/genericxml2xmltest.c          |  2 +
 6 files changed, 129 insertions(+), 1 deletion(-)
 create mode 100644 tests/genericxml2xmlindata/tseg.xml
diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in
index 403b638bd4bd..39ebfe398bd7 100644
--- a/docs/formatdomain.html.in
+++ b/docs/formatdomain.html.in
@@ -1897,6 +1897,9 @@
   <ioapic driver='qemu'/>
   <hpt resizing='required'/>
   <vmcoreinfo state='on'/>
+  <smm state='on'>
+    <tseg unit='MiB'>48</tseg>
+  </smm>
 </features>
 ...</pre>
 
@@ -2056,6 +2059,42 @@
           <code>off</code>, default <code>on</code>) enable or disable
           System Management Mode.
           <span class="since">Since 2.1.0</span>
+
+          Optional sub-element <code>tseg</code> can be used to specify the
+          amount of memory dedicated to SMM TSEG. The size can be specified as a
+          value of that element, optional attribute <code>unit</code> can be
+          used to specify the unit of the aforementioned value (defaults to
+          'MiB').
+
+          This value is configurable due to the fact that the calculation cannot
+          be done right with the guarantee that it will work correctly.  For
+          QEMU TSEG was disabled up to and including <code>pc-q35-2.9</code> (it
+          does not make sense fo any other machine type than q35).
+          From <code>pc-q35-2.10</code> the default value was changed to 16 MiB.
+          That should suffice for up to 272 VCPUs, 5 GiB guest RAM in total, no
+          hotplug memory range, and 32 GiB of 64-bit PCI MMIO aperture.  Or for
+          48 VCPUs, with 1TB of guest RAM, no hotplug DIMM range, and 32GB of
+          64-bit PCI MMIO aperture. The values may also vary based on the loader
+          the VM is using.
+
+          Additional size might be needed for significantly higher VCPU counts
+          or increased address space (that can be memory, maxMemory, 64-bit PCI
+          MMIO aperture size; roughly 8 MiB of TSEG per 1 TiB of address space)
+          which can also be rounded up.
+
+          Due to the nature of this setting being similar to "how much RAM
+          should the guest have" users are advised to either consult the
+          documentation of the guest OS or loader (if there is any), or test
+          this by trial-and-error changing the value until the VM boots
+          successfully.  Yet another guiding value for users might be the fact
+          that 48 MiB should be enough for pretty large guests (240 VCPUs and
+          4TB guest RAM), but it is on purpose not set as default as 48 MiB of
+          unavailable RAM might be too much for small guests (e.g. with 512 MiB
+          of RAM).
+
+          See <a href="#elementsMemoryAllocation">Memory Allocation</a>
+          for more details about the <code>unit</code> attribute.
+          <span class="since">Since 4.5.0</span> (QEMU only)
       </dd>
       <dt><code>ioapic</code></dt>
       <dd>Tune the I/O APIC. Possible values for the
diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng
index 71ac3d079c32..664ec79933f0 100644
--- a/docs/schemas/domaincommon.rng
+++ b/docs/schemas/domaincommon.rng
@@ -4785,6 +4785,11 @@
                   <ref name="virOnOff"/>
                 </attribute>
               </optional>
+              <optional>
+                <element name="tseg">
+                  <ref name="scaledInteger"/>
+                </element>
+              </optional>
             </element>
           </optional>
           <optional>
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
index 3689ac0a82ce..6a4f8243183d 100644
--- a/src/conf/domain_conf.c
+++ b/src/conf/domain_conf.c
@@ -19647,6 +19647,16 @@ virDomainDefParseXML(xmlDocPtr xml,
         VIR_FREE(nodes);
     }
 
+    if (def->features[VIR_DOMAIN_FEATURE_SMM] == VIR_TRISTATE_SWITCH_ON &&
+        virDomainParseScaledValue("string(./features/smm/tseg)",
+                                  "string(./features/smm/tseg/@unit)",
+                                  ctxt,
+                                  &def->tseg_size,
+                                  1024 * 1024, /* Defaults to mebibytes */
+                                  ULLONG_MAX,
+                                  false) < 0)
+        goto error;
+
     if ((n = virXPathNodeSet("./features/capabilities/*", ctxt, &nodes)) < 0)
         goto error;
 
@@ -21741,6 +21751,23 @@ virDomainDefFeaturesCheckABIStability(virDomainDefPtr src,
         }
     }
 
+    /* smm */
+    if (src->features[VIR_DOMAIN_FEATURE_SMM] == VIR_TRISTATE_SWITCH_ON &&
+        src->tseg_size != dst->tseg_size) {
+        const char *unit_src, *unit_dst;
+        unsigned long long short_size_src = virFormatIntPretty(src->tseg_size,
+                                                               &unit_src);
+        unsigned long long short_size_dst = virFormatIntPretty(dst->tseg_size,
+                                                               &unit_dst);
+
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                       _("Size of SMM TSEG size differs: "
+                         "source: '%llu %s', destination: '%llu %s'"),
+                       short_size_src, unit_src,
+                       short_size_dst, unit_dst);
+        return false;
+    }
+
     return true;
 }
 
@@ -27059,7 +27086,6 @@ virDomainDefFormatInternal(virDomainDefPtr def,
             case VIR_DOMAIN_FEATURE_PMU:
             case VIR_DOMAIN_FEATURE_PVSPINLOCK:
             case VIR_DOMAIN_FEATURE_VMPORT:
-            case VIR_DOMAIN_FEATURE_SMM:
                 switch ((virTristateSwitch) def->features[i]) {
                 case VIR_TRISTATE_SWITCH_LAST:
                 case VIR_TRISTATE_SWITCH_ABSENT:
@@ -27076,6 +27102,38 @@ virDomainDefFormatInternal(virDomainDefPtr def,
 
                 break;
 
+            case VIR_DOMAIN_FEATURE_SMM:
+                switch ((virTristateSwitch) def->features[i]) {
+                case VIR_TRISTATE_SWITCH_LAST:
+                case VIR_TRISTATE_SWITCH_ABSENT:
+                    break;
+
+                case VIR_TRISTATE_SWITCH_ON:
+                    virBufferAddLit(buf, "<smm state='on'");
+                    if (!def->tseg_size) {
+                        virBufferAddLit(buf, "/>\n");
+                    } else {
+                        const char *unit;
+                        unsigned long long short_size = virFormatIntPretty(def->tseg_size,
+                                                                           &unit);
+
+                        virBufferAddLit(buf, ">\n");
+                        virBufferAdjustIndent(buf, 2);
+                        virBufferAsprintf(buf, "<tseg unit='%s'>%llu</tseg>\n",
+                                          unit, short_size);
+                        virBufferAdjustIndent(buf, -2);
+                        virBufferAddLit(buf, "</smm>\n");
+                    }
+
+                    break;
+
+                case VIR_TRISTATE_SWITCH_OFF:
+                    virBufferAddLit(buf, "<smm state='off'/>\n");
+                    break;
+                }
+
+                break;
+
             case VIR_DOMAIN_FEATURE_APIC:
                 if (def->features[i] == VIR_TRISTATE_SWITCH_ON) {
                     virBufferAddLit(buf, "<apic");
diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h
index a78fdee40c65..c230dd3d09af 100644
--- a/src/conf/domain_conf.h
+++ b/src/conf/domain_conf.h
@@ -2377,6 +2377,7 @@ struct _virDomainDef {
     virGICVersion gic_version;
     char *hyperv_vendor_id;
     int apic_eoi;
+    unsigned long long tseg_size;
 
     virDomainClockDef clock;
 
diff --git a/tests/genericxml2xmlindata/tseg.xml b/tests/genericxml2xmlindata/tseg.xml
new file mode 100644
index 000000000000..49483f874cd4
--- /dev/null
+++ b/tests/genericxml2xmlindata/tseg.xml
@@ -0,0 +1,23 @@
+<domain type='qemu'>
+  <name>QEMUGuest1</name>
+  <uuid>c7a5fdbd-edaf-9455-926a-d65c16db1809</uuid>
+  <memory unit='KiB'>219100</memory>
+  <currentMemory unit='KiB'>219100</currentMemory>
+  <vcpu placement='static'>1</vcpu>
+  <os>
+    <type arch='x86_64' machine='q35'>hvm</type>
+    <boot dev='hd'/>
+  </os>
+  <features>
+    <smm state='on'>
+      <tseg unit='MiB'>48</tseg>
+    </smm>
+  </features>
+  <clock offset='utc'/>
+  <on_poweroff>destroy</on_poweroff>
+  <on_reboot>restart</on_reboot>
+  <on_crash>destroy</on_crash>
+  <devices>
+    <emulator>/usr/bin/qemu-system-x86_64</emulator>
+  </devices>
+</domain>
diff --git a/tests/genericxml2xmltest.c b/tests/genericxml2xmltest.c
index d8270a6cae82..daad6e0f78d8 100644
--- a/tests/genericxml2xmltest.c
+++ b/tests/genericxml2xmltest.c
@@ -141,6 +141,8 @@ mymain(void)
     DO_TEST_FULL("cachetune-colliding-types", false, true,
                  TEST_COMPARE_DOM_XML2XML_RESULT_FAIL_PARSE);
 
+    DO_TEST("tseg");
+
     virObjectUnref(caps);
     virObjectUnref(xmlopt);
 
-- 
2.17.0
--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list
On 05/21/2018 11:00 AM, Martin Kletzander wrote:
> TSEG (Top of Memory Segment) is one of many regions that SMM (System Management
> Mode) can occupy.  This one, however is special, because a) most of the SMM code
> lives in TSEG nowadays and b) QEMU just (well, some time ago) added support for
> so called 'extended' TSEG.  The difference to the TSEG implemented in real q35's
> MCH (Memory Controller Hub) is that it can have any size from 1 MiB up to 65534
> MiB in 1 MiB increments.  But more about that in the QEMU patch.
                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Which some reader, but not this one, may be eager to find ;-)
Still is there a valid range QEMU would accept? Or do we just let QEMU
fail if the range is too high?
I think QEMU has MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_MAX
> 
> Signed-off-by: Martin Kletzander <mkletzan@redhat.com>
> ---
>  docs/formatdomain.html.in           | 39 +++++++++++++++++++
>  docs/schemas/domaincommon.rng       |  5 +++
>  src/conf/domain_conf.c              | 60 ++++++++++++++++++++++++++++-
>  src/conf/domain_conf.h              |  1 +
>  tests/genericxml2xmlindata/tseg.xml | 23 +++++++++++
>  tests/genericxml2xmltest.c          |  2 +
>  6 files changed, 129 insertions(+), 1 deletion(-)
>  create mode 100644 tests/genericxml2xmlindata/tseg.xml
> 
In the category of I hate it when that happens, git am -3 "merged" in
two chunks incorrectly!  Probably wouldn't have happened if I'd done
this sooner!  The virDomainDefFeaturesCheckABIStability hunk got merged
into virDomainRedirFilterDefCheckABIStability and the tseg grammar got
merged under "vmport" and just before "gic".  As you can imagine the
results weren't pretty ;-).
> diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in
> index 403b638bd4bd..39ebfe398bd7 100644
> --- a/docs/formatdomain.html.in
> +++ b/docs/formatdomain.html.in
> @@ -1897,6 +1897,9 @@
>    <ioapic driver='qemu'/>
>    <hpt resizing='required'/>
>    <vmcoreinfo state='on'/>
> +  <smm state='on'>
> +    <tseg unit='MiB'>48</tseg>
> +  </smm>
>  </features>
>  ...</pre>
>  
> @@ -2056,6 +2059,42 @@
>            <code>off</code>, default <code>on</code>) enable or disable
>            System Management Mode.
>            <span class="since">Since 2.1.0</span>
> +
> +          Optional sub-element <code>tseg</code> can be used to specify the
> +          amount of memory dedicated to SMM TSEG. The size can be specified as a
> +          value of that element, optional attribute <code>unit</code> can be
> +          used to specify the unit of the aforementioned value (defaults to
> +          'MiB').
> +
If this is to be a true paragraph break then these paragraphs needs to
be wrapped in <p> ... </p>; otherwise, this just becomes one long run on
(and quite ugly IMO) paragraph.
> +          This value is configurable due to the fact that the calculation cannot
> +          be done right with the guarantee that it will work correctly.  For
> +          QEMU TSEG was disabled up to and including <code>pc-q35-2.9</code> (it
> +          does not make sense fo any other machine type than q35).
s/fo/for/
This also appears to be a paragraph break...
> +          From <code>pc-q35-2.10</code> the default value was changed to 16 MiB.
s/From/Starting with/ ??? (not required, just a though)
> +          That should suffice for up to 272 VCPUs, 5 GiB guest RAM in total, no
> +          hotplug memory range, and 32 GiB of 64-bit PCI MMIO aperture.  Or for
> +          48 VCPUs, with 1TB of guest RAM, no hotplug DIMM range, and 32GB of
> +          64-bit PCI MMIO aperture. The values may also vary based on the loader
> +          the VM is using.
> +
> +          Additional size might be needed for significantly higher VCPU counts
> +          or increased address space (that can be memory, maxMemory, 64-bit PCI
> +          MMIO aperture size; roughly 8 MiB of TSEG per 1 TiB of address space)
> +          which can also be rounded up.
Uh, oh, hmmm... We seem to have this (perhaps more recent) "thing" about
libvirt having to supply some attribute based on some (mostly difficult
to describe) algorithm that QEMU would use in order to create the
"optimum" size or use for some alternate algorithm. Of course, a few
libvir-list patches like that have been NACK'd because of the feeling
that providing a useful algorithm for a user to "decide upon" a
satisfactory attribute value cannot really be done. Off the top of my
head I can come up with two:
1. Add poll-max-ns property of each iothread:
https://www.redhat.com/archives/libvir-list/2017-February/msg01047.html
2. Add support for qcow2 cache (many times, but most recently):
https://www.redhat.com/archives/libvir-list/2017-September/msg00553.html
> +
> +          Due to the nature of this setting being similar to "how much RAM
> +          should the guest have" users are advised to either consult the
> +          documentation of the guest OS or loader (if there is any), or test
> +          this by trial-and-error changing the value until the VM boots
> +          successfully.  Yet another guiding value for users might be the fact
> +          that 48 MiB should be enough for pretty large guests (240 VCPUs and
> +          4TB guest RAM), but it is on purpose not set as default as 48 MiB of
> +          unavailable RAM might be too much for small guests (e.g. with 512 MiB
> +          of RAM).
and this is the exact reason why patches like this get NACKd - because
trial and error should not be a 'desired' means to calculate.  Even the
bz referenced in patch 5 has an incredible amount of data and
calculations that provide even more insight and details that are lost
when we try to summarize in a libvirt meaningful patch.
What it seems is really needed is an attribute that libvirt provides
that tells QEMU to calculate the optimum size.
> +
> +          See <a href="#elementsMemoryAllocation">Memory Allocation</a>
> +          for more details about the <code>unit</code> attribute.
> +          <span class="since">Since 4.5.0</span> (QEMU only)
haha - see you put 4.5.0 and this is the 4.4.0 release - so it was
ignored until 4.5.0 was "on the clock" ;-)
Ironically this is pointed out as QEMU only; however, genericxml2xmltest
is used/updated.
So, I personally don't mind if this attribute is added; however, I think
we become hypocrites to a certain degree if patches continue to be
blocked/NACKed to other subsystems that have attributes that allow
certain amount of control, but don't come with exact sizing references.
Still if this is pushed, then perhaps those others can use this as the
means to provide a reference to other knobs added.
You can have my :
Reviewed-by: John Ferlan <jferlan@redhat.com>
with a few adjustments above; however, another R-By should be obtained
here as well as perhaps a policy change so that other similar such
series could be merged... I guess I'm curious what "thoughts" others may
have regarding adding this "knob" while not allowing others.
John
>        </dd>
>        <dt><code>ioapic</code></dt>
>        <dd>Tune the I/O APIC. Possible values for the
> diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng
> index 71ac3d079c32..664ec79933f0 100644
> --- a/docs/schemas/domaincommon.rng
> +++ b/docs/schemas/domaincommon.rng
> @@ -4785,6 +4785,11 @@
>                    <ref name="virOnOff"/>
>                  </attribute>
>                </optional>
> +              <optional>
> +                <element name="tseg">
> +                  <ref name="scaledInteger"/>
> +                </element>
> +              </optional>
>              </element>
>            </optional>
>            <optional>
> diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
> index 3689ac0a82ce..6a4f8243183d 100644
> --- a/src/conf/domain_conf.c
> +++ b/src/conf/domain_conf.c
> @@ -19647,6 +19647,16 @@ virDomainDefParseXML(xmlDocPtr xml,
>          VIR_FREE(nodes);
>      }
>  
> +    if (def->features[VIR_DOMAIN_FEATURE_SMM] == VIR_TRISTATE_SWITCH_ON &&
> +        virDomainParseScaledValue("string(./features/smm/tseg)",
> +                                  "string(./features/smm/tseg/@unit)",
> +                                  ctxt,
> +                                  &def->tseg_size,
> +                                  1024 * 1024, /* Defaults to mebibytes */
> +                                  ULLONG_MAX,
> +                                  false) < 0)
> +        goto error;
> +
>      if ((n = virXPathNodeSet("./features/capabilities/*", ctxt, &nodes)) < 0)
>          goto error;
>  
> @@ -21741,6 +21751,23 @@ virDomainDefFeaturesCheckABIStability(virDomainDefPtr src,
>          }
>      }
>  
> +    /* smm */
> +    if (src->features[VIR_DOMAIN_FEATURE_SMM] == VIR_TRISTATE_SWITCH_ON &&
> +        src->tseg_size != dst->tseg_size) {
> +        const char *unit_src, *unit_dst;
> +        unsigned long long short_size_src = virFormatIntPretty(src->tseg_size,
> +                                                               &unit_src);
> +        unsigned long long short_size_dst = virFormatIntPretty(dst->tseg_size,
> +                                                               &unit_dst);
> +
> +        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
> +                       _("Size of SMM TSEG size differs: "
> +                         "source: '%llu %s', destination: '%llu %s'"),
> +                       short_size_src, unit_src,
> +                       short_size_dst, unit_dst);
> +        return false;
> +    }
> +
>      return true;
>  }
>  
> @@ -27059,7 +27086,6 @@ virDomainDefFormatInternal(virDomainDefPtr def,
>              case VIR_DOMAIN_FEATURE_PMU:
>              case VIR_DOMAIN_FEATURE_PVSPINLOCK:
>              case VIR_DOMAIN_FEATURE_VMPORT:
> -            case VIR_DOMAIN_FEATURE_SMM:
>                  switch ((virTristateSwitch) def->features[i]) {
>                  case VIR_TRISTATE_SWITCH_LAST:
>                  case VIR_TRISTATE_SWITCH_ABSENT:
> @@ -27076,6 +27102,38 @@ virDomainDefFormatInternal(virDomainDefPtr def,
>  
>                  break;
>  
> +            case VIR_DOMAIN_FEATURE_SMM:
> +                switch ((virTristateSwitch) def->features[i]) {
> +                case VIR_TRISTATE_SWITCH_LAST:
> +                case VIR_TRISTATE_SWITCH_ABSENT:
> +                    break;
> +
> +                case VIR_TRISTATE_SWITCH_ON:
> +                    virBufferAddLit(buf, "<smm state='on'");
> +                    if (!def->tseg_size) {
> +                        virBufferAddLit(buf, "/>\n");
> +                    } else {
> +                        const char *unit;
> +                        unsigned long long short_size = virFormatIntPretty(def->tseg_size,
> +                                                                           &unit);
> +
> +                        virBufferAddLit(buf, ">\n");
> +                        virBufferAdjustIndent(buf, 2);
> +                        virBufferAsprintf(buf, "<tseg unit='%s'>%llu</tseg>\n",
> +                                          unit, short_size);
> +                        virBufferAdjustIndent(buf, -2);
> +                        virBufferAddLit(buf, "</smm>\n");
> +                    }
> +
> +                    break;
> +
> +                case VIR_TRISTATE_SWITCH_OFF:
> +                    virBufferAddLit(buf, "<smm state='off'/>\n");
> +                    break;
> +                }
> +
> +                break;
> +
>              case VIR_DOMAIN_FEATURE_APIC:
>                  if (def->features[i] == VIR_TRISTATE_SWITCH_ON) {
>                      virBufferAddLit(buf, "<apic");
> diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h
> index a78fdee40c65..c230dd3d09af 100644
> --- a/src/conf/domain_conf.h
> +++ b/src/conf/domain_conf.h
> @@ -2377,6 +2377,7 @@ struct _virDomainDef {
>      virGICVersion gic_version;
>      char *hyperv_vendor_id;
>      int apic_eoi;
> +    unsigned long long tseg_size;
>  
>      virDomainClockDef clock;
>  
> diff --git a/tests/genericxml2xmlindata/tseg.xml b/tests/genericxml2xmlindata/tseg.xml
> new file mode 100644
> index 000000000000..49483f874cd4
> --- /dev/null
> +++ b/tests/genericxml2xmlindata/tseg.xml
> @@ -0,0 +1,23 @@
> +<domain type='qemu'>
> +  <name>QEMUGuest1</name>
> +  <uuid>c7a5fdbd-edaf-9455-926a-d65c16db1809</uuid>
> +  <memory unit='KiB'>219100</memory>
> +  <currentMemory unit='KiB'>219100</currentMemory>
> +  <vcpu placement='static'>1</vcpu>
> +  <os>
> +    <type arch='x86_64' machine='q35'>hvm</type>
> +    <boot dev='hd'/>
> +  </os>
> +  <features>
> +    <smm state='on'>
> +      <tseg unit='MiB'>48</tseg>
> +    </smm>
> +  </features>
> +  <clock offset='utc'/>
> +  <on_poweroff>destroy</on_poweroff>
> +  <on_reboot>restart</on_reboot>
> +  <on_crash>destroy</on_crash>
> +  <devices>
> +    <emulator>/usr/bin/qemu-system-x86_64</emulator>
> +  </devices>
> +</domain>
> diff --git a/tests/genericxml2xmltest.c b/tests/genericxml2xmltest.c
> index d8270a6cae82..daad6e0f78d8 100644
> --- a/tests/genericxml2xmltest.c
> +++ b/tests/genericxml2xmltest.c
> @@ -141,6 +141,8 @@ mymain(void)
>      DO_TEST_FULL("cachetune-colliding-types", false, true,
>                   TEST_COMPARE_DOM_XML2XML_RESULT_FAIL_PARSE);
>  
> +    DO_TEST("tseg");
> +
>      virObjectUnref(caps);
>      virObjectUnref(xmlopt);
>  
> 
--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list
                
            On Wed, May 30, 2018 at 12:00:26PM -0400, John Ferlan wrote: > > >On 05/21/2018 11:00 AM, Martin Kletzander wrote: >> TSEG (Top of Memory Segment) is one of many regions that SMM (System Management >> Mode) can occupy. This one, however is special, because a) most of the SMM code >> lives in TSEG nowadays and b) QEMU just (well, some time ago) added support for >> so called 'extended' TSEG. The difference to the TSEG implemented in real q35's >> MCH (Memory Controller Hub) is that it can have any size from 1 MiB up to 65534 >> MiB in 1 MiB increments. But more about that in the QEMU patch. > > ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ > >Which some reader, but not this one, may be eager to find ;-) > >Still is there a valid range QEMU would accept? Or do we just let QEMU >fail if the range is too high? > >I think QEMU has MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_MAX > Rather than promising some value, I adjusted it so that it is no longer false, no matter what the max is there. >> >> Signed-off-by: Martin Kletzander <mkletzan@redhat.com> >> --- >> docs/formatdomain.html.in | 39 +++++++++++++++++++ >> docs/schemas/domaincommon.rng | 5 +++ >> src/conf/domain_conf.c | 60 ++++++++++++++++++++++++++++- >> src/conf/domain_conf.h | 1 + >> tests/genericxml2xmlindata/tseg.xml | 23 +++++++++++ >> tests/genericxml2xmltest.c | 2 + >> 6 files changed, 129 insertions(+), 1 deletion(-) >> create mode 100644 tests/genericxml2xmlindata/tseg.xml >> > >In the category of I hate it when that happens, git am -3 "merged" in >two chunks incorrectly! Probably wouldn't have happened if I'd done You can enable/disable 3-way merges if you do (not) like them. >this sooner! The virDomainDefFeaturesCheckABIStability hunk got merged >into virDomainRedirFilterDefCheckABIStability and the tseg grammar got >merged under "vmport" and just before "gic". As you can imagine the >results weren't pretty ;-). > Yeah, happened to me as well, I should've resent this, but I forgot about the merge issue and I also wanted to show that this was posted way before the freeze. Anyway, it's pointless to force it now, I'll leave it for later (meaning "after the release"). Anyway, I keep my branches updated (every now and then) on my github repo [1], so if you want to check that, you always can. [1] https://github.com/nertpinx/libvirt >> diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in >> index 403b638bd4bd..39ebfe398bd7 100644 >> --- a/docs/formatdomain.html.in >> +++ b/docs/formatdomain.html.in >> @@ -1897,6 +1897,9 @@ >> <ioapic driver='qemu'/> >> <hpt resizing='required'/> >> <vmcoreinfo state='on'/> >> + <smm state='on'> >> + <tseg unit='MiB'>48</tseg> >> + </smm> >> </features> >> ...</pre> >> >> @@ -2056,6 +2059,42 @@ >> <code>off</code>, default <code>on</code>) enable or disable >> System Management Mode. >> <span class="since">Since 2.1.0</span> >> + >> + Optional sub-element <code>tseg</code> can be used to specify the >> + amount of memory dedicated to SMM TSEG. The size can be specified as a >> + value of that element, optional attribute <code>unit</code> can be >> + used to specify the unit of the aforementioned value (defaults to >> + 'MiB'). >> + > >If this is to be a true paragraph break then these paragraphs needs to >be wrapped in <p> ... </p>; otherwise, this just becomes one long run on >(and quite ugly IMO) paragraph. > >> + This value is configurable due to the fact that the calculation cannot >> + be done right with the guarantee that it will work correctly. For >> + QEMU TSEG was disabled up to and including <code>pc-q35-2.9</code> (it >> + does not make sense fo any other machine type than q35). > >s/fo/for/ > >This also appears to be a paragraph break... > >> + From <code>pc-q35-2.10</code> the default value was changed to 16 MiB. > >s/From/Starting with/ ??? (not required, just a though) > >> + That should suffice for up to 272 VCPUs, 5 GiB guest RAM in total, no >> + hotplug memory range, and 32 GiB of 64-bit PCI MMIO aperture. Or for >> + 48 VCPUs, with 1TB of guest RAM, no hotplug DIMM range, and 32GB of >> + 64-bit PCI MMIO aperture. The values may also vary based on the loader >> + the VM is using. >> + >> + Additional size might be needed for significantly higher VCPU counts >> + or increased address space (that can be memory, maxMemory, 64-bit PCI >> + MMIO aperture size; roughly 8 MiB of TSEG per 1 TiB of address space) >> + which can also be rounded up. > >Uh, oh, hmmm... We seem to have this (perhaps more recent) "thing" about >libvirt having to supply some attribute based on some (mostly difficult >to describe) algorithm that QEMU would use in order to create the >"optimum" size or use for some alternate algorithm. Of course, a few >libvir-list patches like that have been NACK'd because of the feeling >that providing a useful algorithm for a user to "decide upon" a >satisfactory attribute value cannot really be done. Off the top of my >head I can come up with two: > It's kind of a different story. Think of this as a memory size. You cannot determine the "right" amount of memory the VM should have. You can try to boot with X and double it until the OS installation succeeds. And hope you won't need to change it later. >1. Add poll-max-ns property of each iothread: >https://www.redhat.com/archives/libvir-list/2017-February/msg01047.html > This is about tunables. It might change the performance/latency of QEMU slightly, but that's about it. >2. Add support for qcow2 cache (many times, but most recently): >https://www.redhat.com/archives/libvir-list/2017-September/msg00553.html > Similarly here, it allows setting something that can be (at least slightly) abstracted and in the worst case the performance will be slightly hindered. Contrast this with TSEG which, in case it is set incorrectly, will prevent the machine from booting at all. If we go to the extreme, not only can you easily try to find out the right amount to set for a particular machine, but you can even do that programmatically since when OVMF fails due to small extended TSEG size it will reboot very fast. And you can get that in form of events. When I tried it now it even looks like you get rtc-change event when the domain doesn't reboot immediately due to small TSEG size. I will not put it in the docs because I will not guarantee that this is the right way to go, but this is how events look for default TSEG size for a guest that needs a lot more (it has 240 possible vCPUs and 256 TiB of maximum memory, but because it starts with only 1 vCPU and 1GiB memory I can try it out easily on my machine: virsh # event --domain nixos --all --loop --timestamp 2018-05-31 09:42:21.060+0000: event 'lifecycle' for domain nixos: Resumed Unpaused 2018-05-31 09:42:21.066+0000: event 'lifecycle' for domain nixos: Started Booted 2018-05-31 09:42:21.514+0000: event 'reboot' for domain nixos 2018-05-31 09:42:21.964+0000: event 'reboot' for domain nixos 2018-05-31 09:42:22.414+0000: event 'reboot' for domain nixos 2018-05-31 09:42:22.868+0000: event 'reboot' for domain nixos 2018-05-31 09:42:23.325+0000: event 'reboot' for domain nixos 2018-05-31 09:42:23.778+0000: event 'reboot' for domain nixos 2018-05-31 09:42:24.230+0000: event 'reboot' for domain nixos 2018-05-31 09:42:24.681+0000: event 'reboot' for domain nixos ... you get the point. And this is how it looks when I start it with increased size: virsh # event --domain nixos --all --loop --timestamp 2018-05-31 09:43:24.578+0000: event 'lifecycle' for domain nixos: Resumed Unpaused 2018-05-31 09:43:24.584+0000: event 'lifecycle' for domain nixos: Started Booted 2018-05-31 09:43:31.808+0000: event 'rtc-change' for domain nixos: 0 2018-05-31 09:43:32.808+0000: event 'rtc-change' for domain nixos: 0 The reasons for this not being done automatically are (from the top of my head): - The above is just something I figured out myself, but it's not the recommended way written anywhere. Maybe I'm wrong and it doesn't really work, but it still can be done manually. - You cannot change it however you would like automatically, it is part of the guest ABI and we are striving for keeping that stable. - Trying to figure this out by 1 MiB increments might take some time, but increasing it faster might be wasteful. Basically there is no one-size-fits-all value, no easy way to do it automatically (maybe what I tried), but very good explanation how to do that manually and very easy way to do that. Also, from the SW POV, it doesn't even depend on the guest OS, just on the loader/bios so if you have two same domains (like a template in OpenStack for example) you try it out once and then you have the value that just works and will continue working until you change something for the domain. And what it depends on is clearly written in the documentation. >> + >> + Due to the nature of this setting being similar to "how much RAM >> + should the guest have" users are advised to either consult the >> + documentation of the guest OS or loader (if there is any), or test >> + this by trial-and-error changing the value until the VM boots >> + successfully. Yet another guiding value for users might be the fact >> + that 48 MiB should be enough for pretty large guests (240 VCPUs and >> + 4TB guest RAM), but it is on purpose not set as default as 48 MiB of >> + unavailable RAM might be too much for small guests (e.g. with 512 MiB >> + of RAM). > >and this is the exact reason why patches like this get NACKd - because >trial and error should not be a 'desired' means to calculate. It is not. They are rejected because either a) there is no documentation how to properly check if the value is the right value when doing trial-end-error (this is not the case here since you can see if the machine boots or not) or b) the values being set are too specific instead of being abstracted -- setting value in KiB between 0 and the size of a disk instead of "max_performance" or "min_latency" (this is not the case here, the documentation explains what the size is and why it is not about few guessable values). Basically we are NACKing simple pass-through values without understanding them and adding some documentation for them. For example stuff for which we have documentation along the lines of: "Element asdf can be used to set the asdf of the domain." >bz referenced in patch 5 has an incredible amount of data and >calculations that provide even more insight and details that are lost >when we try to summarize in a libvirt meaningful patch. > Let me know what relevant information from the bz you are missing in the documentation and I'll gladly add it. >What it seems is really needed is an attribute that libvirt provides >that tells QEMU to calculate the optimum size. > >> + >> + See <a href="#elementsMemoryAllocation">Memory Allocation</a> >> + for more details about the <code>unit</code> attribute. >> + <span class="since">Since 4.5.0</span> (QEMU only) > >haha - see you put 4.5.0 and this is the 4.4.0 release - so it was >ignored until 4.5.0 was "on the clock" ;-) > >Ironically this is pointed out as QEMU only; however, genericxml2xmltest >is used/updated. > So? >So, I personally don't mind if this attribute is added; however, I think >we become hypocrites to a certain degree if patches continue to be >blocked/NACKed to other subsystems that have attributes that allow >certain amount of control, but don't come with exact sizing references. >Still if this is pushed, then perhaps those others can use this as the >means to provide a reference to other knobs added. > How much more exact would you wanted to be in terms of sizing? If it could be any more exact we wouldn't need the tunable at all. Please don't compare it to other tunables that we didn't want exposed just because it "sounds similar". I hear lot of people just put stuff like this into "unknown knobs" box and they treat it the same. But there are differences and it's not all >You can have my : > >Reviewed-by: John Ferlan <jferlan@redhat.com> > I wasn't ever against the R-b, but I guess I'm missing the point. You disagree with me on the change made here, but then I get a R-b? :) And then I go further down and read that the R-b has actually no value at all and I should wait for another one :D Maybe I'm overthinking this, but it didn't used to happen back when we used ACKs :) >with a few adjustments above; however, another R-By should be obtained >here as well as perhaps a policy change so that other similar such >series could be merged... I guess I'm curious what "thoughts" others may >have regarding adding this "knob" while not allowing others. > >John -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
On 05/31/2018 08:14 AM, Martin Kletzander wrote: > On Wed, May 30, 2018 at 12:00:26PM -0400, John Ferlan wrote: >> >> >> On 05/21/2018 11:00 AM, Martin Kletzander wrote: >>> TSEG (Top of Memory Segment) is one of many regions that SMM (System >>> Management >>> Mode) can occupy. This one, however is special, because a) most of >>> the SMM code >>> lives in TSEG nowadays and b) QEMU just (well, some time ago) added >>> support for >>> so called 'extended' TSEG. The difference to the TSEG implemented in >>> real q35's >>> MCH (Memory Controller Hub) is that it can have any size from 1 MiB >>> up to 65534 >>> MiB in 1 MiB increments. But more about that in the QEMU patch. >> >> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ >> >> Which some reader, but not this one, may be eager to find ;-) >> >> Still is there a valid range QEMU would accept? Or do we just let QEMU >> fail if the range is too high? >> >> I think QEMU has MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_MAX >> > > Rather than promising some value, I adjusted it so that it is no longer > false, > no matter what the max is there. > >>> >>> Signed-off-by: Martin Kletzander <mkletzan@redhat.com> >>> --- >>> docs/formatdomain.html.in | 39 +++++++++++++++++++ >>> docs/schemas/domaincommon.rng | 5 +++ >>> src/conf/domain_conf.c | 60 ++++++++++++++++++++++++++++- >>> src/conf/domain_conf.h | 1 + >>> tests/genericxml2xmlindata/tseg.xml | 23 +++++++++++ >>> tests/genericxml2xmltest.c | 2 + >>> 6 files changed, 129 insertions(+), 1 deletion(-) >>> create mode 100644 tests/genericxml2xmlindata/tseg.xml >>> >> >> In the category of I hate it when that happens, git am -3 "merged" in >> two chunks incorrectly! Probably wouldn't have happened if I'd done > > You can enable/disable 3-way merges if you do (not) like them. > >> this sooner! The virDomainDefFeaturesCheckABIStability hunk got merged >> into virDomainRedirFilterDefCheckABIStability and the tseg grammar got >> merged under "vmport" and just before "gic". As you can imagine the >> results weren't pretty ;-). >> > > Yeah, happened to me as well, I should've resent this, but I forgot > about the > merge issue and I also wanted to show that this was posted way before the > freeze. Anyway, it's pointless to force it now, I'll leave it for later > (meaning "after the release"). > *Lots* of things were posted after this that got reviewed and pushed before this - that just seems to be "the norm" (for whatever reason)... Not receiving timely reviews is one of those areas that really bothers me. I've been making the effort more recently to try to go in order of longest without review - sometimes a ping makes me lose order though. > Anyway, I keep my branches updated (every now and then) on my github > repo [1], > so if you want to check that, you always can. > > [1] https://github.com/nertpinx/libvirt > OK - I've added that as a remote... >>> diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in >>> index 403b638bd4bd..39ebfe398bd7 100644 >>> --- a/docs/formatdomain.html.in >>> +++ b/docs/formatdomain.html.in >>> @@ -1897,6 +1897,9 @@ >>> <ioapic driver='qemu'/> >>> <hpt resizing='required'/> >>> <vmcoreinfo state='on'/> >>> + <smm state='on'> >>> + <tseg unit='MiB'>48</tseg> >>> + </smm> >>> </features> >>> ...</pre> >>> >>> @@ -2056,6 +2059,42 @@ >>> <code>off</code>, default <code>on</code>) enable or disable >>> System Management Mode. >>> <span class="since">Since 2.1.0</span> >>> + >>> + Optional sub-element <code>tseg</code> can be used to >>> specify the >>> + amount of memory dedicated to SMM TSEG. The size can be >>> specified as a >>> + value of that element, optional attribute >>> <code>unit</code> can be >>> + used to specify the unit of the aforementioned value >>> (defaults to >>> + 'MiB'). >>> + >> >> If this is to be a true paragraph break then these paragraphs needs to >> be wrapped in <p> ... </p>; otherwise, this just becomes one long run on >> (and quite ugly IMO) paragraph. >> >>> + This value is configurable due to the fact that the >>> calculation cannot >>> + be done right with the guarantee that it will work >>> correctly. For >>> + QEMU TSEG was disabled up to and including >>> <code>pc-q35-2.9</code> (it >>> + does not make sense fo any other machine type than q35). >> >> s/fo/for/ >> >> This also appears to be a paragraph break... >> >>> + From <code>pc-q35-2.10</code> the default value was >>> changed to 16 MiB. >> >> s/From/Starting with/ ??? (not required, just a though) >> >>> + That should suffice for up to 272 VCPUs, 5 GiB guest RAM >>> in total, no >>> + hotplug memory range, and 32 GiB of 64-bit PCI MMIO >>> aperture. Or for >>> + 48 VCPUs, with 1TB of guest RAM, no hotplug DIMM range, >>> and 32GB of >>> + 64-bit PCI MMIO aperture. The values may also vary based >>> on the loader >>> + the VM is using. >>> + >>> + Additional size might be needed for significantly higher >>> VCPU counts >>> + or increased address space (that can be memory, maxMemory, >>> 64-bit PCI >>> + MMIO aperture size; roughly 8 MiB of TSEG per 1 TiB of >>> address space) >>> + which can also be rounded up. >> >> Uh, oh, hmmm... We seem to have this (perhaps more recent) "thing" about >> libvirt having to supply some attribute based on some (mostly difficult >> to describe) algorithm that QEMU would use in order to create the >> "optimum" size or use for some alternate algorithm. Of course, a few >> libvir-list patches like that have been NACK'd because of the feeling >> that providing a useful algorithm for a user to "decide upon" a >> satisfactory attribute value cannot really be done. Off the top of my >> head I can come up with two: >> > > It's kind of a different story. Think of this as a memory size. You > cannot determine the "right" amount of memory the VM should have. You > can try to boot with X and double it until the OS installation succeeds. > And hope you won't need to change it later. > >> 1. Add poll-max-ns property of each iothread: >> https://www.redhat.com/archives/libvir-list/2017-February/msg01047.html >> > > This is about tunables. It might change the performance/latency of QEMU > slightly, but that's about it. > and there are those that would find it useful to have (bz 1545732)... If you don't have enough memory and your VM is paging like crazy, you just add more memory. Requires a reboot. Likewise, if your VM doesn't boot you add/alter the magic TSEG value using some algorithm as described above. From a 90,000 foot customer view is there a difference? It's just a knob that the hypervisor has to allow something to be accomplished for which libvirt provides the attribute to fine tune. >> 2. Add support for qcow2 cache (many times, but most recently): >> https://www.redhat.com/archives/libvir-list/2017-September/msg00553.html >> > > Similarly here, it allows setting something that can be (at least > slightly) abstracted and in the worst case the performance will be > slightly hindered. This one I understand more why it would be rejected, but still providing the value allows certain things to work a whole lot better. I also know Berto has been "fine tuning" the algorithm in later QEMU releases - so that's like hitting a moving target. > > Contrast this with TSEG which, in case it is set incorrectly, will > prevent the machine from booting at all. If we go to the extreme, not > only can you easily try to find out the right amount to set for a > particular machine, but you can even do that programmatically since when > OVMF fails due to small extended TSEG size it will reboot very fast. > And you can get that in form of events. When I tried it now it even > looks like you get rtc-change event when the domain doesn't reboot > immediately due to small TSEG size. Is there something that is provided that would give the "hint" to adjust the TSEG size? > > I will not put it in the docs because I will not guarantee that this is > the right way to go, but this is how events look for default TSEG size > for a guest that needs a lot more (it has 240 possible vCPUs and 256 TiB > of maximum memory, but because it starts with only 1 vCPU and 1GiB > memory I can try it out easily on my machine: > > virsh # event --domain nixos --all --loop --timestamp > 2018-05-31 09:42:21.060+0000: event 'lifecycle' for domain nixos: > Resumed Unpaused > 2018-05-31 09:42:21.066+0000: event 'lifecycle' for domain nixos: > Started Booted > 2018-05-31 09:42:21.514+0000: event 'reboot' for domain nixos > 2018-05-31 09:42:21.964+0000: event 'reboot' for domain nixos > 2018-05-31 09:42:22.414+0000: event 'reboot' for domain nixos > 2018-05-31 09:42:22.868+0000: event 'reboot' for domain nixos > 2018-05-31 09:42:23.325+0000: event 'reboot' for domain nixos > 2018-05-31 09:42:23.778+0000: event 'reboot' for domain nixos > 2018-05-31 09:42:24.230+0000: event 'reboot' for domain nixos > 2018-05-31 09:42:24.681+0000: event 'reboot' for domain nixos > ... > > you get the point. > > And this is how it looks when I start it with increased size: > > virsh # event --domain nixos --all --loop --timestamp > 2018-05-31 09:43:24.578+0000: event 'lifecycle' for domain nixos: > Resumed Unpaused > 2018-05-31 09:43:24.584+0000: event 'lifecycle' for domain nixos: > Started Booted > 2018-05-31 09:43:31.808+0000: event 'rtc-change' for domain nixos: 0 > 2018-05-31 09:43:32.808+0000: event 'rtc-change' for domain nixos: 0 > > The reasons for this not being done automatically are (from the top of > my head): > > - The above is just something I figured out myself, but it's not the > recommended way written anywhere. Maybe I'm wrong and it doesn't > really work, but it still can be done manually. > > - You cannot change it however you would like automatically, it is part > of the guest ABI and we are striving for keeping that stable. > > - Trying to figure this out by 1 MiB increments might take some time, > but increasing it faster might be wasteful. > > Basically there is no one-size-fits-all value, no easy way to do it > automatically (maybe what I tried), but very good explanation how to do > that manually and very easy way to do that. Also, from the SW POV, it > doesn't even depend on the guest OS, just on the loader/bios so if you > have two same domains (like a template in OpenStack for example) you try > it out once and then you have the value that just works and will > continue working until you change something for the domain. And what > it depends on is clearly written in the documentation. > And there are those that could say if the underlying hypervisor knows that for certain memory sizes and/or vCPU counts that the TSEG will be too small for specific machine types that then the underlying hypervisor should be the one to "choose" a value that's programatically appropriate which to a degree IIUC is the argument being used against allowing a libvirt knob for the poll-max-ns and qcow2 cache sizes. >>> + >>> + Due to the nature of this setting being similar to "how >>> much RAM >>> + should the guest have" users are advised to either consult >>> the >>> + documentation of the guest OS or loader (if there is any), >>> or test >>> + this by trial-and-error changing the value until the VM boots >>> + successfully. Yet another guiding value for users might >>> be the fact >>> + that 48 MiB should be enough for pretty large guests (240 >>> VCPUs and >>> + 4TB guest RAM), but it is on purpose not set as default as >>> 48 MiB of >>> + unavailable RAM might be too much for small guests (e.g. >>> with 512 MiB >>> + of RAM). >> >> and this is the exact reason why patches like this get NACKd - because >> trial and error should not be a 'desired' means to calculate. > > It is not. They are rejected because either a) there is no > documentation how to properly check if the value is the right value when > doing trial-end-error (this is not the case here since you can see if > the machine boots or not) or b) the values being set are too specific > instead of being abstracted -- setting value in KiB between 0 and the > size of a disk instead of "max_performance" or "min_latency" (this is > not the case here, the documentation explains what the size is and why > it is not about few guessable values). > > Basically we are NACKing simple pass-through values without > understanding them and adding some documentation for them. For example > stuff for which we have documentation along the lines of: "Element asdf > can be used to set the asdf of the domain." > I guess it's all a matter of how things are being viewed. I can certainly see how someone without a redhat.com email may view this series as similar to the referenced ones. It's more about mitigating failure possibilities by being able to provide knobs that can help (or get you into trouble). Not everyone has large configurations - so do they really need the value? Not everyone needs every knob, but having them available for when you do need them isn't a bad thing. >> bz referenced in patch 5 has an incredible amount of data and >> calculations that provide even more insight and details that are lost >> when we try to summarize in a libvirt meaningful patch. >> > > Let me know what relevant information from the bz you are missing in the > documentation and I'll gladly add it. > If you think what's provided above is enough, then leave it as is. If there's something in that bz or in the ensuing discussion, then add it. Even though it's difficult when you're in the middle of the code - if you were someone who ran into this situation without knowing anything about it - would what's provided above give enough information to help you decide which value to use? That whole trial and error comment is what leaves me wondering as a reader who knew nothing about this before I started. And now I have more I need a pub to help forget ;-) >> What it seems is really needed is an attribute that libvirt provides >> that tells QEMU to calculate the optimum size. >> >>> + >>> + See <a href="#elementsMemoryAllocation">Memory Allocation</a> >>> + for more details about the <code>unit</code> attribute. >>> + <span class="since">Since 4.5.0</span> (QEMU only) >> >> haha - see you put 4.5.0 and this is the 4.4.0 release - so it was >> ignored until 4.5.0 was "on the clock" ;-) >> >> Ironically this is pointed out as QEMU only; however, genericxml2xmltest >> is used/updated. >> > > So? > Why would someone add this generically if it only works for QEMU? >> So, I personally don't mind if this attribute is added; however, I think >> we become hypocrites to a certain degree if patches continue to be >> blocked/NACKed to other subsystems that have attributes that allow >> certain amount of control, but don't come with exact sizing references. >> Still if this is pushed, then perhaps those others can use this as the >> means to provide a reference to other knobs added. >> > > How much more exact would you wanted to be in terms of sizing? If it > could be any more exact we wouldn't need the tunable at all. Please > don't compare it to other tunables that we didn't want exposed just > because it "sounds similar". I hear lot of people just put stuff like > this into "unknown knobs" box and they treat it the same. But there are > differences and it's not all > >> You can have my : >> >> Reviewed-by: John Ferlan <jferlan@redhat.com> >> > > I wasn't ever against the R-b, but I guess I'm missing the point. You > disagree with me on the change made here, but then I get a R-b? :) And > then I go further down and read that the R-b has actually no value at > all and I should wait for another one :D Maybe I'm overthinking this, > but it didn't used to happen back when we used ACKs :) > >> with a few adjustments above; however, another R-By should be obtained >> here as well as perhaps a policy change so that other similar such >> series could be merged... I guess I'm curious what "thoughts" others may >> have regarding adding this "knob" while not allowing others. >> >> John Did I disagree on the change or was I pointing out that from a certain abstraction level it could be said the code is "similar to" the patches I referenced w/r/t libvirt supplying some value that perhaps the underlying hypervisor could/should have adjusted instead based on it's knowledge of the rather involved algorithms. FWIW: I'm not opposed to the referenced patches, but I'm in the minority on them. AIUI, the R-b is essentially stating that I've looked at the patch and in general agree with it. As part of that - there are a few things that need adjustment and since I felt that it had some similarities to the referenced patches w/r/t adding a knob that some may feel isn't necessary I said let's make sure there's "enough consensus" that this should be added (which it seems there is). IIRC, the following was referenced at some point during the R-b discussion: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/submitting-patches.rst?h=v4.13#n560 John -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
On Thu, May 31, 2018 at 11:19:04AM -0400, John Ferlan wrote:
>
>
>On 05/31/2018 08:14 AM, Martin Kletzander wrote:
>> On Wed, May 30, 2018 at 12:00:26PM -0400, John Ferlan wrote:
>>>
>>>
>>> On 05/21/2018 11:00 AM, Martin Kletzander wrote:
>>>> TSEG (Top of Memory Segment) is one of many regions that SMM (System
>>>> Management
>>>> Mode) can occupy.  This one, however is special, because a) most of
>>>> the SMM code
>>>> lives in TSEG nowadays and b) QEMU just (well, some time ago) added
>>>> support for
>>>> so called 'extended' TSEG.  The difference to the TSEG implemented in
>>>> real q35's
>>>> MCH (Memory Controller Hub) is that it can have any size from 1 MiB
>>>> up to 65534
>>>> MiB in 1 MiB increments.  But more about that in the QEMU patch.
>>>
>>>                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>>>
>>> Which some reader, but not this one, may be eager to find ;-)
>>>
>>> Still is there a valid range QEMU would accept? Or do we just let QEMU
>>> fail if the range is too high?
>>>
>>> I think QEMU has MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_MAX
>>>
>>
>> Rather than promising some value, I adjusted it so that it is no longer
>> false,
>> no matter what the max is there.
>>
>>>>
>>>> Signed-off-by: Martin Kletzander <mkletzan@redhat.com>
>>>> ---
>>>>  docs/formatdomain.html.in           | 39 +++++++++++++++++++
>>>>  docs/schemas/domaincommon.rng       |  5 +++
>>>>  src/conf/domain_conf.c              | 60 ++++++++++++++++++++++++++++-
>>>>  src/conf/domain_conf.h              |  1 +
>>>>  tests/genericxml2xmlindata/tseg.xml | 23 +++++++++++
>>>>  tests/genericxml2xmltest.c          |  2 +
>>>>  6 files changed, 129 insertions(+), 1 deletion(-)
>>>>  create mode 100644 tests/genericxml2xmlindata/tseg.xml
>>>>
>>>
>>> In the category of I hate it when that happens, git am -3 "merged" in
>>> two chunks incorrectly!  Probably wouldn't have happened if I'd done
>>
>> You can enable/disable 3-way merges if you do (not) like them.
>>
>>> this sooner!  The virDomainDefFeaturesCheckABIStability hunk got merged
>>> into virDomainRedirFilterDefCheckABIStability and the tseg grammar got
>>> merged under "vmport" and just before "gic".  As you can imagine the
>>> results weren't pretty ;-).
>>>
>>
>> Yeah, happened to me as well, I should've resent this, but I forgot
>> about the
>> merge issue and I also wanted to show that this was posted way before the
>> freeze.  Anyway, it's pointless to force it now, I'll leave it for later
>> (meaning "after the release").
>>
>
>*Lots* of things were posted after this that got reviewed and pushed
>before this - that just seems to be "the norm" (for whatever reason)...
>Not receiving timely reviews is one of those areas that really bothers
>me. I've been making the effort more recently to try to go in order of
>longest without review - sometimes a ping makes me lose order though.
>
>> Anyway, I keep my branches updated (every now and then) on my github
>> repo [1],
>> so if you want to check that, you always can.
>>
>> [1] https://github.com/nertpinx/libvirt
>>
>
>OK - I've added that as a remote...
>
>>>> diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in
>>>> index 403b638bd4bd..39ebfe398bd7 100644
>>>> --- a/docs/formatdomain.html.in
>>>> +++ b/docs/formatdomain.html.in
>>>> @@ -1897,6 +1897,9 @@
>>>>    <ioapic driver='qemu'/>
>>>>    <hpt resizing='required'/>
>>>>    <vmcoreinfo state='on'/>
>>>> +  <smm state='on'>
>>>> +    <tseg unit='MiB'>48</tseg>
>>>> +  </smm>
>>>>  </features>
>>>>  ...</pre>
>>>>
>>>> @@ -2056,6 +2059,42 @@
>>>>            <code>off</code>, default <code>on</code>) enable or disable
>>>>            System Management Mode.
>>>>            <span class="since">Since 2.1.0</span>
>>>> +
>>>> +          Optional sub-element <code>tseg</code> can be used to
>>>> specify the
>>>> +          amount of memory dedicated to SMM TSEG. The size can be
>>>> specified as a
>>>> +          value of that element, optional attribute
>>>> <code>unit</code> can be
>>>> +          used to specify the unit of the aforementioned value
>>>> (defaults to
>>>> +          'MiB').
>>>> +
>>>
>>> If this is to be a true paragraph break then these paragraphs needs to
>>> be wrapped in <p> ... </p>; otherwise, this just becomes one long run on
>>> (and quite ugly IMO) paragraph.
>>>
>>>> +          This value is configurable due to the fact that the
>>>> calculation cannot
>>>> +          be done right with the guarantee that it will work
>>>> correctly.  For
>>>> +          QEMU TSEG was disabled up to and including
>>>> <code>pc-q35-2.9</code> (it
>>>> +          does not make sense fo any other machine type than q35).
>>>
>>> s/fo/for/
>>>
>>> This also appears to be a paragraph break...
>>>
>>>> +          From <code>pc-q35-2.10</code> the default value was
>>>> changed to 16 MiB.
>>>
>>> s/From/Starting with/ ??? (not required, just a though)
>>>
>>>> +          That should suffice for up to 272 VCPUs, 5 GiB guest RAM
>>>> in total, no
>>>> +          hotplug memory range, and 32 GiB of 64-bit PCI MMIO
>>>> aperture.  Or for
>>>> +          48 VCPUs, with 1TB of guest RAM, no hotplug DIMM range,
>>>> and 32GB of
>>>> +          64-bit PCI MMIO aperture. The values may also vary based
>>>> on the loader
>>>> +          the VM is using.
>>>> +
>>>> +          Additional size might be needed for significantly higher
>>>> VCPU counts
>>>> +          or increased address space (that can be memory, maxMemory,
>>>> 64-bit PCI
>>>> +          MMIO aperture size; roughly 8 MiB of TSEG per 1 TiB of
>>>> address space)
>>>> +          which can also be rounded up.
>>>
>>> Uh, oh, hmmm... We seem to have this (perhaps more recent) "thing" about
>>> libvirt having to supply some attribute based on some (mostly difficult
>>> to describe) algorithm that QEMU would use in order to create the
>>> "optimum" size or use for some alternate algorithm. Of course, a few
>>> libvir-list patches like that have been NACK'd because of the feeling
>>> that providing a useful algorithm for a user to "decide upon" a
>>> satisfactory attribute value cannot really be done. Off the top of my
>>> head I can come up with two:
>>>
>>
>> It's kind of a different story.  Think of this as a memory size.  You
>> cannot determine the "right" amount of memory the VM should have.  You
>> can try to boot with X and double it until the OS installation succeeds.
>> And hope you won't need to change it later.
>>
>
>
>>> 1. Add poll-max-ns property of each iothread:
>>> https://www.redhat.com/archives/libvir-list/2017-February/msg01047.html
>>>
>>
>> This is about tunables.  It might change the performance/latency of QEMU
>> slightly, but that's about it.
>>
>
>and there are those that would find it useful to have (bz 1545732)... If
Much of what I read (underscore emphasis mine) suggests otherwise:
 - "There is a lot of sentiment *against* providing too many low level knobs
    like this _without proper guidance_ on how they should be set."
 - "To address this issue QEMU implements self-tuning algorithm that modifies
    the current polling time to _adapt to different workloads_ and it can also
    fallback to blocking syscalls."
 - "The QEMU commits say the tunables all default to sane parameters so I'm
    inclined to say we ignore them at the libvirt level entirely."
 - "I'm fine if libvirt doesn't add a dedicated API for setting <iothread>
    polling parameters.  It's _unlikely_ that users will need to change the
    setting.  In an emergency (e.g. disabling it due to a performance
    regression) _they can_ use <qemu:arg value='-newarg'/>."
The only points for the polling to be enabled were along the lines of:
It _may_ help in _some_ workloads when you want a bit more throughput
for the price of more CPU cycles.
With vague definitions of how much CPU, throughput and without
description of how to find out if a particular workload fits this.  Even
when all of that is there, then you need yet another explanation on how
to calculate the value to be set.  And then it all goes down back to the
fact that QEMU is already doing some automated balancing for this
(because they can, because this is not part of the guest ABI).  That way
you can never actually say if it will help and how much.
So for this one it is a clear "NO".
>you don't have enough memory and your VM is paging like crazy, you just
>add more memory.  Requires a reboot. Likewise, if your VM doesn't boot
>you add/alter the magic TSEG value using some algorithm as described
>above. From a 90,000 foot customer view is there a difference? It's just
>a knob that the hypervisor has to allow something to be accomplished for
>which libvirt provides the attribute to fine tune.
>
Yeah, you're right.  That's why I think both of them should be exposed.  Some
small differences to other knobs, just for completeness:
 - by the time you realize that the VM doesn't have enough memory, it might be
   too late as reboot isn't that easy of a thing for some production workloads
 - on the other hand, you have a way to see that happening (compare it to the
   polling interval above which you have no idea without proper benchmarks)
One more thing that's common to the memory size (and I hope TSEG in the future)
is that in mgmt apps the TSEG setting already has a place where to live and it
is exactly where the memory size lives currently.  In templates.  You have
"small vm" teplate and "ginormous vm".  For the latter one you can just add a
setting of TSEG _once_ per file.  How's TSEG better and easier than memory?  You
figure it once for the VM settings (and possibly firmware, but that's not going
to change much) and then it doesn't depend on the workflow, not even a little
bit.
Anyway.
What I see as the differences between tunables that make it in and tunables that
don't is that:
 - the former are usually understandable and easy to see what they are in bare
   metal.  Everyone knows what memory is in the hardware, how it looks like, how
   much is "not enough" and how much is "more that needed".  We are used to
   those things back from the hardware times, even to changing them.
 - The latter is usually something we were not able control in HW or didn't even
   know it existed.  For virtual workloads it might be completely different, but
   sometimes people are forgetting that.
>>> 2. Add support for qcow2 cache (many times, but most recently):
>>> https://www.redhat.com/archives/libvir-list/2017-September/msg00553.html
>>>
>>
>> Similarly here, it allows setting something that can be (at least
>> slightly) abstracted and in the worst case the performance will be
>> slightly hindered.
>
>This one I understand more why it would be rejected, but still providing
>the value allows certain things to work a whole lot better. I also know
>Berto has been "fine tuning" the algorithm in later QEMU releases - so
>that's like hitting a moving target.
>
This is very similar, it's just that there is no automatic balancing
done by QEMU.  But it usually is also about how you write the docs.  The
option can make very much sense, but if someone writes "Setting asdf can
allows fine-tuning of the asdf value in the underlying hypervisor", then
no matter how much that value makes sense it is not reflected in the
docs.  That's why I tried to add all the relevant info into the docs so
that it's clear what it is doing, how to set it, to what values and
when.
Apart from the fact that there is a "link" to some file in the QEMU
repository that someone is supposed to read, plus the decision for the
value determination are written there (but not why they are not
automatically calculated, or maybe I missed it), it:
 - is not possible to try using the <qemu:arg value='-newarg'/> approach
 - the docs say:
   <b>In general you should leave this option alone, unless you
      are very certain you know what you are doing.</b>
So in this particular case I wouldn't be totally against having it
there.  If you don't want to use it, then "just don't touch that" is an
approach that shouldn't hurt anyone.
>>
>> Contrast this with TSEG which, in case it is set incorrectly, will
>> prevent the machine from booting at all.  If we go to the extreme, not
>> only can you easily try to find out the right amount to set for a
>> particular machine, but you can even do that programmatically since when
>> OVMF fails due to small extended TSEG size it will reboot very fast.
>> And you can get that in form of events.  When I tried it now it even
>> looks like you get rtc-change event when the domain doesn't reboot
>> immediately due to small TSEG size.
>
>Is there something that is provided that would give the "hint" to adjust
>the TSEG size?
>
I thought there was, but maybe I'll add some more info after the discussion.
>>
>> I will not put it in the docs because I will not guarantee that this is
>> the right way to go, but this is how events look for default TSEG size
>> for a guest that needs a lot more (it has 240 possible vCPUs and 256 TiB
>> of maximum memory, but because it starts with only 1 vCPU and 1GiB
>> memory I can try it out easily on my machine:
>>
>>  virsh # event --domain nixos --all --loop --timestamp
>>  2018-05-31 09:42:21.060+0000: event 'lifecycle' for domain nixos:
>> Resumed Unpaused
>>  2018-05-31 09:42:21.066+0000: event 'lifecycle' for domain nixos:
>> Started Booted
>>  2018-05-31 09:42:21.514+0000: event 'reboot' for domain nixos
>>  2018-05-31 09:42:21.964+0000: event 'reboot' for domain nixos
>>  2018-05-31 09:42:22.414+0000: event 'reboot' for domain nixos
>>  2018-05-31 09:42:22.868+0000: event 'reboot' for domain nixos
>>  2018-05-31 09:42:23.325+0000: event 'reboot' for domain nixos
>>  2018-05-31 09:42:23.778+0000: event 'reboot' for domain nixos
>>  2018-05-31 09:42:24.230+0000: event 'reboot' for domain nixos
>>  2018-05-31 09:42:24.681+0000: event 'reboot' for domain nixos
>>  ...
>>
>> you get the point.
>>
>> And this is how it looks when I start it with increased size:
>>
>>  virsh # event --domain nixos --all --loop --timestamp
>>  2018-05-31 09:43:24.578+0000: event 'lifecycle' for domain nixos:
>> Resumed Unpaused
>>  2018-05-31 09:43:24.584+0000: event 'lifecycle' for domain nixos:
>> Started Booted
>>  2018-05-31 09:43:31.808+0000: event 'rtc-change' for domain nixos: 0
>>  2018-05-31 09:43:32.808+0000: event 'rtc-change' for domain nixos: 0
>>
>> The reasons for this not being done automatically are (from the top of
>> my head):
>>
>> - The above is just something I figured out myself, but it's not the
>>     recommended way written anywhere.  Maybe I'm wrong and it doesn't
>>     really work, but it still can be done manually.
>>
>> - You cannot change it however you would like automatically, it is part
>>  of the guest ABI and we are striving for keeping that stable.
>>
>> - Trying to figure this out by 1 MiB increments might take some time,
>>  but increasing it faster might be wasteful.
>>
>> Basically there is no one-size-fits-all value, no easy way to do it
>> automatically (maybe what I tried), but very good explanation how to do
>> that manually and very easy way to do that.  Also, from the SW POV, it
>> doesn't even depend on the guest OS, just on the loader/bios so if you
>> have two same domains (like a template in OpenStack for example) you try
>> it out once and then you have the value that just works and will
>> continue working until you change something for the domain.  And what
>> it depends on is clearly written in the documentation.
>>
>
>And there are those that could say if the underlying hypervisor knows
>that for certain memory sizes and/or vCPU counts that the TSEG will be
>too small for specific machine types that then the underlying hypervisor
>should be the one to "choose" a value that's programatically appropriate
>which to a degree IIUC is the argument being used against allowing a
>libvirt knob for the poll-max-ns and qcow2 cache sizes.
>
And they would be wrong as for TSEG the hypervisor a) doesn't know that
and b) cannot change that once it was started.
>>>> +
>>>> +          Due to the nature of this setting being similar to "how
>>>> much RAM
>>>> +          should the guest have" users are advised to either consult
>>>> the
>>>> +          documentation of the guest OS or loader (if there is any),
>>>> or test
>>>> +          this by trial-and-error changing the value until the VM boots
>>>> +          successfully.  Yet another guiding value for users might
>>>> be the fact
>>>> +          that 48 MiB should be enough for pretty large guests (240
>>>> VCPUs and
>>>> +          4TB guest RAM), but it is on purpose not set as default as
>>>> 48 MiB of
>>>> +          unavailable RAM might be too much for small guests (e.g.
>>>> with 512 MiB
>>>> +          of RAM).
>>>
>>> and this is the exact reason why patches like this get NACKd - because
>>> trial and error should not be a 'desired' means to calculate.
>>
>> It is not.  They are rejected because either a) there is no
>> documentation how to properly check if the value is the right value when
>> doing trial-end-error (this is not the case here since you can see if
>> the machine boots or not) or b) the values being set are too specific
>> instead of being abstracted -- setting value in KiB between 0 and the
>> size of a disk instead of "max_performance" or "min_latency" (this is
>> not the case here, the documentation explains what the size is and why
>> it is not about few guessable values).
>>
>> Basically we are NACKing simple pass-through values without
>> understanding them and adding some documentation for them.  For example
>> stuff for which we have documentation along the lines of: "Element asdf
>> can be used to set the asdf of the domain."
>>
>
>I guess it's all a matter of how things are being viewed. I can
>certainly see how someone without a redhat.com email may view this
>series as similar to the referenced ones. It's more about mitigating
>failure possibilities by being able to provide knobs that can help (or
>get you into trouble). Not everyone has large configurations - so do
>they really need the value? Not everyone needs every knob, but having
>them available for when you do need them isn't a bad thing.
>
>>> bz referenced in patch 5 has an incredible amount of data and
>>> calculations that provide even more insight and details that are lost
>>> when we try to summarize in a libvirt meaningful patch.
>>>
>>
>> Let me know what relevant information from the bz you are missing in the
>> documentation and I'll gladly add it.
>>
>
>If you think what's provided above is enough, then leave it as is. If
>there's something in that bz or in the ensuing discussion, then add it.
>Even though it's difficult when you're in the middle of the code - if
>you were someone who ran into this situation without knowing anything
>about it - would what's provided above give enough information to help
>you decide which value to use?  That whole trial and error comment is
>what leaves me wondering as a reader who knew nothing about this before
>I started. And now I have more I need a pub to help forget ;-)
>
Here comes the weekend!
>>> What it seems is really needed is an attribute that libvirt provides
>>> that tells QEMU to calculate the optimum size.
>>>
>>>> +
>>>> +          See <a href="#elementsMemoryAllocation">Memory Allocation</a>
>>>> +          for more details about the <code>unit</code> attribute.
>>>> +          <span class="since">Since 4.5.0</span> (QEMU only)
>>>
>>> haha - see you put 4.5.0 and this is the 4.4.0 release - so it was
>>> ignored until 4.5.0 was "on the clock" ;-)
>>>
>>> Ironically this is pointed out as QEMU only; however, genericxml2xmltest
>>> is used/updated.
>>>
>>
>> So?
>>
>
>Why would someone add this generically if it only works for QEMU?
>
The part where the VM is set up based on that is only supported in QEMU.
The part that does formatting/parsing is done in src/conf/domain_conf.c,
which is used by all drivers.
I would add it there because it:
 a) might be later added to other hypervisor drivers
 b) even when someone is compiling with the qemu driver turned off they
    cannot break it (provided they do run tests)
>>> So, I personally don't mind if this attribute is added; however, I think
>>> we become hypocrites to a certain degree if patches continue to be
>>> blocked/NACKed to other subsystems that have attributes that allow
>>> certain amount of control, but don't come with exact sizing references.
>>> Still if this is pushed, then perhaps those others can use this as the
>>> means to provide a reference to other knobs added.
>>>
>>
>> How much more exact would you wanted to be in terms of sizing?  If it
>> could be any more exact we wouldn't need the tunable at all.  Please
>> don't compare it to other tunables that we didn't want exposed just
>> because it "sounds similar".  I hear lot of people just put stuff like
>> this into "unknown knobs" box and they treat it the same.  But there are
>> differences and it's not all
>>
>>> You can have my :
>>>
>>> Reviewed-by: John Ferlan <jferlan@redhat.com>
>>>
>>
>> I wasn't ever against the R-b, but I guess I'm missing the point.  You
>> disagree with me on the change made here, but then I get a R-b? :) And
>> then I go further down and read that the R-b has actually no value at
>> all and I should wait for another one :D Maybe I'm overthinking this,
>> but it didn't used to happen back when we used ACKs :)
>>
>>> with a few adjustments above; however, another R-By should be obtained
>>> here as well as perhaps a policy change so that other similar such
>>> series could be merged... I guess I'm curious what "thoughts" others may
>>> have regarding adding this "knob" while not allowing others.
>>>
>>> John
>
>Did I disagree on the change or was I pointing out that from a certain
>abstraction level it could be said the code is "similar to" the patches
>I referenced w/r/t libvirt supplying some value that perhaps the
>underlying hypervisor could/should have adjusted instead based on it's
>knowledge of the rather involved algorithms. FWIW: I'm not opposed to
>the referenced patches, but I'm in the minority on them.
>
>AIUI, the R-b is essentially stating that I've looked at the patch and
>in general agree with it. As part of that - there are a few things that
>need adjustment and since I felt that it had some similarities to the
>referenced patches w/r/t adding a knob that some may feel isn't
>necessary I said let's make sure there's "enough consensus" that this
>should be added (which it seems there is).
>
>IIRC, the following was referenced at some point during the R-b discussion:
>
>https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/submitting-patches.rst?h=v4.13#n560
>
>John
>
--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list
                
            [...] First thanks for taking the time to elaborate - it is helpful. Much better than just stating no because I don't like it ;-). >> >>>> 1. Add poll-max-ns property of each iothread: >>>> https://www.redhat.com/archives/libvir-list/2017-February/msg01047.html >>>> >>> >>> This is about tunables. It might change the performance/latency of QEMU >>> slightly, but that's about it. >>> >> >> and there are those that would find it useful to have (bz 1545732)... If > > Much of what I read (underscore emphasis mine) suggests otherwise: > > - "There is a lot of sentiment *against* providing too many low level knobs > like this _without proper guidance_ on how they should be set." > > - "To address this issue QEMU implements self-tuning algorithm that > modifies > the current polling time to _adapt to different workloads_ and it can > also > fallback to blocking syscalls." > > - "The QEMU commits say the tunables all default to sane parameters so I'm > inclined to say we ignore them at the libvirt level entirely." > > - "I'm fine if libvirt doesn't add a dedicated API for setting <iothread> > polling parameters. It's _unlikely_ that users will need to change the > setting. In an emergency (e.g. disabling it due to a performance > regression) _they can_ use <qemu:arg value='-newarg'/>." > > The only points for the polling to be enabled were along the lines of: > > It _may_ help in _some_ workloads when you want a bit more throughput > for the price of more CPU cycles. > > With vague definitions of how much CPU, throughput and without > description of how to find out if a particular workload fits this. Even > when all of that is there, then you need yet another explanation on how > to calculate the value to be set. And then it all goes down back to the > fact that QEMU is already doing some automated balancing for this > (because they can, because this is not part of the guest ABI). That way > you can never actually say if it will help and how much. > > So for this one it is a clear "NO". > Another opposing viewpoint is: https://bugzilla.redhat.com/show_bug.cgi?id=1545732#c8 If it were only a "documentation issue" - someone would have figured out much earlier how to get beyond that. FWIW: Without guidance in the Contributor's Guide over what is/isn't acceptable I have a feeling we'll continue to see patches such as this and the one below. Still my point was less the actual feature or details of it, but rather my feeling is there are more examples where exposing low level knobs has been panned in the past. From the lack of details/knowledge I have/had about TSEG during my original review - I saw it as just another low level knob and while I saw value in the knob, I knew there has been a high degree of sentiment in previous patches regarding adding such knobs so I wanted to "be sure" it was desired/necessary adjustment by more than just my opinion (it is a community after all, right)? BTW: I'm not in disagreement that I found poll-max-ns as an odd tunable to add for many of the reasons supplied. Of course I'm the one stuck with the bz /-| and providing the "bad news" or just continually move the bz to a future release ;-) >> you don't have enough memory and your VM is paging like crazy, you just >> add more memory. Requires a reboot. Likewise, if your VM doesn't boot >> you add/alter the magic TSEG value using some algorithm as described >> above. From a 90,000 foot customer view is there a difference? It's just >> a knob that the hypervisor has to allow something to be accomplished for >> which libvirt provides the attribute to fine tune. >> > > Yeah, you're right. That's why I think both of them should be exposed. > Some > small differences to other knobs, just for completeness: > > - by the time you realize that the VM doesn't have enough memory, it > might be > too late as reboot isn't that easy of a thing for some production > workloads > > - on the other hand, you have a way to see that happening (compare it to > the > polling interval above which you have no idea without proper benchmarks) > > One more thing that's common to the memory size (and I hope TSEG in the > future) > is that in mgmt apps the TSEG setting already has a place where to live > and it > is exactly where the memory size lives currently. In templates. You have > "small vm" teplate and "ginormous vm". For the latter one you can just > add a > setting of TSEG _once_ per file. How's TSEG better and easier than > memory? You > figure it once for the VM settings (and possibly firmware, but that's > not going > to change much) and then it doesn't depend on the workflow, not even a > little > bit. > > Anyway. > True TSEG is much more bounded and in your face when it doesn't work. There still is this voice rumbling around in the back of my head that says QEMU should be the owner of deciding upon the algorithm for the value. Unlike a performance knob, it seems there's a solid way to calculate a 'correct value' to make the boot work. The problem is if that automatic calculation ended up being wrong at some point, then there'd be no way to change the value without adding a knob. So, in a way the knob could be the exception rather than the rule. It's a mechanism to make sure the guest can boot given outside interference. > What I see as the differences between tunables that make it in and > tunables that > don't is that: > > - the former are usually understandable and easy to see what they are in > bare > metal. Everyone knows what memory is in the hardware, how it looks > like, how > much is "not enough" and how much is "more that needed". We are used to > those things back from the hardware times, even to changing them. > Still the calculation of a proper TSEG value is based on multiple factors (memory/vcpus). Historically I've found these also need a fudge factor built in - it's the fudge factor that is the sticking point. On real hardware you'd be told - well you don't have enough memory, so buy some more - it's like printing money at that point for the sales guy. You'll be guided to buy a more expensive and larger piece than you may need to "ensure future expand-ability". You may not use the entire thing, but you have it. For software it's a much easier knob. > - The latter is usually something we were not able control in HW or > didn't even > know it existed. For virtual workloads it might be completely > different, but > sometimes people are forgetting that. > Sounds like a job for virtuned (or virtunefixd or virhighavaild). Years ago I worked on a project that would essentially show bottlenecks for the OS and provide the capability to "fix" those via various means (whether it was CPU, memory, or disk overutilization... even deadlocks and cluster quorum hangs). >>>> 2. Add support for qcow2 cache (many times, but most recently): >>>> https://www.redhat.com/archives/libvir-list/2017-September/msg00553.html >>>> >>>> >>> >>> Similarly here, it allows setting something that can be (at least >>> slightly) abstracted and in the worst case the performance will be >>> slightly hindered. >> >> This one I understand more why it would be rejected, but still providing >> the value allows certain things to work a whole lot better. I also know >> Berto has been "fine tuning" the algorithm in later QEMU releases - so >> that's like hitting a moving target. >> > > This is very similar, it's just that there is no automatic balancing > done by QEMU. But it usually is also about how you write the docs. The > option can make very much sense, but if someone writes "Setting asdf can > allows fine-tuning of the asdf value in the underlying hypervisor", then > no matter how much that value makes sense it is not reflected in the > docs. That's why I tried to add all the relevant info into the docs so > that it's clear what it is doing, how to set it, to what values and > when. > > Apart from the fact that there is a "link" to some file in the QEMU > repository that someone is supposed to read, plus the decision for the > value determination are written there (but not why they are not > automatically calculated, or maybe I missed it), it: > > - is not possible to try using the <qemu:arg value='-newarg'/> approach > > - the docs say: > <b>In general you should leave this option alone, unless you > are very certain you know what you are doing.</b> > > So in this particular case I wouldn't be totally against having it > there. If you don't want to use it, then "just don't touch that" is an > approach that shouldn't hurt anyone. > Search the formatdomain page for 'unless' - there are examples where knobs have been added that aren't well described and the consumer better know what they're doing in order to use them. Perhaps another case of alibistic behaviors (a/k/a CYA). [...] >> And there are those that could say if the underlying hypervisor knows >> that for certain memory sizes and/or vCPU counts that the TSEG will be >> too small for specific machine types that then the underlying hypervisor >> should be the one to "choose" a value that's programatically appropriate >> which to a degree IIUC is the argument being used against allowing a >> libvirt knob for the poll-max-ns and qcow2 cache sizes. >> > > And they would be wrong as for TSEG the hypervisor a) doesn't know that > and b) cannot change that once it was started. > I think you lost me here.... From the bz problem statement: "The necessary size is technically predictable (see bug 1468526 comment 8 point (2a) e.g.), but the formula is neither exact nor easy to describe, so as a first step, libvirt should please expose this value in an optional element or attribute." I read that as a proper size could be calculated by the hypervisor, but "just in case" let's make sure we have a fallback option. Perfectly reasonable to me and even more pointed, (so far) only for q35. Of course it's possible I read it wrong. John [...] -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
On Fri, Jun 01, 2018 at 08:21:52AM -0400, John Ferlan wrote: > >[...] > >First thanks for taking the time to elaborate - it is helpful. Much >better than just stating no because I don't like it ;-). > And thanks for appreciating that =) >>> >>>>> 1. Add poll-max-ns property of each iothread: >>>>> https://www.redhat.com/archives/libvir-list/2017-February/msg01047.html >>>>> >>>> >>>> This is about tunables. It might change the performance/latency of QEMU >>>> slightly, but that's about it. >>>> >>> >>> and there are those that would find it useful to have (bz 1545732)... If >> >> Much of what I read (underscore emphasis mine) suggests otherwise: >> >> - "There is a lot of sentiment *against* providing too many low level knobs >> like this _without proper guidance_ on how they should be set." >> >> - "To address this issue QEMU implements self-tuning algorithm that >> modifies >> the current polling time to _adapt to different workloads_ and it can >> also >> fallback to blocking syscalls." >> >> - "The QEMU commits say the tunables all default to sane parameters so I'm >> inclined to say we ignore them at the libvirt level entirely." >> >> - "I'm fine if libvirt doesn't add a dedicated API for setting <iothread> >> polling parameters. It's _unlikely_ that users will need to change the >> setting. In an emergency (e.g. disabling it due to a performance >> regression) _they can_ use <qemu:arg value='-newarg'/>." >> >> The only points for the polling to be enabled were along the lines of: >> >> It _may_ help in _some_ workloads when you want a bit more throughput >> for the price of more CPU cycles. >> >> With vague definitions of how much CPU, throughput and without >> description of how to find out if a particular workload fits this. Even >> when all of that is there, then you need yet another explanation on how >> to calculate the value to be set. And then it all goes down back to the >> fact that QEMU is already doing some automated balancing for this >> (because they can, because this is not part of the guest ABI). That way >> you can never actually say if it will help and how much. >> >> So for this one it is a clear "NO". >> > >Another opposing viewpoint is: > >https://bugzilla.redhat.com/show_bug.cgi?id=1545732#c8 > >If it were only a "documentation issue" - someone would have figured out >much earlier how to get beyond that. > I didn't mean it's "just" that, it's one of the parts. >FWIW: Without guidance in the Contributor's Guide over what is/isn't >acceptable I have a feeling we'll continue to see patches such as this >and the one below. > It's a good point that we could have that written somewhere. It is very difficult to write it so that it is as precise as possible, but also generic enough so that we don't need many exceptions. >Still my point was less the actual feature or details of it, but rather >my feeling is there are more examples where exposing low level knobs has >been panned in the past. From the lack of details/knowledge I have/had >about TSEG during my original review - I saw it as just another low >level knob and while I saw value in the knob, I knew there has been a >high degree of sentiment in previous patches regarding adding such knobs >so I wanted to "be sure" it was desired/necessary adjustment by more >than just my opinion (it is a community after all, right)? > Yes, you're absolutely right. We should do that and make sure we understand the code (part of what the review is for). It just twists my toes when the review refers to someone else in a CYA way. We need to express our own opinions as those are the ones forming the final consensus. Of course common practice, contributor guidelines and all other documentation should be taken into consideration as well. But if we just refer to other people opposing something we might end up having an opinion that's not our own (I mean the community's one). "5 monkeys and a ladder" kind of a situation. Of course we are all constantly adjusting our own opinions along the way, but the more we need to understand *why* we should (or should not) adjust them. >BTW: I'm not in disagreement that I found poll-max-ns as an odd tunable >to add for many of the reasons supplied. Of course I'm the one stuck >with the bz /-| and providing the "bad news" or just continually move >the bz to a future release ;-) > If there is no movement going on, then the BZ needs to be either closed or what can be done needs to be found out. >>> you don't have enough memory and your VM is paging like crazy, you just >>> add more memory. Requires a reboot. Likewise, if your VM doesn't boot >>> you add/alter the magic TSEG value using some algorithm as described >>> above. From a 90,000 foot customer view is there a difference? It's just >>> a knob that the hypervisor has to allow something to be accomplished for >>> which libvirt provides the attribute to fine tune. >>> >> >> Yeah, you're right. That's why I think both of them should be exposed. >> Some >> small differences to other knobs, just for completeness: >> >> - by the time you realize that the VM doesn't have enough memory, it >> might be >> too late as reboot isn't that easy of a thing for some production >> workloads >> >> - on the other hand, you have a way to see that happening (compare it to >> the >> polling interval above which you have no idea without proper benchmarks) >> >> One more thing that's common to the memory size (and I hope TSEG in the >> future) >> is that in mgmt apps the TSEG setting already has a place where to live >> and it >> is exactly where the memory size lives currently. In templates. You have >> "small vm" teplate and "ginormous vm". For the latter one you can just >> add a >> setting of TSEG _once_ per file. How's TSEG better and easier than >> memory? You >> figure it once for the VM settings (and possibly firmware, but that's >> not going >> to change much) and then it doesn't depend on the workflow, not even a >> little >> bit. >> >> Anyway. >> > >True TSEG is much more bounded and in your face when it doesn't work. >There still is this voice rumbling around in the back of my head that >says QEMU should be the owner of deciding upon the algorithm for the >value. Unlike a performance knob, it seems there's a solid way to >calculate a 'correct value' to make the boot work. The problem is if >that automatic calculation ended up being wrong at some point, then >there'd be no way to change the value without adding a knob. So, in a Yes. That's one of the problems. The other one is that this is part of the guest ABI. However this one is not that big of a deal, or at least it looks like it. I'm going to go with not setting the default based on what we know. And maybe later we can set it when the domain is starting by gathering that info from the QEMU process. >way the knob could be the exception rather than the rule. It's a >mechanism to make sure the guest can boot given outside interference. > > > What I see as the differences between tunables that make it in and >> tunables that >> don't is that: >> >> - the former are usually understandable and easy to see what they are in >> bare >> metal. Everyone knows what memory is in the hardware, how it looks >> like, how >> much is "not enough" and how much is "more that needed". We are used to >> those things back from the hardware times, even to changing them. >> > >Still the calculation of a proper TSEG value is based on multiple >factors (memory/vcpus). Historically I've found these also need a fudge >factor built in - it's the fudge factor that is the sticking point. On >real hardware you'd be told - well you don't have enough memory, so buy >some more - it's like printing money at that point for the sales guy. >You'll be guided to buy a more expensive and larger piece than you may >need to "ensure future expand-ability". You may not use the entire >thing, but you have it. For software it's a much easier knob. > >> - The latter is usually something we were not able control in HW or >> didn't even >> know it existed. For virtual workloads it might be completely >> different, but >> sometimes people are forgetting that. >> > >Sounds like a job for virtuned (or virtunefixd or virhighavaild). Years >ago I worked on a project that would essentially show bottlenecks for >the OS and provide the capability to "fix" those via various means >(whether it was CPU, memory, or disk overutilization... even deadlocks >and cluster quorum hangs). > >>>>> 2. Add support for qcow2 cache (many times, but most recently): >>>>> https://www.redhat.com/archives/libvir-list/2017-September/msg00553.html >>>>> >>>>> >>>> >>>> Similarly here, it allows setting something that can be (at least >>>> slightly) abstracted and in the worst case the performance will be >>>> slightly hindered. >>> >>> This one I understand more why it would be rejected, but still providing >>> the value allows certain things to work a whole lot better. I also know >>> Berto has been "fine tuning" the algorithm in later QEMU releases - so >>> that's like hitting a moving target. >>> >> >> This is very similar, it's just that there is no automatic balancing >> done by QEMU. But it usually is also about how you write the docs. The >> option can make very much sense, but if someone writes "Setting asdf can >> allows fine-tuning of the asdf value in the underlying hypervisor", then >> no matter how much that value makes sense it is not reflected in the >> docs. That's why I tried to add all the relevant info into the docs so >> that it's clear what it is doing, how to set it, to what values and >> when. >> >> Apart from the fact that there is a "link" to some file in the QEMU >> repository that someone is supposed to read, plus the decision for the >> value determination are written there (but not why they are not >> automatically calculated, or maybe I missed it), it: >> >> - is not possible to try using the <qemu:arg value='-newarg'/> approach >> >> - the docs say: >> <b>In general you should leave this option alone, unless you >> are very certain you know what you are doing.</b> >> >> So in this particular case I wouldn't be totally against having it >> there. If you don't want to use it, then "just don't touch that" is an >> approach that shouldn't hurt anyone. >> > >Search the formatdomain page for 'unless' - there are examples where >knobs have been added that aren't well described and the consumer better >know what they're doing in order to use them. Perhaps another case of >alibistic behaviors (a/k/a CYA). > Maybe it's because I'm used to all this "QEMU settings talk", but I understand all those that have the "unless you are very certain" disclaimer. Yeah, event_idx might not have been added before, but my guess is that happened before everyone wanted to add every existing option for QEMU and we didn't know what this is going to lead to. Anyway you are right, "the consumer better know what they're doing", that's why the disclaimer is there. And I, personally, would take such patch in. >[...] > >>> And there are those that could say if the underlying hypervisor knows >>> that for certain memory sizes and/or vCPU counts that the TSEG will be >>> too small for specific machine types that then the underlying hypervisor >>> should be the one to "choose" a value that's programatically appropriate >>> which to a degree IIUC is the argument being used against allowing a >>> libvirt knob for the poll-max-ns and qcow2 cache sizes. >>> >> >> And they would be wrong as for TSEG the hypervisor a) doesn't know that >> and b) cannot change that once it was started. >> > >I think you lost me here.... From the bz problem statement: > >"The necessary size is technically predictable (see bug 1468526 comment >8 point (2a) e.g.), but the formula is neither exact nor easy to >describe, so as a first step, libvirt should please expose this value in >an optional element or attribute." > >I read that as a proper size could be calculated by the hypervisor, but >"just in case" let's make sure we have a fallback option. Perfectly >reasonable to me and even more pointed, (so far) only for q35. Of course >it's possible I read it wrong. > No, you read it right. You nailed it above with the fudge factor. Since visible from the guest as a HW-related information it is also part of guest ABI and we strive to keep that stable. But that one I already discussed above. We'll see how v2 goes ;) > >John > >[...] -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
On Mon, May 21, 2018 at 05:00:51PM +0200, Martin Kletzander wrote:
>TSEG (Top of Memory Segment) is one of many regions that SMM (System Management
>Mode) can occupy.  This one, however is special, because a) most of the SMM code
>lives in TSEG nowadays and b) QEMU just (well, some time ago) added support for
>so called 'extended' TSEG.  The difference to the TSEG implemented in real q35's
>MCH (Memory Controller Hub) is that it can have any size from 1 MiB up to 65534
>MiB in 1 MiB increments.  But more about that in the QEMU patch.
>
>Signed-off-by: Martin Kletzander <mkletzan@redhat.com>
>---
> docs/formatdomain.html.in           | 39 +++++++++++++++++++
> docs/schemas/domaincommon.rng       |  5 +++
> src/conf/domain_conf.c              | 60 ++++++++++++++++++++++++++++-
> src/conf/domain_conf.h              |  1 +
> tests/genericxml2xmlindata/tseg.xml | 23 +++++++++++
> tests/genericxml2xmltest.c          |  2 +
> 6 files changed, 129 insertions(+), 1 deletion(-)
> create mode 100644 tests/genericxml2xmlindata/tseg.xml
>
>@@ -27076,6 +27102,38 @@ virDomainDefFormatInternal(virDomainDefPtr def,
>
>                 break;
>
>+            case VIR_DOMAIN_FEATURE_SMM:
>+                switch ((virTristateSwitch) def->features[i]) {
>+                case VIR_TRISTATE_SWITCH_LAST:
>+                case VIR_TRISTATE_SWITCH_ABSENT:
>+                    break;
>+
>+                case VIR_TRISTATE_SWITCH_ON:
>+                    virBufferAddLit(buf, "<smm state='on'");
>+                    if (!def->tseg_size) {
>+                        virBufferAddLit(buf, "/>\n");
>+                    } else {
>+                        const char *unit;
>+                        unsigned long long short_size = virFormatIntPretty(def->tseg_size,
>+                                                                           &unit);
>+
>+                        virBufferAddLit(buf, ">\n");
>+                        virBufferAdjustIndent(buf, 2);
>+                        virBufferAsprintf(buf, "<tseg unit='%s'>%llu</tseg>\n",
>+                                          unit, short_size);
>+                        virBufferAdjustIndent(buf, -2);
>+                        virBufferAddLit(buf, "</smm>\n");
>+                    }
>+
>+                    break;
>+
>+                case VIR_TRISTATE_SWITCH_OFF:
>+                    virBufferAddLit(buf, "<smm state='off'/>\n");
>+                    break;
Consider using virXMLFormatElement.
Jano
--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list
                
            © 2016 - 2025 Red Hat, Inc.