So, the majority of the code is ready as-is. Well, with one
slight change: we differentiate between dimm and nvdimm in places
like device alias generation, command line generation and so
on.
Speaking of the command line, we also need to append 'nvdimm=on'
to the '-machine' argument so that the nvdimm feature is
advertised in the ACPI tables properly.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
---
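For reference, the kind of <memory> device XML this series is aimed at
looks roughly like the following (a minimal sketch only; the element
layout follows the nvdimm design discussed in this series, but the
concrete values are illustrative rather than copied from the test data):

  <memory model='nvdimm'>
    <source>
      <path>/tmp/nvdimm</path>
    </source>
    <target>
      <size unit='KiB'>524288</size>
      <node>0</node>
    </target>
  </memory>
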
src/qemu/qemu_alias.c | 10 ++-
src/qemu/qemu_command.c | 91 +++++++++++++++-------
src/qemu/qemu_command.h | 1 +
src/qemu/qemu_domain.c | 34 +++++---
src/qemu/qemu_hotplug.c | 3 +-
.../qemuxml2argv-memory-hotplug-nvdimm.args | 26 +++++++
tests/qemuxml2argvtest.c | 4 +-
7 files changed, 128 insertions(+), 41 deletions(-)
create mode 100644 tests/qemuxml2argvdata/qemuxml2argv-memory-hotplug-nvdimm.args
diff --git a/src/qemu/qemu_alias.c b/src/qemu/qemu_alias.c
index 4cccf231e..05e1293ef 100644
--- a/src/qemu/qemu_alias.c
+++ b/src/qemu/qemu_alias.c
@@ -352,17 +352,23 @@ qemuAssignDeviceMemoryAlias(virDomainDefPtr def,
size_t i;
int maxidx = 0;
int idx;
+ const char *prefix;
+
+ if (mem->model == VIR_DOMAIN_MEMORY_MODEL_DIMM)
+ prefix = "dimm";
+ else
+ prefix = "nvdimm";
if (oldAlias) {
for (i = 0; i < def->nmems; i++) {
- if ((idx = qemuDomainDeviceAliasIndex(&def->mems[i]->info, "dimm")) >= maxidx)
+ if ((idx = qemuDomainDeviceAliasIndex(&def->mems[i]->info, prefix)) >= maxidx)
maxidx = idx + 1;
}
} else {
maxidx = mem->info.addr.dimm.slot;
}
- if (virAsprintf(&mem->info.alias, "dimm%d", maxidx) < 0)
+ if (virAsprintf(&mem->info.alias, "%s%d", prefix, maxidx) < 0)
return -1;
return 0;
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index 002f60f0f..8f7dd000f 100644
--- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c
@@ -3089,6 +3089,7 @@ qemuBuildControllerDevCommandLine(virCommandPtr cmd,
* @userNodeset: user requested map of host nodes to alloc the memory on, NULL
* for default
* @autoNodeset: fallback nodeset in case of automatic NUMA placement
+ * @memPathReq: request memory-backend-file with specific mem-path
* @force: forcibly use one of the backends
*
* Creates a configuration object that represents memory backend of given guest
@@ -3103,6 +3104,8 @@ qemuBuildControllerDevCommandLine(virCommandPtr cmd,
* Then, if one of the two memory-backend-* should be used, the @qemuCaps is
* consulted to check if qemu does support it.
*
+ * If @memPathReq is non-NULL, memory-backend-file is used with passed path.
+ *
* Returns: 0 on success,
* 1 on success and if there's no need to use memory-backend-*
* -1 on error.
@@ -3118,6 +3121,7 @@ qemuBuildMemoryBackendStr(virJSONValuePtr *backendProps,
unsigned long long pagesize,
virBitmapPtr userNodeset,
virBitmapPtr autoNodeset,
+ const char *memPathReq,
bool force)
{
virDomainHugePagePtr master_hugepage = NULL;
@@ -3126,7 +3130,8 @@ qemuBuildMemoryBackendStr(virJSONValuePtr *backendProps,
const long system_page_size = virGetSystemPageSizeKB();
virDomainMemoryAccess memAccess = VIR_DOMAIN_MEMORY_ACCESS_DEFAULT;
size_t i;
- char *mem_path = NULL;
+ char *memPathActual = NULL;
+ bool prealloc = false;
virBitmapPtr nodemask = NULL;
int ret = -1;
virJSONValuePtr props = NULL;
@@ -3206,27 +3211,36 @@ qemuBuildMemoryBackendStr(virJSONValuePtr *backendProps,
if (!(props = virJSONValueNewObject()))
return -1;
- if (pagesize || def->mem.source == VIR_DOMAIN_MEMORY_SOURCE_FILE) {
+ if (pagesize || memPathReq ||
+ def->mem.source == VIR_DOMAIN_MEMORY_SOURCE_FILE) {
*backendType = "memory-backend-file";
- if (def->mem.source == VIR_DOMAIN_MEMORY_SOURCE_FILE) {
+ if (memPathReq) {
+ if (VIR_STRDUP(memPathActual, memPathReq) < 0)
+ goto cleanup;
+ prealloc = true;
+ } else if (def->mem.source == VIR_DOMAIN_MEMORY_SOURCE_FILE) {
/* we can have both pagesize and mem source, then check mem source first */
+ if (VIR_STRDUP(memPathActual, cfg->memoryBackingDir) < 0)
+ goto cleanup;
force = true;
- if (virJSONValueObjectAdd(props,
- "s:mem-path", cfg->memoryBackingDir,
- NULL) < 0)
- goto cleanup;
} else {
- if (qemuGetDomainHupageMemPath(def, cfg, pagesize, &mem_path) < 0)
- goto cleanup;
-
- if (virJSONValueObjectAdd(props,
- "b:prealloc", true,
- "s:mem-path", mem_path,
- NULL) < 0)
+ if (qemuGetDomainHupageMemPath(def, cfg, pagesize, &memPathActual) < 0)
goto cleanup;
+ prealloc = true;
}
+ if (prealloc &&
+ virJSONValueObjectAdd(props,
+ "b:prealloc", true,
+ NULL) < 0)
+ goto cleanup;
+
+ if (virJSONValueObjectAdd(props,
+ "s:mem-path", memPathActual,
+ NULL) < 0)
+ goto cleanup;
+
switch (memAccess) {
case VIR_DOMAIN_MEMORY_ACCESS_SHARED:
if (virJSONValueObjectAdd(props, "b:share", true, NULL) < 0)
@@ -3281,7 +3295,8 @@ qemuBuildMemoryBackendStr(virJSONValuePtr *backendProps,
}
/* If none of the following is requested... */
- if (!needHugepage && !userNodeset && !memAccess && !nodeSpecified && !force) {
+ if (!needHugepage && !userNodeset &&
+ !memAccess && !nodeSpecified && !force && !memPathReq) {
/* report back that using the new backend is not necessary
* to achieve the desired configuration */
ret = 1;
@@ -3309,8 +3324,7 @@ qemuBuildMemoryBackendStr(virJSONValuePtr *backendProps,
cleanup:
virJSONValueFree(props);
- VIR_FREE(mem_path);
-
+ VIR_FREE(memPathActual);
return ret;
}
@@ -3338,7 +3352,7 @@ qemuBuildMemoryCellBackendStr(virDomainDefPtr def,
if ((rc = qemuBuildMemoryBackendStr(&props, &backendType, cfg, qemuCaps,
def, cell, memsize, 0, NULL,
- auto_nodeset, false)) < 0)
+ auto_nodeset, NULL, false)) < 0)
goto cleanup;
if (!(*backendStr = virQEMUBuildObjectCommandlineFromJSON(backendType,
@@ -3379,7 +3393,8 @@ qemuBuildMemoryDimmBackendStr(virDomainMemoryDefPtr mem,
if (qemuBuildMemoryBackendStr(&props, &backendType, cfg, qemuCaps, def,
mem->targetNode, mem->size, mem->pagesize,
- mem->sourceNodes, auto_nodeset, true) < 0)
+ mem->sourceNodes, auto_nodeset, mem->path,
+ true) < 0)
goto cleanup;
ret = virQEMUBuildObjectCommandlineFromJSON(backendType, alias, props);
@@ -3396,6 +3411,7 @@ char *
qemuBuildMemoryDeviceStr(virDomainMemoryDefPtr mem)
{
virBuffer buf = VIR_BUFFER_INITIALIZER;
+ const char *device;
if (!mem->info.alias) {
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
@@ -3404,8 +3420,15 @@ qemuBuildMemoryDeviceStr(virDomainMemoryDefPtr mem)
}
switch ((virDomainMemoryModel) mem->model) {
+ case VIR_DOMAIN_MEMORY_MODEL_NVDIMM:
case VIR_DOMAIN_MEMORY_MODEL_DIMM:
- virBufferAddLit(&buf, "pc-dimm,");
+
+ if (mem->model == VIR_DOMAIN_MEMORY_MODEL_DIMM)
+ device = "pc-dimm";
+ else
+ device = "nvdimm";
+
+ virBufferAsprintf(&buf, "%s,", device);
if (mem->targetNode >= 0)
virBufferAsprintf(&buf, "node=%d,", mem->targetNode);
@@ -3421,12 +3444,6 @@ qemuBuildMemoryDeviceStr(virDomainMemoryDefPtr mem)
break;
- case VIR_DOMAIN_MEMORY_MODEL_NVDIMM:
- virReportError(VIR_ERR_NO_SUPPORT, "%s",
- _("nvdimm not supported yet"));
- return NULL;
- break;
-
case VIR_DOMAIN_MEMORY_MODEL_NONE:
case VIR_DOMAIN_MEMORY_MODEL_LAST:
break;
@@ -6976,6 +6993,7 @@ qemuBuildMachineCommandLine(virCommandPtr cmd,
{
virBuffer buf = VIR_BUFFER_INITIALIZER;
bool obsoleteAccel = false;
+ size_t i;
int ret = -1;
/* This should *never* be NULL, since we always provide
@@ -7012,6 +7030,15 @@ qemuBuildMachineCommandLine(virCommandPtr cmd,
"with this QEMU binary"));
return -1;
}
+
+ for (i = 0; i < def->nmems; i++) {
+ if (def->mems[i]->model == VIR_DOMAIN_MEMORY_MODEL_NVDIMM) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                               _("nvdimm is not available "
+ "with this QEMU binary"));
+ return -1;
+ }
+ }
} else {
virTristateSwitch vmport = def->features[VIR_DOMAIN_FEATURE_VMPORT];
virTristateSwitch smm = def->features[VIR_DOMAIN_FEATURE_SMM];
@@ -7132,6 +7159,18 @@ qemuBuildMachineCommandLine(virCommandPtr cmd,
}
}
+ for (i = 0; i < def->nmems; i++) {
+ if (def->mems[i]->model == VIR_DOMAIN_MEMORY_MODEL_NVDIMM) {
+ if (!virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE_NVDIMM)) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+ _("nvdimm isn't supported by this QEMU binary"));
+ goto cleanup;
+ }
+ virBufferAddLit(&buf, ",nvdimm=on");
+ break;
+ }
+ }
+
virCommandAddArgBuffer(cmd, &buf);
}
diff --git a/src/qemu/qemu_command.h b/src/qemu/qemu_command.h
index efdad77f3..e23930255 100644
--- a/src/qemu/qemu_command.h
+++ b/src/qemu/qemu_command.h
@@ -134,6 +134,7 @@ int qemuBuildMemoryBackendStr(virJSONValuePtr *backendProps,
unsigned long long pagesize,
virBitmapPtr userNodeset,
virBitmapPtr autoNodeset,
+ const char *memPath,
bool force);
char *qemuBuildMemoryDeviceStr(virDomainMemoryDefPtr mem);
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index 5ec610564..f585e6f25 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -5878,6 +5878,7 @@ qemuDomainDefValidateMemoryHotplugDevice(const virDomainMemoryDef *mem,
{
switch ((virDomainMemoryModel) mem->model) {
case VIR_DOMAIN_MEMORY_MODEL_DIMM:
+ case VIR_DOMAIN_MEMORY_MODEL_NVDIMM:
if (mem->info.type != VIR_DOMAIN_DEVICE_ADDRESS_TYPE_DIMM &&
mem->info.type != VIR_DOMAIN_DEVICE_ADDRESS_TYPE_NONE) {
virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
@@ -5910,11 +5911,6 @@ qemuDomainDefValidateMemoryHotplugDevice(const virDomainMemoryDef *mem,
}
break;
- case VIR_DOMAIN_MEMORY_MODEL_NVDIMM:
- virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
- _("nvdimm hotplug not supported yet"));
- return -1;
-
case VIR_DOMAIN_MEMORY_MODEL_NONE:
case VIR_DOMAIN_MEMORY_MODEL_LAST:
return -1;
@@ -5968,12 +5964,6 @@ qemuDomainDefValidateMemoryHotplug(const virDomainDef *def,
return 0;
}
- if (!virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE_PC_DIMM)) {
- virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
- _("memory hotplug isn't supported by this QEMU binary"));
- return -1;
- }
-
if (!ARCH_IS_PPC64(def->os.arch)) {
/* due to guest support, qemu would silently enable NUMA with one node
* once the memory hotplug backend is enabled. To avoid possible
@@ -5997,6 +5987,28 @@ qemuDomainDefValidateMemoryHotplug(const virDomainDef *def,
for (i = 0; i < def->nmems; i++) {
hotplugMemory += def->mems[i]->size;
+ switch ((virDomainMemoryModel) def->mems[i]->model) {
+ case VIR_DOMAIN_MEMORY_MODEL_DIMM:
+ if (!virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE_PC_DIMM)) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+ _("memory hotplug isn't supported by this QEMU binary"));
+ return -1;
+ }
+ break;
+
+ case VIR_DOMAIN_MEMORY_MODEL_NVDIMM:
+ if (!virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE_NVDIMM)) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+ _("nvdimm isn't supported by this QEMU binary"));
+ return -1;
+ }
+ break;
+
+ case VIR_DOMAIN_MEMORY_MODEL_NONE:
+ case VIR_DOMAIN_MEMORY_MODEL_LAST:
+ break;
+ }
+
/* already existing devices don't need to be checked on hotplug */
if (!mem &&
qemuDomainDefValidateMemoryHotplugDevice(def->mems[i], def) < 0)
diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c
index c7b8074d6..51b87804d 100644
--- a/src/qemu/qemu_hotplug.c
+++ b/src/qemu/qemu_hotplug.c
@@ -2218,7 +2218,8 @@ qemuDomainAttachMemory(virQEMUDriverPtr driver,
if (qemuBuildMemoryBackendStr(&props, &backendType, cfg, priv->qemuCaps,
vm->def, mem->targetNode, mem->size,
- mem->pagesize, mem->sourceNodes, NULL, true) < 0)
+ mem->pagesize, mem->sourceNodes, NULL,
+ mem->path, true) < 0)
goto cleanup;
if (virDomainMemoryInsert(vm->def, mem) < 0) {
diff --git a/tests/qemuxml2argvdata/qemuxml2argv-memory-hotplug-nvdimm.args b/tests/qemuxml2argvdata/qemuxml2argv-memory-hotplug-nvdimm.args
new file mode 100644
index 000000000..907bcbeda
--- /dev/null
+++ b/tests/qemuxml2argvdata/qemuxml2argv-memory-hotplug-nvdimm.args
@@ -0,0 +1,26 @@
+LC_ALL=C \
+PATH=/bin \
+HOME=/home/test \
+USER=test \
+LOGNAME=test \
+QEMU_AUDIO_DRV=none \
+/usr/bin/qemu \
+-name QEMUGuest1 \
+-S \
+-machine pc,accel=tcg,nvdimm=on \
+-m size=1048576k,slots=16,maxmem=1099511627776k \
+-smp 2,sockets=2,cores=1,threads=1 \
+-numa node,nodeid=0,cpus=0-1,mem=1024 \
+-object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm,\
+size=536870912 \
+-device nvdimm,node=0,memdev=memnvdimm0,id=nvdimm0,slot=0 \
+-uuid c7a5fdbd-edaf-9455-926a-d65c16db1809 \
+-nographic \
+-nodefaults \
+-monitor unix:/tmp/lib/domain--1-QEMUGuest1/monitor.sock,server,nowait \
+-no-acpi \
+-boot c \
+-usb \
+-drive file=/dev/HostVG/QEMUGuest1,format=raw,if=none,id=drive-ide0-0-0 \
+-device ide-drive,bus=ide.0,unit=0,drive=drive-ide0-0-0,id=ide0-0-0 \
+-device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x3
diff --git a/tests/qemuxml2argvtest.c b/tests/qemuxml2argvtest.c
index f55b04b05..df0007e57 100644
--- a/tests/qemuxml2argvtest.c
+++ b/tests/qemuxml2argvtest.c
@@ -2343,7 +2343,7 @@ mymain(void)
DO_TEST_FAILURE("memory-align-fail", NONE);
DO_TEST_FAILURE("memory-hotplug-nonuma", QEMU_CAPS_DEVICE_PC_DIMM);
- DO_TEST_FAILURE("memory-hotplug", NONE);
+ DO_TEST("memory-hotplug", NONE);
DO_TEST("memory-hotplug", QEMU_CAPS_DEVICE_PC_DIMM, QEMU_CAPS_NUMA);
DO_TEST("memory-hotplug-dimm", QEMU_CAPS_DEVICE_PC_DIMM, QEMU_CAPS_NUMA,
QEMU_CAPS_OBJECT_MEMORY_RAM, QEMU_CAPS_OBJECT_MEMORY_FILE);
@@ -2351,6 +2351,8 @@ mymain(void)
QEMU_CAPS_OBJECT_MEMORY_RAM, QEMU_CAPS_OBJECT_MEMORY_FILE);
DO_TEST("memory-hotplug-ppc64-nonuma", QEMU_CAPS_KVM, QEMU_CAPS_DEVICE_PC_DIMM, QEMU_CAPS_NUMA,
QEMU_CAPS_OBJECT_MEMORY_RAM, QEMU_CAPS_OBJECT_MEMORY_FILE);
+ DO_TEST("memory-hotplug-nvdimm", QEMU_CAPS_MACHINE_OPT, QEMU_CAPS_DEVICE_NVDIMM,
+ QEMU_CAPS_NUMA, QEMU_CAPS_OBJECT_MEMORY_RAM, QEMU_CAPS_OBJECT_MEMORY_FILE);
DO_TEST("machine-aeskeywrap-on-caps",
QEMU_CAPS_MACHINE_OPT, QEMU_CAPS_AES_KEY_WRAP,
--
2.11.0
--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list
On Thu, Feb 23, 2017 at 10:02:48AM +0100, Michal Privoznik wrote:

> +        if (prealloc &&
> +            virJSONValueObjectAdd(props,
> +                                  "b:prealloc", true,
> +                                  NULL) < 0)
> +            goto cleanup;

As discussed on IRC, using prealloc with QEMU causes it to memset()
the first byte of every page of memory to 0. With NVDIMM this is
obviously corrupting application data stored in the NVDIMM.

Obviously this needs fixing in QEMU, but I would think that libvirt
needs to block use of prealloc==true when running against existing
broken QEMU versions, otherwise users are going to be rather upset
to see their data corrupted on every boot.

Regards,
Daniel
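To make the failure mode concrete, here is a small standalone sketch
(illustrative only - this is neither QEMU's nor libvirt's code, and the
file path is made up) contrasting a preallocation touch that writes a
zero byte into every page with one that writes back the byte it just
read. Only the former destroys the contents of a shared, file-backed
(NVDIMM-style) mapping:

  #include <fcntl.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <sys/mman.h>
  #include <unistd.h>

  /* Fault every page in by storing 0 into its first byte. This is the
   * behaviour described above: fine for anonymous RAM, destructive for
   * a file-backed NVDIMM mapping. */
  static void touch_destructive(char *mem, size_t size, size_t pagesz)
  {
      size_t off;
      for (off = 0; off < size; off += pagesz)
          mem[off] = 0;
  }

  /* Fault every page in by writing back the value that is already
   * there; the data in the backing file is preserved. */
  static void touch_preserving(volatile char *mem, size_t size, size_t pagesz)
  {
      size_t off;
      for (off = 0; off < size; off += pagesz)
          mem[off] = mem[off];
  }

  int main(int argc, char **argv)
  {
      const char *path = argc > 1 ? argv[1] : "/tmp/nvdimm-demo";
      size_t pagesz = (size_t) sysconf(_SC_PAGESIZE);
      size_t size = 4 * pagesz;
      char *mem;
      int fd;

      if ((fd = open(path, O_RDWR | O_CREAT, 0600)) < 0 ||
          ftruncate(fd, size) < 0) {
          perror("open/ftruncate");
          return EXIT_FAILURE;
      }

      mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
      if (mem == MAP_FAILED) {
          perror("mmap");
          return EXIT_FAILURE;
      }

      memset(mem, 'A', size);                 /* pretend this is guest data */

      touch_preserving(mem, size, pagesz);
      printf("after preserving touch:  0x%02x\n", (unsigned char) mem[0]);

      touch_destructive(mem, size, pagesz);
      printf("after destructive touch: 0x%02x\n", (unsigned char) mem[0]);

      munmap(mem, size);
      close(fd);
      return EXIT_SUCCESS;
  }

The first printf reports 0x41 ('A'), the second 0x00 - which is exactly
what a guest would see happen to its persistent data on every boot.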
On 02/23/2017 11:02 AM, Daniel P. Berrange wrote:
> As discussed on IRC, using prealloc with QEMU causes it to memset()
> the first byte of every page of memory to 0. With NVDIMM this is
> obviously corrupting application data stored in the NVDIMM.
>
> Obviously this needs fixing in QEMU, but I would think that libvirt
> needs to block use of prealloc==true when running against existing
> broken QEMU versions, otherwise users are going to be rather upset
> to see their data corrupted on every boot.

They are, but should we work around broken QEMUs? And if so - how do we
detect whether the qemu we are dealing with is broken or already fixed
in that respect?

On the other hand, we can just not set prealloc true for nvdimm. That
will have one downside though - after qemu mmaps() the file, the kernel
is not forced to create a private copy of the pages.

Michal
On Thu, Feb 23, 2017 at 11:16:17AM +0100, Michal Privoznik wrote:
> They are, but should we work around broken QEMUs? And if so - how do we
> detect whether the qemu we are dealing with is broken or already fixed
> in that respect?

Most likely we'll just have to do a version number check I think, since
fixing this isn't going to involve any externally visible change to
QEMU.

> On the other hand, we can just not set prealloc true for nvdimm. That
> will have one downside though - after qemu mmaps() the file, the kernel
> is not forced to create a private copy of the pages.

If the guest has requested prealloc, then I don't think we can ignore
it for NVDIMM, because it's a clear violation of the memory guarantee
we're claiming to provide apps. Thus I think we've no option but to
report an error if we see prealloc + broken QEMU.

Regards,
Daniel
On 02/23/2017 11:48 AM, Daniel P. Berrange wrote:
> Most likely we'll just have to do a version number check I think, since
> fixing this isn't going to involve any externally visible change to
> QEMU.

What about backports then? If some distro decides to backport your fix
posted on the qemu list, this check here would prevent them from running
a fixed qemu. Also, I don't think we want to work around qemu bugs.

> If the guest has requested prealloc, then I don't think we can ignore
> it for NVDIMM, because it's a clear violation of the memory guarantee
> we're claiming to provide apps. Thus I think we've no option but to
> report an error if we see prealloc + broken QEMU.

Do you mean user instead of guest? Because it's the user who requests
prealloc (well, in theory it is the user; libvirt does not let users
request prealloc directly but rather has some built-in rules that
request it instead).

Michal
On Thu, Feb 23, 2017 at 02:07:27PM +0100, Michal Privoznik wrote:
> What about backports then? If some distro decides to backport your fix
> posted on the qemu list, this check here would prevent them from running
> a fixed qemu. Also, I don't think we want to work around qemu bugs.

If the distro backports the QEMU fix, then they would have to adjust
their libvirt to relax the version check too. Obviously we try to
avoid this kind of scenario, but sometimes there's no other option,
and this looks like one of those times.

> Do you mean user instead of guest? Because it's the user who requests
> prealloc (well, in theory it is the user; libvirt does not let users
> request prealloc directly but rather has some built-in rules that
> request it instead).

Sorry yes, I meant user. E.g. if they set

  <memoryBacking>
    <allocation mode="immediate"/>
  </memoryBacking>

I guess this is fairly new syntax so probably few people/apps are using
it. IIRC, if this is not present in the XML, then we just automagically
assume allocation=immediate if using huge pages.

So I guess we could say, if allocation=immediate is *not* in the guest
XML, then NVDIMMs are assumed to have allocation=ondemand by default.
It'd be inconsistent behaviour, but not wrong. That way we'd only need
to raise an error if the user had explicitly set allocation=immediate.

Regards,
Daniel
On 02/23/2017 02:16 PM, Daniel P. Berrange wrote:
> If the distro backports the QEMU fix, then they would have to adjust
> their libvirt to relax the version check too. Obviously we try to
> avoid this kind of scenario, but sometimes there's no other option,
> and this looks like one of those times.

Okay, fair enough.

> So I guess we could say, if allocation=immediate is *not* in the guest
> XML, then NVDIMMs are assumed to have allocation=ondemand by default.
> It'd be inconsistent behaviour, but not wrong. That way we'd only need
> to raise an error if the user had explicitly set allocation=immediate.

Frankly, I'd rather be consistent AND bug-free at the same time. Can't
we make NVDIMM work with just fixed QEMUs? At the code level, the nvdimm
capability would be set if nvdimm is found in the list of devices
provided by qemu AND if the qemu version is at least 2.8 (assuming your
fix makes it into the release).

Michal
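To illustrate the gating Michal is describing, a rough sketch follows
(the type and helper names are made up for illustration and do not match
libvirt's real capability-probing code; the 2.8.0 cut-off is simply the
version mentioned above, not a verified fix point):

  #include <stdbool.h>

  #define DEMO_QEMU_VERSION(major, minor, micro) \
      (((major) * 1000000) + ((minor) * 1000) + (micro))

  typedef struct {
      unsigned int version;      /* major * 1000000 + minor * 1000 + micro */
      bool deviceNvdimmListed;   /* qemu reported an 'nvdimm' device type */
  } demoQemuCaps;

  static bool
  demoQemuCapsSupportsNvdimm(const demoQemuCaps *caps)
  {
      /* The device must exist in this binary ... */
      if (!caps->deviceNvdimmListed)
          return false;

      /* ... and the binary must be new enough that preallocation does
       * not zero the first byte of every page of a file-backed NVDIMM. */
      return caps->version >= DEMO_QEMU_VERSION(2, 8, 0);
  }

With a check along these lines the QEMU_CAPS_DEVICE_NVDIMM flag would
simply never be set for older binaries, so the rest of this patch would
refuse nvdimm devices there without any extra prealloc special-casing.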
On Thu, Feb 23, 2017 at 02:25:13PM +0100, Michal Privoznik wrote:
> Frankly, I'd rather be consistent AND bug-free at the same time. Can't
> we make NVDIMM work with just fixed QEMUs? At the code level, the nvdimm
> capability would be set if nvdimm is found in the list of devices
> provided by qemu AND if the qemu version is at least 2.8 (assuming your
> fix makes it into the release).

Yep, that's a valid approach to take assuming QEMU gets a fix done in a
timely manner, so that people aren't pressuring us to support the buggy
QEMU. Let's aim to do that until someone complains :-)

Regards,
Daniel