Before we exec() qemu we have to spawn pr-helper processes for
all managed reservations (well, technically there can only be one).
The only caveat there is that we should place the process into
the same namespace and cgroup as qemu (so that it shares the same
view of the system). But we can do that only after we've forked.
That means calling the setup function between fork() and exec().
Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
---
src/qemu/qemu_process.c | 169 ++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 169 insertions(+)
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 57c06c7c1..b876d293a 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -2551,6 +2551,169 @@ qemuProcessResctrlCreate(virQEMUDriverPtr driver,
ret = 0;
cleanup:
virObjectUnref(caps);
+
+ return ret;
+}
+
+
+static void
+qemuProcessKillPRDaemon(virDomainObjPtr vm)
+{
+ qemuDomainObjPrivatePtr priv = vm->privateData;
+
+ if (priv->prPid == (pid_t) -1)
+ return;
+
+ virProcessKillPainfully(priv->prPid, true);
+ priv->prPid = (pid_t) -1;
+}
+
+
+static int
+qemuProcessSetupOnePRDaemonHook(void *opaque)
+{
+ virDomainObjPtr vm = opaque;
+ size_t i, nfds = 0;
+ int *fds = NULL;
+ int ret = -1;
+
+ if (virProcessGetNamespaces(vm->pid, &nfds, &fds) < 0)
+ return ret;
+
+ if (nfds > 0 &&
+ virProcessSetNamespaces(nfds, fds) < 0)
+ goto cleanup;
+
+ ret = 0;
+ cleanup:
+ for (i = 0; i < nfds; i++)
+ VIR_FORCE_CLOSE(fds[i]);
+ VIR_FREE(fds);
+ return ret;
+}
+
+
+static int
+qemuProcessSetupOnePRDaemon(virDomainObjPtr vm,
+ virDomainDiskDefPtr disk)
+{
+ qemuDomainObjPrivatePtr priv = vm->privateData;
+ virQEMUDriverPtr driver = priv->driver;
+ virQEMUDriverConfigPtr cfg;
+ qemuDomainStorageSourcePrivatePtr srcPriv;
+ qemuDomainDiskPRDPtr prd;
+ char *pidfile = NULL;
+ pid_t cpid = -1;
+ virCommandPtr cmd = NULL;
+ virTimeBackOffVar timebackoff;
+ const unsigned long long timeout = 500000; /* ms */
+ int ret = -1;
+
+ if (priv->prPid != (pid_t) -1 ||
+ !virStoragePRDefIsManaged(disk->src->pr))
+ return 0;
+
+ cfg = virQEMUDriverGetConfig(driver);
+ srcPriv = QEMU_DOMAIN_STORAGE_SOURCE_PRIVATE(disk->src);
+ prd = srcPriv->prd;
+
+ if (!virFileIsExecutable(cfg->prHelperName)) {
+ virReportSystemError(errno, _("'%s' is not a suitable pr helper"),
+ cfg->prHelperName);
+ goto cleanup;
+ }
+
+ if (!(pidfile = virPidFileBuildPath(cfg->stateDir, prd->alias)))
+ goto cleanup;
+
+ if (unlink(pidfile) < 0 &&
+ errno != ENOENT) {
+ virReportSystemError(errno,
+ _("Cannot remove stale PID file %s"),
+ pidfile);
+ goto cleanup;
+ }
+
+ if (!(cmd = virCommandNewArgList(cfg->prHelperName,
+ "-k", prd->path,
+ NULL)))
+ goto cleanup;
+
+ virCommandDaemonize(cmd);
+ virCommandSetPidFile(cmd, pidfile);
+ /* The only caveat there is that we should place the process
+ * into the same namespace and cgroup as qemu (so that it
+ * shares the same view of the system). */
+ virCommandSetPreExecHook(cmd, qemuProcessSetupOnePRDaemonHook, vm);
+
+ if (virCommandRun(cmd, NULL) < 0)
+ goto cleanup;
+
+ if (virPidFileReadPath(pidfile, &cpid) < 0) {
+ virReportError(VIR_ERR_INTERNAL_ERROR,
+ _("pr helper %s didn't show up"), prd->alias);
+ goto cleanup;
+ }
+
+ if (virTimeBackOffStart(&timebackoff, 1, timeout) < 0)
+ goto cleanup;
+ while (virTimeBackOffWait(&timebackoff)) {
+ if (virFileExists(prd->path))
+ break;
+
+ if (virProcessKill(cpid, 0) == 0)
+ continue;
+
+ virReportSystemError(errno,
+ _("pr helper %s died unexpectedly"), prd->alias);
+ goto cleanup;
+ }
+
+ if (priv->cgroup &&
+ virCgroupAddMachineTask(priv->cgroup, cpid) < 0)
+ goto cleanup;
+
+ if (qemuSecurityDomainSetPathLabel(driver->securityManager,
+ vm->def, prd->path, true) < 0)
+ goto cleanup;
+
+ priv->prPid = cpid;
+ ret = 0;
+ cleanup:
+ if (ret < 0) {
+ virCommandAbort(cmd);
+ virProcessKillPainfully(cpid, true);
+ }
+ virCommandFree(cmd);
+ if (pidfile) {
+ if (unlink(pidfile) < 0 &&
+ errno != ENOENT &&
+ !virGetLastError())
+ virReportSystemError(errno,
+ _("Cannot remove stale PID file %s"),
+ pidfile);
+ VIR_FREE(pidfile);
+ }
+ virObjectUnref(cfg);
+ return ret;
+}
+
+
+static int
+qemuProcessSetupPRDaemon(virDomainObjPtr vm)
+{
+ size_t i;
+ int ret = -1;
+
+ for (i = 0; i < vm->def->ndisks; i++) {
+ if (qemuProcessSetupOnePRDaemon(vm, vm->def->disks[i]) < 0)
+ goto cleanup;
+ }
+
+ ret = 0;
+ cleanup:
+ if (ret < 0)
+ qemuProcessKillPRDaemon(vm);
return ret;
}
@@ -6065,6 +6228,10 @@ qemuProcessLaunch(virConnectPtr conn,
if (qemuProcessResctrlCreate(driver, vm) < 0)
goto cleanup;
+ VIR_DEBUG("Setting up PR daemon");
+ if (qemuProcessSetupPRDaemon(vm) < 0)
+ goto cleanup;
+
VIR_DEBUG("Setting domain security labels");
if (qemuSecuritySetAllLabel(driver,
vm,
@@ -6645,6 +6812,8 @@ void qemuProcessStop(virQEMUDriverPtr driver,
VIR_FREE(vm->def->seclabels[i]->imagelabel);
}
+ qemuProcessKillPRDaemon(vm);
+
qemuHostdevReAttachDomainDevices(driver, vm->def);
def = vm->def;
--
2.16.1
--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list
On Wed, Mar 14, 2018 at 17:05:38 +0100, Michal Privoznik wrote: > Before we exec() qemu we have to spawn pr-helper processes for > all managed reservations (well, technically there can only one). > The only caveat there is that we should place the process into > the same namespace and cgroup as qemu (so that it shares the same > view of the system). But we can do that only after we've forked. > That means calling the setup function between fork() and exec(). > > Signed-off-by: Michal Privoznik <mprivozn@redhat.com> > --- > src/qemu/qemu_process.c | 169 ++++++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 169 insertions(+) > > diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c > index 57c06c7c1..b876d293a 100644 > --- a/src/qemu/qemu_process.c > +++ b/src/qemu/qemu_process.c > @@ -2551,6 +2551,169 @@ qemuProcessResctrlCreate(virQEMUDriverPtr driver, > +static int > +qemuProcessSetupOnePRDaemonHook(void *opaque) The naming does not make much sense here. There's only one managed daemon. 
> +{ > + virDomainObjPtr vm = opaque; > + size_t i, nfds = 0; > + int *fds = NULL; > + int ret = -1; > + > + if (virProcessGetNamespaces(vm->pid, &nfds, &fds) < 0) > + return ret; > + > + if (nfds > 0 && > + virProcessSetNamespaces(nfds, fds) < 0) > + goto cleanup; > + > + ret = 0; > + cleanup: > + for (i = 0; i < nfds; i++) > + VIR_FORCE_CLOSE(fds[i]); > + VIR_FREE(fds); > + return ret; > +} > + > + > +static int > +qemuProcessSetupOnePRDaemon(virDomainObjPtr vm, > + virDomainDiskDefPtr disk) > +{ > + qemuDomainObjPrivatePtr priv = vm->privateData; > + virQEMUDriverPtr driver = priv->driver; > + virQEMUDriverConfigPtr cfg; > + qemuDomainStorageSourcePrivatePtr srcPriv; > + qemuDomainDiskPRDPtr prd; > + char *pidfile = NULL; > + pid_t cpid = -1; > + virCommandPtr cmd = NULL; > + virTimeBackOffVar timebackoff; > + const unsigned long long timeout = 500000; /* ms */ > + int ret = -1; > + > + if (priv->prPid != (pid_t) -1 || > + !virStoragePRDefIsManaged(disk->src->pr)) > + return 0; This function should ever be called only for the managed PR daemon and only once per domain. This looks wrong and makes the caller look that multiple daemons are setup. Please rename it and move this logic to the caller, which determines whether the managed daemon is necessary and calls this exactly once if it is. 
> + > + cfg = virQEMUDriverGetConfig(driver); > + srcPriv = QEMU_DOMAIN_STORAGE_SOURCE_PRIVATE(disk->src); > + prd = srcPriv->prd; > + > + if (!virFileIsExecutable(cfg->prHelperName)) { > + virReportSystemError(errno, _("'%s' is not a suitable pr helper"), > + cfg->prHelperName); > + goto cleanup; > + } > + > + if (!(pidfile = virPidFileBuildPath(cfg->stateDir, prd->alias))) > + goto cleanup; > + > + if (unlink(pidfile) < 0 && > + errno != ENOENT) { > + virReportSystemError(errno, > + _("Cannot remove stale PID file %s"), > + pidfile); > + goto cleanup; > + } > + > + if (!(cmd = virCommandNewArgList(cfg->prHelperName, > + "-k", prd->path, > + NULL))) > + goto cleanup; > + > + virCommandDaemonize(cmd); > + virCommandSetPidFile(cmd, pidfile); > + /* The only caveat there is that we should place the process > + * into the same namespace and cgroup as qemu (so that it > + * shares the same view of the system). */ > + virCommandSetPreExecHook(cmd, qemuProcessSetupOnePRDaemonHook, vm); > + > + if (virCommandRun(cmd, NULL) < 0) > + goto cleanup; > + > + if (virPidFileReadPath(pidfile, &cpid) < 0) { > + virReportError(VIR_ERR_INTERNAL_ERROR, > + _("pr helper %s didn't show up"), prd->alias); > + goto cleanup; > + } > + > + if (virTimeBackOffStart(&timebackoff, 1, timeout) < 0) > + goto cleanup; > + while (virTimeBackOffWait(&timebackoff)) { > + if (virFileExists(prd->path)) > + break; > + > + if (virProcessKill(cpid, 0) == 0) > + continue; > + > + virReportSystemError(errno, > + _("pr helper %s died unexpectedly"), prd->alias); > + goto cleanup; > + } This code does not do anything on timeout. > + > + if (priv->cgroup && > + virCgroupAddMachineTask(priv->cgroup, cpid) < 0) > + goto cleanup; > + > + if (qemuSecurityDomainSetPathLabel(driver->securityManager, > + vm->def, prd->path, true) < 0) > + goto cleanup; So this function will probably fail with a weird error message. 
> + > + priv->prPid = cpid; > + ret = 0; > + cleanup: > + if (ret < 0) { > + virCommandAbort(cmd); > + virProcessKillPainfully(cpid, true); > + } > + virCommandFree(cmd); > + if (pidfile) { > + if (unlink(pidfile) < 0 && > + errno != ENOENT && > + !virGetLastError()) > + virReportSystemError(errno, > + _("Cannot remove stale PID file %s"), > + pidfile); > + VIR_FREE(pidfile); By removing the pidfile, you can't make sure later that the PR helper program is still the same process, thus when killing the VM by the pid number only you might actually kill a different process. I unfortunately already deleted the top of the message so you'll need to fix the function qemuProcessKillPRDaemon to see whether it's the same process, similarly to what we do when reconnecting to qemu. > + } > + virObjectUnref(cfg); > + return ret; > +} > + > + > +static int > +qemuProcessSetupPRDaemon(virDomainObjPtr vm) > +{ > + size_t i; > + int ret = -1; > + > + for (i = 0; i < vm->def->ndisks; i++) { > + if (qemuProcessSetupOnePRDaemon(vm, vm->def->disks[i]) < 0) > + goto cleanup; This looks wrong, since we only do one PR daemon. This function should determine whether one is needed and call that function only once. > + } > + > + ret = 0; > + cleanup: > + if (ret < 0) > + qemuProcessKillPRDaemon(vm); > return ret; > } > -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
On 03/15/2018 01:31 PM, Peter Krempa wrote: > On Wed, Mar 14, 2018 at 17:05:38 +0100, Michal Privoznik wrote: >> Before we exec() qemu we have to spawn pr-helper processes for >> all managed reservations (well, technically there can only one). >> The only caveat there is that we should place the process into >> the same namespace and cgroup as qemu (so that it shares the same >> view of the system). But we can do that only after we've forked. >> That means calling the setup function between fork() and exec(). >> >> Signed-off-by: Michal Privoznik <mprivozn@redhat.com> >> --- >> src/qemu/qemu_process.c | 169 ++++++++++++++++++++++++++++++++++++++++++++++++ >> 1 file changed, 169 insertions(+) >> > >> + >> + priv->prPid = cpid; >> + ret = 0; >> + cleanup: >> + if (ret < 0) { >> + virCommandAbort(cmd); >> + virProcessKillPainfully(cpid, true); >> + } >> + virCommandFree(cmd); >> + if (pidfile) { >> + if (unlink(pidfile) < 0 && >> + errno != ENOENT && >> + !virGetLastError()) >> + virReportSystemError(errno, >> + _("Cannot remove stale PID file %s"), >> + pidfile); >> + VIR_FREE(pidfile); > > By removing the pidfile, you can't make sure later that the PR helper > program is still the same process, thus when killing the VM by the pid > number only you might actually kill a different process. > > I unfortunately already deleted the top of the message so you'll need to > fix the function qemuProcessKillPRDaemon to see whether it's the same > process, similarly to what we do when reconnecting to qemu. Okay, let me write here what I told you in person. I've tried couple of approaches to do proper locking of pidfile: a) run qemu-pr-helper -f $pidfile in combination with virPidFileForceCleanupPath(). This doesn't work because qemu-pr-helper uses lockf() to lock the pidfile. Libvirt uses F_SETLK. These two locks don't know about each other. Sure, we can adapt virPid* to do lockf()-ing. But we will fail miserably if qemu ever decides to change that to say fnctl(). 
b) I've changed qemu-pr-helper to use fnctl() to lock pidfile. They have a nice wrapper over it - qemu_lock_fd(). This still did not fly. Their wrapper prefers F_OFD_SETLK over F_SETLK. OFD stands for Open File Description Locks. The lock is not associated with process but with FD and thus survives fork(). Again, incompatible locks. If file is locked with F_OFD_SETLK another process can F_SETLK it successfully. c) My third attempt was to not rely on qemu-pr-helper locking at all (after all they can change it anytime which could break our tailored solution). So I've modified qemuProcessStartPRDaemonHook() so it calls virPidFileAcquirePath() and leaks the pidfile FD to qemu-pr-helper. This looked promising: command hooks are called just before exec(), after all FDs are closed. Unfortunately, virPidFileAcquirePath() sets FD_CLOEXEC flag, so no leaking is possible. Sure I can call virSetInherit() to cancel the CLOEXEC flag. BUT, here the proposal: IIUC the only problem we have with storing pid and killing it later is that we might end up killing a different process than qemu-pr-helper because qemu-pr-helper may die and another process might be spawn with its PID. Well, this would be solved with the events I'm mentioning in cover letter. The idea is that qemu sends us an event as soon as qemu-pr-helper process dies. So we would spawn a new one and update the stored PID. This way we would never kill wrong process. BTW: don't look into qemuProcessReconnect. It suffers from the same problem. If libvirtd is stopped, qemu quits and another process is spawned with its PID, we kill the new process as soon as we try to reconnect to the monitor, because connecting to the monitor fails (obviously) so we jump onto error label where qemuProcessStop() is called. Worse, if there's OOM situation and we fail to asprintf() pidfile path we jump onto the label too (and massacre the process we think is qemu). 
My proposal is to: 1) keep tracking of PID as-is now, 2) forbid starting domains with <reservations managed='yes'/> until the events are implemented. Alternatively, I can go with c) (in the end I have it working locally) and do all the changes necessary. Frankly, I don't have preference. I also wonder if API design for events and query-* command for reconnect might makes us lean towards one of the options. Michal -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
On Thu, Mar 22, 2018 at 11:16:47AM +0100, Michal Privoznik wrote: > On 03/15/2018 01:31 PM, Peter Krempa wrote: > > On Wed, Mar 14, 2018 at 17:05:38 +0100, Michal Privoznik wrote: > >> Before we exec() qemu we have to spawn pr-helper processes for > >> all managed reservations (well, technically there can only one). > >> The only caveat there is that we should place the process into > >> the same namespace and cgroup as qemu (so that it shares the same > >> view of the system). But we can do that only after we've forked. > >> That means calling the setup function between fork() and exec(). > >> > >> Signed-off-by: Michal Privoznik <mprivozn@redhat.com> > >> --- > >> src/qemu/qemu_process.c | 169 ++++++++++++++++++++++++++++++++++++++++++++++++ > >> 1 file changed, 169 insertions(+) > >> > > > > > >> + > >> + priv->prPid = cpid; > >> + ret = 0; > >> + cleanup: > >> + if (ret < 0) { > >> + virCommandAbort(cmd); > >> + virProcessKillPainfully(cpid, true); > >> + } > >> + virCommandFree(cmd); > >> + if (pidfile) { > >> + if (unlink(pidfile) < 0 && > >> + errno != ENOENT && > >> + !virGetLastError()) > >> + virReportSystemError(errno, > >> + _("Cannot remove stale PID file %s"), > >> + pidfile); > >> + VIR_FREE(pidfile); > > > > By removing the pidfile, you can't make sure later that the PR helper > > program is still the same process, thus when killing the VM by the pid > > number only you might actually kill a different process. > > > > I unfortunately already deleted the top of the message so you'll need to > > fix the function qemuProcessKillPRDaemon to see whether it's the same > > process, similarly to what we do when reconnecting to qemu. In general when pidfiles are locked using lockf()/fcntl(), you should never unlink them unless you have first acquired the lock, and even then the code that acquires the lock needs to be made robust against races. a race condition Our virPidFileAcquirePath copes with race'ing unlinks, but QEMU's code does not. 
We should fix QEMU in this respect too. > > Okay, let me write here what I told you in person. I've tried couple of > approaches to do proper locking of pidfile: > > a) run qemu-pr-helper -f $pidfile in combination with > virPidFileForceCleanupPath(). This doesn't work because qemu-pr-helper > uses lockf() to lock the pidfile. Libvirt uses F_SETLK. These two locks > don't know about each other. Sure, we can adapt virPid* to do > lockf()-ing. But we will fail miserably if qemu ever decides to change > that to say fnctl(). I don't think that is correct - lockf() is just a thin wrapper around fcntl() F_SETLK from what I understand. Many systems do this equivalence, although technically POSIX leaves it undefined. > b) I've changed qemu-pr-helper to use fnctl() to lock pidfile. They have > a nice wrapper over it - qemu_lock_fd(). This still did not fly. Their > wrapper prefers F_OFD_SETLK over F_SETLK. OFD stands for Open File > Description Locks. The lock is not associated with process but with FD > and thus survives fork(). Again, incompatible locks. If file is locked > with F_OFD_SETLK another process can F_SETLK it successfully. QEMU does the OFD_SETLK/SETLK magic because its APIs were primarily designed for use with the block layer, where it must have sane semantics to prevent accidental loss of the lock. The limitations of F_SETLK are not really an issue for pidfiles which are opened once and never closed. So it would be valid to change QEMU to use fcntl(F_SETLK) exclusively for pidfiles. This would be semantically compatible with lockf() on most systems, and get QEMU away from under-specified semantics of lockf(). > c) My third attempt was to not rely on qemu-pr-helper locking at all > (after all they can change it anytime which could break our tailored > solution). So I've modified qemuProcessStartPRDaemonHook() so it calls > virPidFileAcquirePath() and leaks the pidfile FD to qemu-pr-helper.
This > looked promising: command hooks are called just before exec(), after all > FDs are closed. Unfortunately, virPidFileAcquirePath() sets FD_CLOEXEC > flag, so no leaking is possible. Sure I can call virSetInherit() to > cancel the CLOEXEC flag. BUT, here the proposal: > > IIUC the only problem we have with storing pid and killing it later is > that we might end up killing a different process than qemu-pr-helper > because qemu-pr-helper may die and another process might be spawn with > its PID. Well, this would be solved with the events I'm mentioning in > cover letter. The idea is that qemu sends us an event as soon as > qemu-pr-helper process dies. So we would spawn a new one and update the > stored PID. This way we would never kill wrong process. > > BTW: don't look into qemuProcessReconnect. It suffers from the same > problem. If libvirtd is stopped, qemu quits and another process is > spawned with its PID, we kill the new process as soon as we try to > reconnect to the monitor, because connecting to the monitor fails > (obviously) so we jump onto error label where qemuProcessStop() is > called. Worse, if there's OOM situation and we fail to asprintf() > pidfile path we jump onto the label too (and massacre the process we > think is qemu). > > > My proposal is to: > 1) keep tracking of PID as-is now, > 2) forbid starting domains with <reservations managed='yes'/> until the > events are implemented. > > Alternatively, I can go with c) (in the end I have it working locally) > and do all the changes necessary. Frankly, I don't have preference. I > also wonder if API design for events and query-* command for reconnect > might makes us lean towards one of the options. I would suggest doing both a) and b). 
Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :| -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
© 2016 - 2025 Red Hat, Inc.