From 1b905ac8567ab9d1137cb057911d38b9c3600bf3 Mon Sep 17 00:00:00 2001 From: Tim Smith Date: Tue, 11 Jun 2024 14:08:26 +0100 Subject: [PATCH] CP-49775 convert SMGC to systemd service Add a templated oneshot service and use systemctl to start it. The systemctl command is invoked with "--no-wait" so that it returns immediately because the "oneshot" nature of the service would otherwise wait until the command completes. Note that because the service is a oneshot with RemainAfterExit=no (so we can multiply start it), it will go from "activating" to "dead" without ever appearing to be "running". Signed-off-by: Tim Smith --- Makefile | 2 ++ drivers/FileSR.py | 2 +- drivers/LVHDSR.py | 4 ++-- drivers/cleanup.py | 40 ++++++++++++++++++++++++++++++++-------- mk/sm.spec.in | 1 + systemd/SMGC@.service | 12 ++++++++++++ 6 files changed, 50 insertions(+), 11 deletions(-) create mode 100644 systemd/SMGC@.service diff --git a/Makefile b/Makefile index 972f671f..ad81be40 100755 --- a/Makefile +++ b/Makefile @@ -167,6 +167,8 @@ install: precheck $(SM_STAGING)/$(SYSTEMD_SERVICE_DIR) install -m 644 systemd/sr_health_check.timer \ $(SM_STAGING)/$(SYSTEMD_SERVICE_DIR) + install -m 644 systemd/SMGC@.service \ + $(SM_STAGING)/$(SYSTEMD_SERVICE_DIR) for i in $(UDEV_RULES); do \ install -m 644 udev/$$i.rules \ $(SM_STAGING)$(UDEV_RULES_DIR); done diff --git a/drivers/FileSR.py b/drivers/FileSR.py index cceecc02..ba9b1863 100755 --- a/drivers/FileSR.py +++ b/drivers/FileSR.py @@ -379,7 +379,7 @@ def _process_replay(self, data): def _kickGC(self): util.SMlog("Kicking GC") - cleanup.start_gc(self.session, self.uuid) + cleanup.start_gc_service(self.uuid) def _isbind(self): # os.path.ismount can't deal with bind mount diff --git a/drivers/LVHDSR.py b/drivers/LVHDSR.py index 2c7bec15..708866da 100755 --- a/drivers/LVHDSR.py +++ b/drivers/LVHDSR.py @@ -892,7 +892,7 @@ def _handleInterruptedCoalesceLeaf(self): entries = self.journaler.getAll(cleanup.VDI.JRN_LEAF) if len(entries) > 0: util.SMlog("*** INTERRUPTED COALESCE-LEAF OP DETECTED ***") - cleanup.gc_force(self.session, self.uuid) + cleanup.start_gc_service(self.uuid, True) self.lvmCache.refresh() def _handleInterruptedCloneOp(self, origUuid, jval, forceUndo=False): @@ -1290,7 +1290,7 @@ def _prepareTestMode(self): def _kickGC(self): util.SMlog("Kicking GC") - cleanup.start_gc(self.session, self.uuid) + cleanup.start_gc_service(self.uuid) def ensureCBTSpace(self): # Ensure we have space for at least one LV diff --git a/drivers/cleanup.py b/drivers/cleanup.py index 6f9917b7..042fe998 100755 --- a/drivers/cleanup.py +++ b/drivers/cleanup.py @@ -3206,6 +3206,10 @@ def gc(session, srUuid, inBackground, dryRun=False): def start_gc(session, sr_uuid): + """ + This function is used to try to start a backgrounded GC session by forking + the current process. If using the systemd version, call start_gc_service() instead. + """ # don't bother if an instance already running (this is just an # optimization to reduce the overhead of forking a new process if we # don't have to, but the process will check the lock anyways) @@ -3230,6 +3234,24 @@ def start_gc(session, sr_uuid): subprocess.run([__file__, '-b', '-u', sr_uuid, '-g'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) +def start_gc_service(sr_uuid, wait=False): + """ + This starts the templated systemd service which runs GC on the given SR UUID. + If the service was already started, this is a no-op. + + Because the service is a one-shot with RemainAfterExit=no, when called with + wait=True this will run the service synchronously and will not return until the + run has finished. This is used to force a run of the GC instead of just kicking it + in the background. + """ + sr_uuid_esc = sr_uuid.replace("-", "\\x2d") + util.SMlog(f"Kicking SMGC@{sr_uuid}...") + cmd=[ "/usr/bin/systemctl", "--quiet" ] + if not wait: + cmd.append("--no-block") + cmd += ["start", f"SMGC@{sr_uuid_esc}"] + subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) + def gc_force(session, srUuid, force=False, dryRun=False, lockSR=False): """Garbage collect all deleted VDIs in SR "srUuid". The caller must ensure @@ -3263,15 +3285,17 @@ def gc_force(session, srUuid, force=False, dryRun=False, lockSR=False): def get_state(srUuid): - """Return whether GC/coalesce is currently running or not. The information - is not guaranteed for any length of time if the call is not protected by - locking. + """Return whether GC/coalesce is currently running or not. This asks systemd for + the state of the templated SMGC service and will return True if it is "activating" + or "running" (for completeness, as in practice it will never achieve the latter state) """ - init(srUuid) - if lockGCActive.acquireNoblock(): - lockGCActive.release() - return False - return True + sr_uuid_esc = srUuid.replace("-", "\\x2d") + cmd=[ "/usr/bin/systemctl", "is-active", f"SMGC@{sr_uuid_esc}"] + result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) + state = result.stdout.decode('utf-8').rstrip() + if state == "activating" or state == "running": + return True + return False def should_preempt(session, srUuid): diff --git a/mk/sm.spec.in b/mk/sm.spec.in index dc7661d3..faee7bf1 100755 --- a/mk/sm.spec.in +++ b/mk/sm.spec.in @@ -212,6 +212,7 @@ tests/run_python_unittests.sh %{_unitdir}/usb-scan.socket %{_unitdir}/mpathcount.service %{_unitdir}/mpathcount.socket +%{_unitdir}/SMGC@.service %config /etc/udev/rules.d/65-multipath.rules %config /etc/udev/rules.d/55-xs-mpath-scsidev.rules %config /etc/udev/rules.d/58-xapi.rules diff --git a/systemd/SMGC@.service b/systemd/SMGC@.service new file mode 100644 index 00000000..54ab1b7d --- /dev/null +++ b/systemd/SMGC@.service @@ -0,0 +1,12 @@ +[Unit] +Description=Garbage Collector for SR %I +DefaultDependencies=no + +[Service] +Type=oneshot +Restart=no +ExecStart=/opt/xensource/sm/cleanup.py -g -u %I +# This is the default, but just to make it clear we may run this +# service multiple times. When running, it will show as "activating"; +# when not running, it will show as "dead" +RemainAfterExit=no