[Tools] testbot: Monitor offline VMs and automatically put them back online.

Francois Gouget fgouget at codeweavers.com
Thu Oct 19 07:53:27 CDT 2017


When a VM goes offline, start a LibvirtTool process to monitor it and
put it back online as soon as we can access it again.
This makes the TestBot resilient to temporary network outages and VM
host reboots.

Signed-off-by: Francois Gouget <fgouget at codeweavers.com>
---
 testbot/bin/Engine.pl           |  4 ++-
 testbot/bin/LibvirtTool.pl      | 60 ++++++++++++++++++++++++++++++++++++++---
 testbot/lib/WineTestBot/Jobs.pm | 18 ++++++++-----
 testbot/lib/WineTestBot/VMs.pm  | 31 ++++++++++++++++++---
 4 files changed, 100 insertions(+), 13 deletions(-)

diff --git a/testbot/bin/Engine.pl b/testbot/bin/Engine.pl
index 02c361c4..e6755bcb 100755
--- a/testbot/bin/Engine.pl
+++ b/testbot/bin/Engine.pl
@@ -181,7 +181,9 @@ sub Cleanup(;$$)
       if ($KillVMs)
       {
         $VM->KillChild();
-        $VM->RunPowerOff();
+        # $KillVMs is normally used on shutdown so don't start a process that
+        # will get stuck 'forever' waiting for an offline VM.
+        $VM->RunPowerOff() if ($VM->Status ne "offline");
       }
       elsif (!$VM->CanHaveChild())
       {
diff --git a/testbot/bin/LibvirtTool.pl b/testbot/bin/LibvirtTool.pl
index fcc0e814..ab972614 100755
--- a/testbot/bin/LibvirtTool.pl
+++ b/testbot/bin/LibvirtTool.pl
@@ -1,7 +1,7 @@
 #!/usr/bin/perl -Tw
 # -*- Mode: Perl; perl-indent-level: 2; indent-tabs-mode: nil -*-
 #
-# Performs poweroff and revert operations on the specified VM.
+# Performs poweroff, revert and other operations on the specified VM.
 # These operations can take quite a bit of time, particularly in case of
 # network trouble, and thus are best performed in a separate process.
 #
@@ -104,7 +104,7 @@ while (@ARGV)
   {
     $LogOnly = 1;
   }
-  elsif ($Arg =~ /^(?:checkidle|poweroff|revert)$/)
+  elsif ($Arg =~ /^(?:checkidle|monitor|poweroff|revert)$/)
   {
     $Action = $Arg;
   }
@@ -163,7 +163,7 @@ if (!defined $Usage)
 }
 if (defined $Usage)
 {
-  print "Usage: $Name0 [--debug] [--log-only] [--help] (checkidle|poweroff|revert) VMName\n";
+  print "Usage: $Name0 [--debug] [--log-only] [--help] (checkidle|monitor|poweroff|revert) VMName\n";
   exit $Usage;
 }
 
@@ -251,6 +251,56 @@ sub ChangeStatus($$;$)
   return 0;
 }
 
+sub Monitor()
+{
+  $CurrentStatus = "offline";
+  while (1)
+  {
+    # Get a fresh status
+    $VM = CreateVMs()->GetItem($VMKey);
+    if (!defined $VM or $VM->Role eq "retired" or $VM->Role eq "deleted" or
+        $VM->Status eq "maintenance")
+    {
+      my $Reason = $VM ? "Role=". $VM->Role ."\nStatus=". $VM->Status :
+                         "$VMKey does not exist anymore";
+      NotifyAdministrator("The $VMKey VM is not relevant anymore",
+                          "The $VMKey VM was offline but ceased to be relevant after ".
+                          Elapsed($Start). " seconds:\n\n$Reason\n");
+      return 1;
+    }
+    if ($VM->Status ne "offline")
+    {
+      NotifyAdministrator("The $VMKey VM is working again (". $VM->Status .")",
+                          "The status of the $VMKey VM unexpectedly switched from offline\n".
+                          "to ". $VM->Status ." after ". Elapsed($Start)
+                          ." seconds.");
+      return 0;
+    }
+
+    my $IsPoweredOn = $VM->GetDomain()->IsPoweredOn();
+    if ($IsPoweredOn)
+    {
+      my $ErrMessage = $VM->GetDomain()->PowerOff(1);
+      if (defined $ErrMessage)
+      {
+        Error "$ErrMessage\n";
+        $IsPoweredOn = undef;
+      }
+    }
+    if (defined $IsPoweredOn)
+    {
+      return 1 if (ChangeStatus("offline", "off", "done"));
+      NotifyAdministrator("The $VMKey VM is working again",
+                          "The $VMKey VM started working again after ".
+                          Elapsed($Start) ." seconds.");
+      return 0;
+    }
+
+    Debug(Elapsed($Start), " $VMKey is still unreachable\n");
+    sleep(60);
+  }
+}
+
 sub PowerOff()
 {
   # Power off VMs no matter what their initial status is
@@ -336,6 +386,10 @@ if ($Action eq "checkidle")
 {
   $Rc = CheckIdle();
 }
+elsif ($Action eq "monitor")
+{
+  $Rc = Monitor();
+}
 elsif ($Action eq "poweroff")
 {
   $Rc = PowerOff();
diff --git a/testbot/lib/WineTestBot/Jobs.pm b/testbot/lib/WineTestBot/Jobs.pm
index 6e7937f4..de639c72 100644
--- a/testbot/lib/WineTestBot/Jobs.pm
+++ b/testbot/lib/WineTestBot/Jobs.pm
@@ -215,24 +215,23 @@ sub Cancel($)
     $Tasks->AddFilter("Status", ["queued", "running"]);
     foreach my $Task (@{$Tasks->GetItems()})
     {
+      my $VM = $Task->VM;
       if ($Task->Status eq "queued")
       {
         $Task->Status("skipped");
         my ($EProperty, $EMessage) = $Task->Save();
         $ErrMessage ||= "$EMessage ($EProperty)" if ($EMessage);
       }
-      elsif (defined $Task->ChildPid)
+      elsif (defined $VM->ChildPid)
       {
         require WineTestBot::Log;
         WineTestBot::Log::LogMsg("Canceling the " . join("/", $self->Id, $Step->No, $Task->No) . " task\n");
-        kill("TERM", $Task->ChildPid);
         $Task->Status("canceled");
-        $Task->ChildPid(undef);
         my ($EProperty, $EMessage) = $Task->Save();
         $ErrMessage ||= "$EMessage ($EProperty)" if ($EMessage);
 
-        my $VM = $Task->VM;
         $VM->Status('dirty');
+        $VM->KillChild();
         ($EProperty, $EMessage) = $VM->Save();
         $ErrMessage ||= "$EMessage ($EProperty)" if ($EMessage);
       }
@@ -285,7 +284,6 @@ sub Restart($)
         system("rm", "-rf", "$JobDir/" . $Step->No . "/" . $Task->No);
       }
       $Task->Status("queued");
-      $Task->ChildPid(undef);
       $Task->Started(undef);
       $Task->Ended(undef);
       $Task->TestFailures(undef);
@@ -493,6 +491,14 @@ sub ScheduleOnHost($$$)
     {
       $RunningCount++;
     }
+    elsif ($VMStatus eq "offline")
+    {
+      if (!$VM->HasRunningChild())
+      {
+        my $ErrMessage = $VM->RunMonitor();
+        return $ErrMessage if (defined $ErrMessage);
+      }
+    }
     else
     {
       my $Priority = $VM->Type eq "build" ? 10 :
@@ -617,7 +623,7 @@ sub ScheduleOnHost($$$)
     next if (exists $VMsToRevert{$VMKey});
 
     my $VM = $HostVMs->GetItem($VMKey);
-    next if ($VM->Status ne "dirty" or defined $VM->ChildPid);
+    next if ($VM->Status ne "dirty" or $VM->HasRunningChild());
 
     my $ErrMessage = $VM->RunPowerOff();
     return $ErrMessage if (defined $ErrMessage);
diff --git a/testbot/lib/WineTestBot/VMs.pm b/testbot/lib/WineTestBot/VMs.pm
index 377eb0ad..e71ad218 100644
--- a/testbot/lib/WineTestBot/VMs.pm
+++ b/testbot/lib/WineTestBot/VMs.pm
@@ -241,7 +241,7 @@ Returns true if the VM status is compatible with ChildPid being set.
 sub CanHaveChild($)
 {
   my ($self) = @_;
-  return ($self->Status =~ /^(?:dirty|reverting|sleeping)$/);
+  return ($self->Status =~ /^(?:dirty|reverting|sleeping|offline)$/);
 }
 
 =pod
@@ -416,6 +416,31 @@ sub RunCheckIdle($)
   return $self->_RunVMTool("dirty", ["--log-only", "checkidle", $self->GetKey()]);
 }
 
+=pod
+=over 12
+
+=item C<RunMonitor()>
+
+Monitors an offline VM to detect when it becomes accessible again.
+When the VM can again be accessed through the hypervisor it is powered off and
+its status is set to 'off'.
+
+This operation can obviously take a long time so it is performed in a separate
+process.
+
+=back
+=cut
+
+sub RunMonitor($)
+{
+  my ($self) = @_;
+  # In fact the status is already set
+  return $self->_RunVMTool("offline", ["--log-only", "monitor", $self->GetKey()]);
+}
+
+=pod
+=over 12
+
 =item C<RunPowerOff()>
 
 Powers off the VM so that it stops using resources.
@@ -547,8 +572,8 @@ sub FilterEnabledRole($)
 sub FilterEnabledStatus($)
 {
   my ($self) = @_;
-  # Filter out the disabled VMs, that is the offline and maintenance ones
-  $self->AddFilter("Status", ["dirty", "reverting", "sleeping", "idle", "running", "off"]);
+  # Filter out the maintenance VMs
+  $self->AddFilter("Status", ["dirty", "reverting", "sleeping", "idle", "running", "off", "offline"]);
 }
 
 sub FilterHypervisors($$)
-- 
2.14.2



More information about the wine-patches mailing list