Francois Gouget : testbot: Monitor offline VMs and automatically put them back online.

Alexandre Julliard julliard at winehq.org
Fri Oct 20 01:38:21 CDT 2017


Module: tools
Branch: master
Commit: 51b379e666f7962a00d143208f1384282dfd1306
URL:    http://source.winehq.org/git/tools.git/?a=commit;h=51b379e666f7962a00d143208f1384282dfd1306

Author: Francois Gouget <fgouget at codeweavers.com>
Date:   Thu Oct 19 14:53:27 2017 +0200

testbot: Monitor offline VMs and automatically put them back online.

When a VM goes offline, start a LibvirtTool process to monitor it and
put it back online as soon as we can access it again.
This makes the TestBot resilient to temporary network outages and VM
host reboots.

Signed-off-by: Francois Gouget <fgouget at codeweavers.com>
Signed-off-by: Alexandre Julliard <julliard at winehq.org>

---

 testbot/bin/Engine.pl           |  4 ++-
 testbot/bin/LibvirtTool.pl      | 60 ++++++++++++++++++++++++++++++++++++++---
 testbot/lib/WineTestBot/Jobs.pm | 18 ++++++++-----
 testbot/lib/WineTestBot/VMs.pm  | 31 ++++++++++++++++++---
 4 files changed, 100 insertions(+), 13 deletions(-)

diff --git a/testbot/bin/Engine.pl b/testbot/bin/Engine.pl
index 02c361c..e6755bc 100755
--- a/testbot/bin/Engine.pl
+++ b/testbot/bin/Engine.pl
@@ -181,7 +181,9 @@ sub Cleanup(;$$)
       if ($KillVMs)
       {
         $VM->KillChild();
-        $VM->RunPowerOff();
+        # $KillVMs is normally used on shutdown so don't start a process that
+        # will get stuck 'forever' waiting for an offline VM.
+        $VM->RunPowerOff() if ($VM->Status ne "offline");
       }
       elsif (!$VM->CanHaveChild())
       {
diff --git a/testbot/bin/LibvirtTool.pl b/testbot/bin/LibvirtTool.pl
index fcc0e81..ab97261 100755
--- a/testbot/bin/LibvirtTool.pl
+++ b/testbot/bin/LibvirtTool.pl
@@ -1,7 +1,7 @@
 #!/usr/bin/perl -Tw
 # -*- Mode: Perl; perl-indent-level: 2; indent-tabs-mode: nil -*-
 #
-# Performs poweroff and revert operations on the specified VM.
+# Performs poweroff, revert and other operations on the specified VM.
 # These operations can take quite a bit of time, particularly in case of
 # network trouble, and thus are best performed in a separate process.
 #
@@ -104,7 +104,7 @@ while (@ARGV)
   {
     $LogOnly = 1;
   }
-  elsif ($Arg =~ /^(?:checkidle|poweroff|revert)$/)
+  elsif ($Arg =~ /^(?:checkidle|monitor|poweroff|revert)$/)
   {
     $Action = $Arg;
   }
@@ -163,7 +163,7 @@ if (!defined $Usage)
 }
 if (defined $Usage)
 {
-  print "Usage: $Name0 [--debug] [--log-only] [--help] (checkidle|poweroff|revert) VMName\n";
+  print "Usage: $Name0 [--debug] [--log-only] [--help] (checkidle|monitor|poweroff|revert) VMName\n";
   exit $Usage;
 }
 
@@ -251,6 +251,56 @@ sub ChangeStatus($$;$)
   return 0;
 }
 
+sub Monitor()  # Polls the offline $VMKey VM until it can be switched to 'off' or monitoring becomes pointless.
+{  # Returns 1 if the VM stopped being relevant or ChangeStatus() returned true, 0 otherwise; the caller stores it in $Rc.
+  $CurrentStatus = "offline";  # file-level variable; its readers are outside this block
+  while (1)
+  {
+    # Get a fresh status
+    $VM = CreateVMs()->GetItem($VMKey);
+    if (!defined $VM or $VM->Role eq "retired" or $VM->Role eq "deleted" or
+        $VM->Status eq "maintenance")
+    {  # The VM is gone, retired, deleted or in maintenance: notify and give up
+      my $Reason = $VM ? "Role=". $VM->Role ."\nStatus=". $VM->Status :
+                         "$VMKey does not exist anymore";
+      NotifyAdministrator("The $VMKey VM is not relevant anymore",
+                          "The $VMKey VM was offline but ceased to be relevant after ".
+                          Elapsed($Start). " seconds:\n\n$Reason\n");
+      return 1;
+    }
+    if ($VM->Status ne "offline")
+    {  # The status was changed by something other than this loop: notify and stop
+      NotifyAdministrator("The $VMKey VM is working again (". $VM->Status .")",
+                          "The status of the $VMKey VM unexpectedly switched from offline\n".
+                          "to ". $VM->Status ." after ". Elapsed($Start)
+                          ." seconds.");
+      return 0;
+    }
+
+    my $IsPoweredOn = $VM->GetDomain()->IsPoweredOn();  # presumably undef when the hypervisor cannot be reached -- TODO confirm
+    if ($IsPoweredOn)
+    {  # The VM is running: power it off before marking it 'off'
+      my $ErrMessage = $VM->GetDomain()->PowerOff(1);
+      if (defined $ErrMessage)
+      {
+        Error "$ErrMessage\n";
+        $IsPoweredOn = undef;  # makes the defined() check below fail so we retry after the sleep
+      }
+    }
+    if (defined $IsPoweredOn)
+    {  # The power state is known and the VM is not running (anymore): try to leave the offline status
+      return 1 if (ChangeStatus("offline", "off", "done"));  # a true result is treated as a failure
+      NotifyAdministrator("The $VMKey VM is working again",
+                          "The $VMKey VM started working again after ".
+                          Elapsed($Start) ." seconds.");
+      return 0;
+    }
+
+    Debug(Elapsed($Start), " $VMKey is still unreachable\n");
+    sleep(60);  # wait a minute before polling again
+  }
+}
+
 sub PowerOff()
 {
   # Power off VMs no matter what their initial status is
@@ -336,6 +386,10 @@ if ($Action eq "checkidle")
 {
   $Rc = CheckIdle();
 }
+elsif ($Action eq "monitor")
+{
+  $Rc = Monitor();
+}
 elsif ($Action eq "poweroff")
 {
   $Rc = PowerOff();
diff --git a/testbot/lib/WineTestBot/Jobs.pm b/testbot/lib/WineTestBot/Jobs.pm
index 6e7937f..de639c7 100644
--- a/testbot/lib/WineTestBot/Jobs.pm
+++ b/testbot/lib/WineTestBot/Jobs.pm
@@ -215,24 +215,23 @@ sub Cancel($)
     $Tasks->AddFilter("Status", ["queued", "running"]);
     foreach my $Task (@{$Tasks->GetItems()})
     {
+      my $VM = $Task->VM;
       if ($Task->Status eq "queued")
       {
         $Task->Status("skipped");
         my ($EProperty, $EMessage) = $Task->Save();
         $ErrMessage ||= "$EMessage ($EProperty)" if ($EMessage);
       }
-      elsif (defined $Task->ChildPid)
+      elsif (defined $VM->ChildPid)
       {
         require WineTestBot::Log;
         WineTestBot::Log::LogMsg("Canceling the " . join("/", $self->Id, $Step->No, $Task->No) . " task\n");
-        kill("TERM", $Task->ChildPid);
         $Task->Status("canceled");
-        $Task->ChildPid(undef);
         my ($EProperty, $EMessage) = $Task->Save();
         $ErrMessage ||= "$EMessage ($EProperty)" if ($EMessage);
 
-        my $VM = $Task->VM;
         $VM->Status('dirty');
+        $VM->KillChild();
         ($EProperty, $EMessage) = $VM->Save();
         $ErrMessage ||= "$EMessage ($EProperty)" if ($EMessage);
       }
@@ -285,7 +284,6 @@ sub Restart($)
         system("rm", "-rf", "$JobDir/" . $Step->No . "/" . $Task->No);
       }
       $Task->Status("queued");
-      $Task->ChildPid(undef);
       $Task->Started(undef);
       $Task->Ended(undef);
       $Task->TestFailures(undef);
@@ -493,6 +491,14 @@ sub ScheduleOnHost($$$)
     {
       $RunningCount++;
     }
+    elsif ($VMStatus eq "offline")
+    {
+      if (!$VM->HasRunningChild())
+      {
+        my $ErrMessage = $VM->RunMonitor();
+        return $ErrMessage if (defined $ErrMessage);
+      }
+    }
     else
     {
       my $Priority = $VM->Type eq "build" ? 10 :
@@ -617,7 +623,7 @@ sub ScheduleOnHost($$$)
     next if (exists $VMsToRevert{$VMKey});
 
     my $VM = $HostVMs->GetItem($VMKey);
-    next if ($VM->Status ne "dirty" or defined $VM->ChildPid);
+    next if ($VM->Status ne "dirty" or $VM->HasRunningChild());
 
     my $ErrMessage = $VM->RunPowerOff();
     return $ErrMessage if (defined $ErrMessage);
diff --git a/testbot/lib/WineTestBot/VMs.pm b/testbot/lib/WineTestBot/VMs.pm
index 377eb0a..e71ad21 100644
--- a/testbot/lib/WineTestBot/VMs.pm
+++ b/testbot/lib/WineTestBot/VMs.pm
@@ -241,7 +241,7 @@ Returns true if the VM status is compatible with ChildPid being set.
 sub CanHaveChild($)
 {
   my ($self) = @_;
-  return ($self->Status =~ /^(?:dirty|reverting|sleeping)$/);
+  return ($self->Status =~ /^(?:dirty|reverting|sleeping|offline)$/);
 }
 
 =pod
@@ -416,6 +416,31 @@ sub RunCheckIdle($)
   return $self->_RunVMTool("dirty", ["--log-only", "checkidle", $self->GetKey()]);
 }
 
+=pod
+=over 12
+
+=item C<RunMonitor()>
+
+Monitors an offline VM to detect when it becomes accessible again.
+When the VM can again be accessed through the hypervisor it is powered off and
+its status is set to 'off'.
+
+This operation can obviously take a long time so it is performed in a separate
+process.
+
+=back
+=cut
+
+sub RunMonitor($)  # Runs the 'monitor' VM tool action on this VM via _RunVMTool() and returns its result.
+{
+  my ($self) = @_;
+  # "offline" is presumably the status _RunVMTool() sets; in fact it is already set as the visible caller only monitors offline VMs
+  return $self->_RunVMTool("offline", ["--log-only", "monitor", $self->GetKey()]);
+}
+
+=pod
+=over 12
+
 =item C<RunPowerOff()>
 
 Powers off the VM so that it stops using resources.
@@ -547,8 +572,8 @@ sub FilterEnabledRole($)
 sub FilterEnabledStatus($)
 {
   my ($self) = @_;
-  # Filter out the disabled VMs, that is the offline and maintenance ones
-  $self->AddFilter("Status", ["dirty", "reverting", "sleeping", "idle", "running", "off"]);
+  # Filter out the maintenance VMs
+  $self->AddFilter("Status", ["dirty", "reverting", "sleeping", "idle", "running", "off", "offline"]);
 }
 
 sub FilterHypervisors($$)




More information about the wine-cvs mailing list