[Tools 4/6] testbot: Keep track of the revert processes and handle server restarts more gracefully.

Francois Gouget fgouget at codeweavers.com
Tue Mar 26 05:47:48 CDT 2013


On startup the WineTestBot server now:
* Detects tasks that are still running and lets them complete.
* Cleans up and requeues the running tasks that died.
* Lets VMs that are currently reverting complete their revert.
* Leaves the idle powered on VMs and those used by running tasks as it.
* Shuts down any other active VM and marks them as dirty.
This makes it possible to kill and restart a WineTestBot server without impacting the running tasks.
Also, should the server reboot unexpectedly, the tasks that were running will be restarted.
---

Note: This patch requires a database update (see update21.sql).

 testbot/bin/Engine.pl              |  149 +++++++++++++++++++++++++++++++-----
 testbot/bin/RevertVM.pl            |    2 +
 testbot/ddl/update21.sql           |    5 ++
 testbot/ddl/winetestbot.sql        |    1 +
 testbot/doc/winetestbot-schema.dia |   20 +++++
 testbot/lib/WineTestBot/Jobs.pm    |   25 ++++--
 testbot/lib/WineTestBot/Steps.pm   |   23 ++++--
 testbot/lib/WineTestBot/Tasks.pm   |    2 +
 testbot/lib/WineTestBot/VMs.pm     |   19 +++++
 testbot/web/admin/VMDetails.pl     |    9 +++
 10 files changed, 221 insertions(+), 34 deletions(-)
 create mode 100644 testbot/ddl/update21.sql

diff --git a/testbot/bin/Engine.pl b/testbot/bin/Engine.pl
index 0a467fb..f5b0661 100755
--- a/testbot/bin/Engine.pl
+++ b/testbot/bin/Engine.pl
@@ -58,6 +58,136 @@ sub FatalError
   exit 1;
 }
 
+
+=pod
+=over 12
+
+=item C<Cleanup()>
+
+The Cleanup() function gets the tasks and VMs in a consistent state on the
+Engine startup. It has to contend with two main scenarios:
+- The Engine being restarted. Any task started just before that is still
+  running should still have its process and powered on VM and should be left
+  alone so it can complete normally. If a task died unexpectedly while the
+  Engine was done it's ok to requeue it.
+- The Engine startup after a reboot. All task processes will be dead and need
+  to be requeued. But the VMs are likely to be hosted on a separate machine so
+  it is quite possible that they will still be running. Hopefully any running
+  process matching a task's ChildPid will belong to another user so we don't
+  mistake that case for the previous one.
+
+In all cases we only trust that a VM status field is still valid:
+- It is 'running' and used by a task that is still running.
+- It is 'reverting' or 'sleeping', is powered on and the revert process is
+  still running.
+- It is 'idle' and powered on.
+In all other cases the VM will be powered off and marked as dirty.
+
+=back
+=cut
+
+sub Cleanup()
+{
+  # Verify that the running tasks are still alive and requeue them if not.
+  # Ignore the Job and Step status fields because they may be a bit out of date.
+  my %BusyVMs;
+  foreach my $Job (@{CreateJobs()->GetItems()})
+  {
+    my $CallUpdateStatus;
+    foreach my $Step (@{$Job->Steps->GetItems()})
+    {
+      my $Tasks = $Step->Tasks;
+      $Tasks->AddFilter("Status", ["running"]);
+      foreach my $Task (@{$Tasks->GetItems()})
+      {
+        my $TaskKey = join("/", $Job->Id, $Step->No, $Task->No);
+
+        my $Requeue;
+        if (!defined $Task->ChildPid)
+        {
+          # That task's process died somehow.
+          $Requeue = 1;
+        }
+        elsif (!$Task->VM->IsPoweredOn())
+        {
+          # A running task should have a powered on VM.
+          $Requeue = 1;
+          # Kill the task process if it's still there.
+          kill("TERM", $Task->ChildPid);
+        }
+        elsif (!kill(0, $Task->ChildPid))
+        {
+          # The event that caused the WineTestBot server to restart probably
+          # also killed the task's child process.
+          $Requeue = 1;
+        }
+        else
+        {
+          # This task is still running!
+          LogMsg "$TaskKey is still running\n";
+          $BusyVMs{$Task->VM->GetKey()} = 1;
+          next;
+        }
+        if ($Requeue)
+        {
+          LogMsg "Requeuing $TaskKey\n";
+          system("rm", "-r", "-f", "$DataDir/jobs/$TaskKey");
+          $Task->Status("queued");
+          $Task->ChildPid(undef);
+          $Task->Started(undef);
+          $Task->Ended(undef);
+          $Task->TestFailures(undef);
+          $Task->Save();
+          $CallUpdateStatus = 1;
+        }
+      }
+    }
+    # The Job and Steps status fields will actually remain unchanged except if
+    # the task that died was in the first step. In that case they will revert
+    # from 'running' to 'queued'.
+    $Job->UpdateStatus() if ($CallUpdateStatus);
+  }
+
+  # Get the VMs in order now
+  my $VMs = CreateVMs();
+  $VMs->FilterEnabledRole();
+  $VMs->FilterEnabledStatus();
+  foreach my $VM (@{$VMs->GetItems()})
+  {
+    my $VMKey = $VM->GetKey();
+    if ($BusyVMs{$VMKey})
+    {
+      # This VM is still running a task. Let it.
+      LogMsg "$VMKey is used by a task\n";
+      next;
+    }
+
+    if ($VM->IsPoweredOn())
+    {
+      next if ($VM->Status eq "idle");
+      if (($VM->Status eq "reverting" or $VM->Status eq "sleeping") and
+          defined $VM->ChildPid and kill(0, $VM->ChildPid))
+      {
+        # This VM is still being reverted. Let that process run its course.
+        LogMsg "$VMKey is being reverted\n";
+        next;
+      }
+      LogMsg "Marking $VMKey as dirty and shutting it down\n";
+      $VM->PowerOff();
+    }
+    else
+    {
+      next if ($VM->Status eq "dirty");
+      LogMsg "Marking $VMKey as dirty\n";
+    }
+
+    $VM->Status('dirty');
+    $VM->ChildPid(undef);
+    $VM->Save();
+  }
+}
+
+
 sub HandlePing
 {
   return "1pong\n";
@@ -394,23 +524,6 @@ sub ClientRead
   return $GotSomething;
 }
 
-sub InitVMs()
-{
-  # Kill any stale RevertVM.pl process so they don't mess with our VMs
-  system('killall RevertVM.pl 2>/dev/null');
-
-  # On startup we don't know what state the VMs are in. So consider them all
-  # to be dirty.
-  my $VMs = CreateVMs();
-  $VMs->FilterEnabledRole();
-  $VMs->FilterEnabledStatus();
-  foreach my $VM (@{$VMs->GetItems()})
-  {
-    $VM->Status('dirty');
-    $VM->Save();
-  }
-}
-
 =pod
 =over 12
 
@@ -515,7 +628,7 @@ sub main
   $WineTestBot::Engine::Notify::RunningInEngine = 1;
   LogMsg "Starting the WineTestBot Engine\n";
 
-  InitVMs();
+  Cleanup();
 
   my $SockName = "$DataDir/socket/engine";
   my $uaddr = sockaddr_un($SockName);
diff --git a/testbot/bin/RevertVM.pl b/testbot/bin/RevertVM.pl
index f9edeb7..acd3b5b 100755
--- a/testbot/bin/RevertVM.pl
+++ b/testbot/bin/RevertVM.pl
@@ -51,6 +51,7 @@ sub FatalError
   if ($VM)
   {
     $VM->Status("offline");
+    $VM->ChildPid(undef);
     $VM->Save();
 
     my $VMKey = $VM->GetKey();
@@ -133,6 +134,7 @@ if ($SleepAfterRevert != 0)
 }
 
 $VM->Status("idle");
+$VM->ChildPid(undef);
 ($ErrProperty, $ErrMessage) = $VM->Save();
 if (defined($ErrMessage))
 {
diff --git a/testbot/ddl/update21.sql b/testbot/ddl/update21.sql
new file mode 100644
index 0000000..827fd3e
--- /dev/null
+++ b/testbot/ddl/update21.sql
@@ -0,0 +1,5 @@
+USE winetestbot;
+
+ALTER TABLE VMs
+  ADD ChildPid INT(5) NULL
+      AFTER Status;
diff --git a/testbot/ddl/winetestbot.sql b/testbot/ddl/winetestbot.sql
index 4920951..4763d86 100644
--- a/testbot/ddl/winetestbot.sql
+++ b/testbot/ddl/winetestbot.sql
@@ -49,6 +49,7 @@ CREATE TABLE VMs
   Type         ENUM('win32', 'win64', 'build') NOT NULL,
   Role         ENUM('extra', 'base', 'winetest', 'retired', 'deleted') NOT NULL,
   Status       ENUM('dirty', 'reverting', 'sleeping', 'idle', 'running', 'offline') NOT NULL,
+  ChildPid     INT(5)           NULL,
   VirtURI      VARCHAR(64)      NOT NULL,
   VirtDomain   VARCHAR(32)      NOT NULL,
   IdleSnapshot VARCHAR(32)      NOT NULL,
diff --git a/testbot/doc/winetestbot-schema.dia b/testbot/doc/winetestbot-schema.dia
index 5f906f2..2418c8b 100644
--- a/testbot/doc/winetestbot-schema.dia
+++ b/testbot/doc/winetestbot-schema.dia
@@ -2253,6 +2253,26 @@
         </dia:composite>
         <dia:composite type="table_attribute">
           <dia:attribute name="name">
+            <dia:string>#ChildPid#</dia:string>
+          </dia:attribute>
+          <dia:attribute name="type">
+            <dia:string>#INT(5)#</dia:string>
+          </dia:attribute>
+          <dia:attribute name="comment">
+            <dia:string>##</dia:string>
+          </dia:attribute>
+          <dia:attribute name="primary_key">
+            <dia:boolean val="false"/>
+          </dia:attribute>
+          <dia:attribute name="nullable">
+            <dia:boolean val="true"/>
+          </dia:attribute>
+          <dia:attribute name="unique">
+            <dia:boolean val="false"/>
+          </dia:attribute>
+        </dia:composite>
+        <dia:composite type="table_attribute">
+          <dia:attribute name="name">
             <dia:string>#VirtURI#</dia:string>
           </dia:attribute>
           <dia:attribute name="type">
diff --git a/testbot/lib/WineTestBot/Jobs.pm b/testbot/lib/WineTestBot/Jobs.pm
index 61ef113..4b7d425 100644
--- a/testbot/lib/WineTestBot/Jobs.pm
+++ b/testbot/lib/WineTestBot/Jobs.pm
@@ -152,17 +152,26 @@ sub UpdateStatus($)
     }
   }
 
-  # Inherit the steps most significant status with some caveats:
-  # - if one of the steps is still queued then this job is still running
-  # - if a step is skipped it's either because a previous step failed or because
-  #   the user canceled the job. In both cases mark the job as failed
-  # - if all the steps are still queued, then this job's original status,
-  #   'queued', is still valid
-  foreach my $StepStatus ("running", "failed", "skipped", "completed")
+  # Inherit the steps most significant status.
+  # Note that one or more tasks may have been requeued during the cleanup phase
+  # of the server startup. So this job may regress from 'running' back to
+  # 'queued'. This means all possible step status values must be considered.
+  foreach my $StepStatus ("running", "failed", "skipped", "completed", "queued")
   {
     if ($Has{$StepStatus})
     {
-      $Status = $Has{"queued"} ? "running" : $StepStatus eq "skipped" ? "failed" : $StepStatus;
+      if ($Has{"queued"})
+      {
+        # Either nothing ran so this job is still / again 'queued', or not
+        # everything has been run yet which means it's still 'running'.
+        $Status = $StepStatus eq "queued" ? "queued" : "running";
+      }
+      else
+      {
+        # If all steps are skipped it's because the user canceled the job. In
+        # that case mark the job as 'failed'.
+        $Status = $StepStatus eq "skipped" ? "failed" : $StepStatus;
+      }
       $self->Status($Status);
       if ($Status ne "running" && $Status ne "queued" && !defined $self->Ended)
       {
diff --git a/testbot/lib/WineTestBot/Steps.pm b/testbot/lib/WineTestBot/Steps.pm
index 37fd3f9..acbab81 100644
--- a/testbot/lib/WineTestBot/Steps.pm
+++ b/testbot/lib/WineTestBot/Steps.pm
@@ -105,17 +105,24 @@ sub UpdateStatus($$)
     $Has{$Task->UpdateStatus($Skip)} = 1;
   }
 
-  # Inherit the tasks most significant status with some caveats:
-  # - if one of the tasks is still queued then this step is still running
-  # - if all the tasks are still queued, then this step's original status,
-  #   'queued', is still valid
-  # - if we tagged some tasks as skipped, then the step status will no longer
-  #   be 'queued', which means we will save the step and hence its tasks
-  foreach my $TaskStatus ("running", "failed", "skipped", "completed")
+  # Inherit the tasks most significant status.
+  # Note that one or more tasks may have been requeued during the cleanup phase
+  # of the server startup. So this step may regress from 'running' back to
+  # 'queued'. This means all possible task status values must be considered.
+  foreach my $TaskStatus ("running", "failed", "skipped", "completed", "queued")
   {
     if ($Has{$TaskStatus})
     {
-      $Status = $Has{"queued"} ? "running" : $TaskStatus;
+      if ($Has{"queued"})
+      {
+        # Either nothing ran so this step is still / again 'queued', or not
+        # everything has been run yet which means it's still 'running'.
+        $Status = $TaskStatus eq "queued" ? "queued" : "running";
+      }
+      else
+      {
+        $Status = $TaskStatus;
+      }
       $self->Status($Status);
       $self->Save();
       last;
diff --git a/testbot/lib/WineTestBot/Tasks.pm b/testbot/lib/WineTestBot/Tasks.pm
index 8ca9da0..c65f7cf 100644
--- a/testbot/lib/WineTestBot/Tasks.pm
+++ b/testbot/lib/WineTestBot/Tasks.pm
@@ -161,6 +161,8 @@ sub UpdateStatus
         close TASKLOG;
       }
       umask($OldUMask);
+      # This probably indicates a bug in the task script.
+      # Don't requeue the task to avoid an infinite loop.
       LogMsg "Child process for task $JobId/$StepNo/$TaskNo died unexpectedly\n";
       $self->Status("failed");
       $Status = "failed";
diff --git a/testbot/lib/WineTestBot/VMs.pm b/testbot/lib/WineTestBot/VMs.pm
index ff57d5a..eea9e78 100644
--- a/testbot/lib/WineTestBot/VMs.pm
+++ b/testbot/lib/WineTestBot/VMs.pm
@@ -330,6 +330,15 @@ sub RemoveSnapshot($$)
   return undef;
 }
 
+sub IsPoweredOn($)
+{
+  my ($self) = @_;
+
+  my ($ErrMessage, $Domain) = $self->_GetDomain();
+  return undef if (defined $ErrMessage);
+  return $Domain->is_active();
+}
+
 sub PowerOn
 {
   my ($self) = @_;
@@ -513,6 +522,14 @@ sub RunRevert
     exit(1);
   }
 
+  # Note that if the child process completes quickly (typically due to some
+  # error), it may set ChildPid to undef before we get here. So we may end up
+  # with non-reverting VM for which ChildPid is set. That's ok because
+  # ChildPid should be ignored anyway if Status is not 'reverting' or
+  # 'sleeping'.
+  $self->ChildPid($Pid);
+  $self->Save();
+
   return undef;
 }
 
@@ -556,6 +573,8 @@ BEGIN
     CreateEnumPropertyDescriptor("Type", "Type of VM", !1, 1, ['win32', 'win64', 'build']),
     CreateEnumPropertyDescriptor("Role", "VM Role", !1, 1, ['extra', 'base', 'winetest', 'retired', 'deleted']),
     CreateEnumPropertyDescriptor("Status", "Current status", !1, 1, ['dirty', 'reverting', 'sleeping', 'idle', 'running', 'offline']),
+    # Note: ChildPid is only valid when Status == 'reverting' or 'sleeping'.
+    CreateBasicPropertyDescriptor("ChildPid", "Child process id", !1, !1, "N", 5),
     CreateBasicPropertyDescriptor("VirtURI", "LibVirt URI of the VM", !1, 1, "A", 64),
     CreateBasicPropertyDescriptor("VirtDomain", "LibVirt Domain for the VM", !1, 1, "A", 32),
     CreateBasicPropertyDescriptor("IdleSnapshot", "Name of idle snapshot", !1, 1, "A", 32),
diff --git a/testbot/web/admin/VMDetails.pl b/testbot/web/admin/VMDetails.pl
index f24061c..bd0c777 100644
--- a/testbot/web/admin/VMDetails.pl
+++ b/testbot/web/admin/VMDetails.pl
@@ -33,6 +33,15 @@ sub _initialize
   $self->SUPER::_initialize(@_, CreateVMs());
 }
 
+sub DisplayProperty
+{
+  my $self = shift;
+  my $PropertyDescriptor = $_[0];
+
+  return "" if ($PropertyDescriptor->GetName() eq "ChildPid");
+  return $self->SUPER::DisplayProperty(@_);
+}
+
 package main;
 
 my $Request = shift;
-- 
1.7.10.4




More information about the wine-patches mailing list