[Tools] testbot: Rerun the task in case of a TestBot error.

Francois Gouget fgouget at codeweavers.com
Mon Jun 26 12:00:44 CDT 2017


This improves robustness in the face of network or virtualization
errors. Use the $MaxTaskTries setting to configure the maximum number
of attempts.
Note that this does not affect regular failures, and thus does not help
with flaky tests; these should be handled in TestLauncher.

Signed-off-by: Francois Gouget <fgouget at codeweavers.com>
---


 testbot/bin/WineRunBuild.pl       | 52 +++++++++++++++++++++++++++--------
 testbot/bin/WineRunReconfig.pl    | 57 +++++++++++++++++++++++++++++---------
 testbot/bin/WineRunTask.pl        | 58 +++++++++++++++++++++++++++------------
 testbot/lib/WineTestBot/Config.pm |  8 ++++--
 testbot/lib/WineTestBot/Tasks.pm  |  6 ++++
 5 files changed, 138 insertions(+), 43 deletions(-)

diff --git a/testbot/bin/WineRunBuild.pl b/testbot/bin/WineRunBuild.pl
index 5dfb65f6..56a92f9f 100755
--- a/testbot/bin/WineRunBuild.pl
+++ b/testbot/bin/WineRunBuild.pl
@@ -191,12 +191,36 @@ sub LogTaskError($)
   umask($OldUMask);
 }
 
-sub WrapUpAndExit($)
+sub WrapUpAndExit($;$)
 {
-  my ($Status) = @_;
+  my ($Status, $Retry) = @_;
+  my $NewVMStatus = $Status eq 'queued' ? 'offline' : 'dirty';
+
+  my $TestFailures;
+  my $Tries = $Task->TestFailures || 0;
+  if ($Retry)
+  {
+    # This may be a transient error (e.g. a network glitch)
+    # so retry a few times to improve robustness
+    $Tries++;
+    if ($Task->CanRetry())
+    {
+      $Status = 'queued';
+      $TestFailures = $Tries;
+    }
+    else
+    {
+      LogTaskError("Giving up after $Tries run(s)\n");
+    }
+  }
+  elsif ($Tries >= 1)
+  {
+    LogTaskError("The previous $Tries run(s) terminated abnormally\n");
+  }
 
   # Update the Task and Job
   $Task->Status($Status);
+  $Task->TestFailures($TestFailures);
   $Task->ChildPid(undef);
   if ($Status eq 'queued')
   {
@@ -215,25 +239,25 @@ sub WrapUpAndExit($)
   $VM = CreateVMs()->GetItem($VM->GetKey());
   if ($VM->Status eq 'running')
   {
-    $VM->Status($Status eq 'queued' ? 'offline' : 'dirty');
+    $VM->Status($NewVMStatus);
     $VM->Save();
     RescheduleJobs();
   }
 
-  my $Result = $VM->Name .": ". $VM->Status ." Task: $Status";
+  my $Result = $VM->Name .": ". $VM->Status ." Status: $Status Failures: ". (defined $TestFailures ? $TestFailures : "unset");
   LogMsg "Task $JobId/$StepNo/$TaskNo done ($Result)\n";
   Debug(Elapsed($Start), " Done. $Result\n");
   exit($Status eq 'completed' ? 0 : 1);
 }
 
-sub FatalError($)
+sub FatalError($;$)
 {
-  my ($ErrMessage) = @_;
+  my ($ErrMessage, $Retry) = @_;
 
   LogMsg "$JobId/$StepNo/$TaskNo $ErrMessage";
   LogTaskError($ErrMessage);
 
-  WrapUpAndExit('boterror');
+  WrapUpAndExit('boterror', $Retry);
 }
 
 sub FatalTAError($$)
@@ -246,21 +270,27 @@ sub FatalTAError($$)
   if (!defined $IsPoweredOn)
   {
     # The VM host is not accessible anymore so mark the VM as offline and
-    # requeue the task.
+    # requeue the task. This does not count towards the task's tries limit
+    # since neither the VM nor the task are at fault.
     Error("$ErrMessage\n");
     WrapUpAndExit('queued');
   }
 
+  my $Retry;
   if ($IsPoweredOn)
   {
-    $ErrMessage .= " The test VM has crashed, rebooted or lost connectivity (or the TestAgent server died)\n";
+    LogMsg("$ErrMessage\n");
+    LogTaskError("$ErrMessage\n");
+    $ErrMessage = "The test VM has crashed, rebooted or lost connectivity (or the TestAgent server died)\n";
+    # Retry in case it was a temporary network glitch
+    $Retry = 1;
   }
   else
   {
     # Ignore the TestAgent error, it's irrelevant
     $ErrMessage = "The test VM is powered off!\n";
   }
-  FatalError($ErrMessage);
+  FatalError($ErrMessage, $Retry);
 }
 
 
@@ -394,7 +424,7 @@ if ($TA->GetFile("Build.log", $FullLogFileName))
   }
   else
   {
-    FatalError("Unable to open the build log for reading: $!\n");
+    FatalError("Unable to open the build log for reading: $!\n", "retry");
   }
 }
 elsif (!defined $TAError)
diff --git a/testbot/bin/WineRunReconfig.pl b/testbot/bin/WineRunReconfig.pl
index 135a5c38..3cadfe98 100755
--- a/testbot/bin/WineRunReconfig.pl
+++ b/testbot/bin/WineRunReconfig.pl
@@ -191,12 +191,37 @@ sub LogTaskError($)
   umask($OldUMask);
 }
 
-sub WrapUpAndExit($)
+sub WrapUpAndExit($;$)
 {
-  my ($Status) = @_;
+  my ($Status, $Retry) = @_;
+  my $NewVMStatus = $Status eq 'queued' ? 'offline' :
+                    $Status eq 'completed' ? 'idle' : 'dirty';
+
+  my $TestFailures;
+  my $Tries = $Task->TestFailures || 0;
+  if ($Retry)
+  {
+    # This may be a transient error (e.g. a network glitch)
+    # so retry a few times to improve robustness
+    $Tries++;
+    if ($Task->CanRetry())
+    {
+      $Status = 'queued';
+      $TestFailures = $Tries;
+    }
+    else
+    {
+      LogTaskError("Giving up after $Tries run(s)\n");
+    }
+  }
+  elsif ($Tries >= 1)
+  {
+    LogTaskError("The previous $Tries run(s) terminated abnormally\n");
+  }
 
   # Update the Task and Job
   $Task->Status($Status);
+  $Task->TestFailures($TestFailures);
   $Task->ChildPid(undef);
   if ($Status eq 'queued')
   {
@@ -215,25 +240,25 @@ sub WrapUpAndExit($)
   $VM = CreateVMs()->GetItem($VM->GetKey());
   if ($VM->Status eq 'running')
   {
-    $VM->Status($Status eq 'queued' ? 'offline' :
-                $Status eq 'completed' ? 'idle' : 'dirty');
+    $VM->Status($NewVMStatus);
     $VM->Save();
     RescheduleJobs();
   }
 
-  my $Result = $VM->Name .": ". $VM->Status ." Task: $Status";
+  my $Result = $VM->Name .": ". $VM->Status ." Status: $Status Failures: ". (defined $TestFailures ? $TestFailures : "unset");
   LogMsg "Task $JobId/$StepNo/$TaskNo done ($Result)\n";
   Debug(Elapsed($Start), " Done. $Result\n");
-  exit($Status eq 'completed' ? 0 : 1);}
+  exit($Status eq 'completed' ? 0 : 1);
+}
 
-sub FatalError($)
+sub FatalError($;$)
 {
-  my ($ErrMessage) = @_;
+  my ($ErrMessage, $Retry) = @_;
 
   LogMsg "$JobId/$StepNo/$TaskNo $ErrMessage";
   LogTaskError($ErrMessage);
 
-  WrapUpAndExit('boterror');
+  WrapUpAndExit('boterror', $Retry);
 }
 
 sub FatalTAError($$)
@@ -246,21 +271,27 @@ sub FatalTAError($$)
   if (!defined $IsPoweredOn)
   {
     # The VM host is not accessible anymore so mark the VM as offline and
-    # requeue the task.
+    # requeue the task. This does not count towards the task's tries limit
+    # since neither the VM nor the task are at fault.
     Error("$ErrMessage\n");
     WrapUpAndExit('queued');
   }
 
+  my $Retry;
   if ($IsPoweredOn)
   {
-    $ErrMessage .= " The test VM has crashed, rebooted or lost connectivity (or the TestAgent server died)\n";
+    LogMsg("$ErrMessage\n");
+    LogTaskError("$ErrMessage\n");
+    $ErrMessage = "The test VM has crashed, rebooted or lost connectivity (or the TestAgent server died)\n";
+    # Retry in case it was a temporary network glitch
+    $Retry = 1;
   }
   else
   {
     # Ignore the TestAgent error, it's irrelevant
     $ErrMessage = "The test VM is powered off!\n";
   }
-  FatalError($ErrMessage);
+  FatalError($ErrMessage, $Retry);
 }
 
 
@@ -420,7 +451,7 @@ if ($NewStatus eq 'completed')
   {
     # It's not clear if the snapshot is still usable. Rather than try to figure
     # it out now, let the next task deal with it.
-    FatalError("Could not remove the ". $VM->IdleSnapshot ." snapshot: $ErrMessage\n");
+    FatalError("Could not remove the ". $VM->IdleSnapshot ." snapshot: $ErrMessage\n", "retry");
   }
 
   Debug(Elapsed($Start), " Recreating the ", $VM->IdleSnapshot, " snapshot\n");
diff --git a/testbot/bin/WineRunTask.pl b/testbot/bin/WineRunTask.pl
index 22339b75..615d8986 100755
--- a/testbot/bin/WineRunTask.pl
+++ b/testbot/bin/WineRunTask.pl
@@ -217,9 +217,31 @@ sub LogTaskError($)
   umask($OldUMask);
 }
 
-sub WrapUpAndExit($;$)
+sub WrapUpAndExit($;$$)
 {
-  my ($Status, $TestFailures) = @_;
+  my ($Status, $TestFailures, $Retry) = @_;
+  my $NewVMStatus = $Status eq 'queued' ? 'offline' : 'dirty';
+
+  my $Tries = $Task->TestFailures || 0;
+  if ($Retry)
+  {
+    # This may be a transient error (e.g. a network glitch)
+    # so retry a few times to improve robustness
+    $Tries++;
+    if ($Task->CanRetry())
+    {
+      $Status = 'queued';
+      $TestFailures = $Tries;
+    }
+    else
+    {
+      LogTaskError("Giving up after $Tries run(s)\n");
+    }
+  }
+  elsif ($Tries >= 1)
+  {
+    LogTaskError("The previous $Tries run(s) terminated abnormally\n");
+  }
 
   # Update the Task and Job
   $Task->Status($Status);
@@ -242,7 +264,7 @@ sub WrapUpAndExit($;$)
   $VM = CreateVMs()->GetItem($VM->GetKey());
   if ($VM->Status eq 'running')
   {
-    $VM->Status($Status eq 'queued' ? 'offline' : 'dirty');
+    $VM->Status($NewVMStatus);
     $VM->Save();
     RescheduleJobs();
   }
@@ -258,20 +280,20 @@ sub WrapUpAndExit($;$)
     link($FullErrFileName, "$LatestBaseName.err") if (-f $FullErrFileName);
   }
 
-  my $Result = $VM->Name .": ". $VM->Status ." Task: $Status Failures: ". (defined $TestFailures ? $TestFailures : "unset");
+  my $Result = $VM->Name .": ". $VM->Status ." Status: $Status Failures: ". (defined $TestFailures ? $TestFailures : "unset");
   LogMsg "Task $JobId/$StepNo/$TaskNo done ($Result)\n";
   Debug(Elapsed($Start), " Done. $Result\n");
   exit($Status eq 'completed' ? 0 : 1);
 }
 
-sub FatalError($)
+sub FatalError($;$)
 {
-  my ($ErrMessage) = @_;
+  my ($ErrMessage, $Retry) = @_;
 
   LogMsg "$JobId/$StepNo/$TaskNo $ErrMessage";
   LogTaskError($ErrMessage);
 
-  WrapUpAndExit('boterror');
+  WrapUpAndExit('boterror', undef, $Retry);
 }
 
 sub FatalTAError($$;$)
@@ -284,31 +306,33 @@ sub FatalTAError($$;$)
   if (!defined $IsPoweredOn)
   {
     # The VM host is not accessible anymore so mark the VM as offline and
-    # requeue the task.
+    # requeue the task. This does not count towards the task's tries limit
+    # since neither the VM nor the task are at fault.
     Error("$ErrMessage\n");
     WrapUpAndExit('queued');
   }
 
-  my $VMState;
+  my $Retry;
   if ($IsPoweredOn)
   {
-    $VMState = "The test VM has crashed, rebooted or lost connectivity (or the TestAgent server died)\n";
+    LogMsg("$ErrMessage\n");
+    LogTaskError("$ErrMessage\n");
+    $ErrMessage = "The test VM has crashed, rebooted or lost connectivity (or the TestAgent server died)\n";
+    # Retry in case it was a temporary network glitch
+    $Retry = 1;
   }
   else
   {
-    $VMState = "The test VM is powered off! Did the test shut it down?\n";
     # Ignore the TestAgent error, it's irrelevant
-    $ErrMessage = "";
+    $ErrMessage = "The test VM is powered off! Did the test shut it down?\n";
   }
-  if ($PossibleCrash)
+  if ($PossibleCrash and !$Task->CanRetry())
   {
     # The test did it!
-    $ErrMessage .= "\n" if ($ErrMessage);
-    LogTaskError("$ErrMessage$VMState");
+    LogTaskError($ErrMessage);
     WrapUpAndExit('completed', 1);
   }
-  $ErrMessage .= " " if ($ErrMessage);
-  FatalError("$ErrMessage$VMState");
+  FatalError($ErrMessage, $Retry);
 }
 
 
diff --git a/testbot/lib/WineTestBot/Config.pm b/testbot/lib/WineTestBot/Config.pm
index 99114a50..0047770c 100644
--- a/testbot/lib/WineTestBot/Config.pm
+++ b/testbot/lib/WineTestBot/Config.pm
@@ -28,7 +28,8 @@ WineTestBot::Config - Site-independent configuration settings
 use vars qw (@ISA @EXPORT @EXPORT_OK $UseSSL $LogDir $DataDir $BinDir
              $DbDataSource $DbUsername $DbPassword $MaxRevertingVMs
              $MaxRevertsWhileRunningVMs $MaxActiveVMs $MaxVMsWhenIdle
-             $SleepAfterRevert $WaitForToolsInVM $AdminEMail $RobotEMail
+             $SleepAfterRevert $WaitForToolsInVM $MaxTaskTries $AdminEMail
+             $RobotEMail
              $WinePatchToOverride $WinePatchCc $SuiteTimeout $SingleTimeout
              $BuildTimeout $ReconfigTimeout $TagPrefix
              $ProjectName $PatchesMailingList $LDAPServer
@@ -41,7 +42,8 @@ require Exporter;
 @ISA = qw(Exporter);
 @EXPORT = qw($UseSSL $LogDir $DataDir $BinDir
              $MaxRevertingVMs $MaxRevertsWhileRunningVMs $MaxActiveVMs
-             $MaxVMsWhenIdle $SleepAfterRevert $WaitForToolsInVM $AdminEMail
+             $MaxVMsWhenIdle $SleepAfterRevert $WaitForToolsInVM
+             $MaxTaskTries $AdminEMail
              $RobotEMail $WinePatchToOverride $WinePatchCc $SuiteTimeout
              $SingleTimeout $BuildTimeout $ReconfigTimeout
              $TagPrefix $ProjectName $PatchesMailingList
@@ -70,6 +72,8 @@ $MaxVMsWhenIdle = undef;
 $SleepAfterRevert = 0;
 $WaitForToolsInVM = 30;
 
+$MaxTaskTries = 3;
+
 $SuiteTimeout = 30 * 60;
 $SingleTimeout = 2 * 60;
 $BuildTimeout = 5 * 60;
diff --git a/testbot/lib/WineTestBot/Tasks.pm b/testbot/lib/WineTestBot/Tasks.pm
index 34be66cc..900498a6 100644
--- a/testbot/lib/WineTestBot/Tasks.pm
+++ b/testbot/lib/WineTestBot/Tasks.pm
@@ -142,6 +142,12 @@ sub Run($$)
   return $ErrMessage;
 }
 
+sub CanRetry($)
+{
+  my ($self) = @_;
+  return ($self->TestFailures || 0) + 1 < $MaxTaskTries;
+}
+
 sub UpdateStatus($$)
 {
   my ($self, $Skip) = @_;
-- 
2.11.0



More information about the wine-patches mailing list