[Tools] testbot: Rerun the task in case of a TestBot error.
Francois Gouget
fgouget at codeweavers.com
Mon Jun 26 12:00:44 CDT 2017
This improves robustness in the face of network or virtualization
errors. Use the $MaxTaskTries setting to configure the maximum number
of attempts.
Note that this does not affect regular test failures, and thus flaky
tests; those should be handled in TestLauncher.
Signed-off-by: Francois Gouget <fgouget at codeweavers.com>
---
testbot/bin/WineRunBuild.pl | 52 +++++++++++++++++++++++++++--------
testbot/bin/WineRunReconfig.pl | 57 +++++++++++++++++++++++++++++---------
testbot/bin/WineRunTask.pl | 58 +++++++++++++++++++++++++++------------
testbot/lib/WineTestBot/Config.pm | 8 ++++--
testbot/lib/WineTestBot/Tasks.pm | 6 ++++
5 files changed, 138 insertions(+), 43 deletions(-)
diff --git a/testbot/bin/WineRunBuild.pl b/testbot/bin/WineRunBuild.pl
index 5dfb65f6..56a92f9f 100755
--- a/testbot/bin/WineRunBuild.pl
+++ b/testbot/bin/WineRunBuild.pl
@@ -191,12 +191,36 @@ sub LogTaskError($)
umask($OldUMask);
}
-sub WrapUpAndExit($)
+sub WrapUpAndExit($;$)
{
- my ($Status) = @_;
+ my ($Status, $Retry) = @_;
+ my $NewVMStatus = $Status eq 'queued' ? 'offline' : 'dirty';
+
+ my $TestFailures;
+ my $Tries = $Task->TestFailures || 0;
+ if ($Retry)
+ {
+ # This may be a transient error (e.g. a network glitch)
+ # so retry a few times to improve robustness
+ $Tries++;
+ if ($Task->CanRetry())
+ {
+ $Status = 'queued';
+ $TestFailures = $Tries;
+ }
+ else
+ {
+ LogTaskError("Giving up after $Tries run(s)\n");
+ }
+ }
+ elsif ($Tries >= 1)
+ {
+ LogTaskError("The previous $Tries run(s) terminated abnormally\n");
+ }
# Update the Task and Job
$Task->Status($Status);
+ $Task->TestFailures($TestFailures);
$Task->ChildPid(undef);
if ($Status eq 'queued')
{
@@ -215,25 +239,25 @@ sub WrapUpAndExit($)
$VM = CreateVMs()->GetItem($VM->GetKey());
if ($VM->Status eq 'running')
{
- $VM->Status($Status eq 'queued' ? 'offline' : 'dirty');
+ $VM->Status($NewVMStatus);
$VM->Save();
RescheduleJobs();
}
- my $Result = $VM->Name .": ". $VM->Status ." Task: $Status";
+ my $Result = $VM->Name .": ". $VM->Status ." Status: $Status Failures: ". (defined $TestFailures ? $TestFailures : "unset");
LogMsg "Task $JobId/$StepNo/$TaskNo done ($Result)\n";
Debug(Elapsed($Start), " Done. $Result\n");
exit($Status eq 'completed' ? 0 : 1);
}
-sub FatalError($)
+sub FatalError($;$)
{
- my ($ErrMessage) = @_;
+ my ($ErrMessage, $Retry) = @_;
LogMsg "$JobId/$StepNo/$TaskNo $ErrMessage";
LogTaskError($ErrMessage);
- WrapUpAndExit('boterror');
+ WrapUpAndExit('boterror', $Retry);
}
sub FatalTAError($$)
@@ -246,21 +270,27 @@ sub FatalTAError($$)
if (!defined $IsPoweredOn)
{
# The VM host is not accessible anymore so mark the VM as offline and
- # requeue the task.
+ # requeue the task. This does not count towards the task's tries limit
+ # since neither the VM nor the task are at fault.
Error("$ErrMessage\n");
WrapUpAndExit('queued');
}
+ my $Retry;
if ($IsPoweredOn)
{
- $ErrMessage .= " The test VM has crashed, rebooted or lost connectivity (or the TestAgent server died)\n";
+ LogMsg("$ErrMessage\n");
+ LogTaskError("$ErrMessage\n");
+ $ErrMessage = "The test VM has crashed, rebooted or lost connectivity (or the TestAgent server died)\n";
+ # Retry in case it was a temporary network glitch
+ $Retry = 1;
}
else
{
# Ignore the TestAgent error, it's irrelevant
$ErrMessage = "The test VM is powered off!\n";
}
- FatalError($ErrMessage);
+ FatalError($ErrMessage, $Retry);
}
@@ -394,7 +424,7 @@ if ($TA->GetFile("Build.log", $FullLogFileName))
}
else
{
- FatalError("Unable to open the build log for reading: $!\n");
+ FatalError("Unable to open the build log for reading: $!\n", "retry");
}
}
elsif (!defined $TAError)
diff --git a/testbot/bin/WineRunReconfig.pl b/testbot/bin/WineRunReconfig.pl
index 135a5c38..3cadfe98 100755
--- a/testbot/bin/WineRunReconfig.pl
+++ b/testbot/bin/WineRunReconfig.pl
@@ -191,12 +191,37 @@ sub LogTaskError($)
umask($OldUMask);
}
-sub WrapUpAndExit($)
+sub WrapUpAndExit($;$)
{
- my ($Status) = @_;
+ my ($Status, $Retry) = @_;
+ my $NewVMStatus = $Status eq 'queued' ? 'offline' :
+ $Status eq 'completed' ? 'idle' : 'dirty';
+
+ my $TestFailures;
+ my $Tries = $Task->TestFailures || 0;
+ if ($Retry)
+ {
+ # This may be a transient error (e.g. a network glitch)
+ # so retry a few times to improve robustness
+ $Tries++;
+ if ($Task->CanRetry())
+ {
+ $Status = 'queued';
+ $TestFailures = $Tries;
+ }
+ else
+ {
+ LogTaskError("Giving up after $Tries run(s)\n");
+ }
+ }
+ elsif ($Tries >= 1)
+ {
+ LogTaskError("The previous $Tries run(s) terminated abnormally\n");
+ }
# Update the Task and Job
$Task->Status($Status);
+ $Task->TestFailures($TestFailures);
$Task->ChildPid(undef);
if ($Status eq 'queued')
{
@@ -215,25 +240,25 @@ sub WrapUpAndExit($)
$VM = CreateVMs()->GetItem($VM->GetKey());
if ($VM->Status eq 'running')
{
- $VM->Status($Status eq 'queued' ? 'offline' :
- $Status eq 'completed' ? 'idle' : 'dirty');
+ $VM->Status($NewVMStatus);
$VM->Save();
RescheduleJobs();
}
- my $Result = $VM->Name .": ". $VM->Status ." Task: $Status";
+ my $Result = $VM->Name .": ". $VM->Status ." Status: $Status Failures: ". (defined $TestFailures ? $TestFailures : "unset");
LogMsg "Task $JobId/$StepNo/$TaskNo done ($Result)\n";
Debug(Elapsed($Start), " Done. $Result\n");
- exit($Status eq 'completed' ? 0 : 1);}
+ exit($Status eq 'completed' ? 0 : 1);
+}
-sub FatalError($)
+sub FatalError($;$)
{
- my ($ErrMessage) = @_;
+ my ($ErrMessage, $Retry) = @_;
LogMsg "$JobId/$StepNo/$TaskNo $ErrMessage";
LogTaskError($ErrMessage);
- WrapUpAndExit('boterror');
+ WrapUpAndExit('boterror', $Retry);
}
sub FatalTAError($$)
@@ -246,21 +271,27 @@ sub FatalTAError($$)
if (!defined $IsPoweredOn)
{
# The VM host is not accessible anymore so mark the VM as offline and
- # requeue the task.
+ # requeue the task. This does not count towards the task's tries limit
+ # since neither the VM nor the task are at fault.
Error("$ErrMessage\n");
WrapUpAndExit('queued');
}
+ my $Retry;
if ($IsPoweredOn)
{
- $ErrMessage .= " The test VM has crashed, rebooted or lost connectivity (or the TestAgent server died)\n";
+ LogMsg("$ErrMessage\n");
+ LogTaskError("$ErrMessage\n");
+ $ErrMessage = "The test VM has crashed, rebooted or lost connectivity (or the TestAgent server died)\n";
+ # Retry in case it was a temporary network glitch
+ $Retry = 1;
}
else
{
# Ignore the TestAgent error, it's irrelevant
$ErrMessage = "The test VM is powered off!\n";
}
- FatalError($ErrMessage);
+ FatalError($ErrMessage, $Retry);
}
@@ -420,7 +451,7 @@ if ($NewStatus eq 'completed')
{
# It's not clear if the snapshot is still usable. Rather than try to figure
# it out now, let the next task deal with it.
- FatalError("Could not remove the ". $VM->IdleSnapshot ." snapshot: $ErrMessage\n");
+ FatalError("Could not remove the ". $VM->IdleSnapshot ." snapshot: $ErrMessage\n", "retry");
}
Debug(Elapsed($Start), " Recreating the ", $VM->IdleSnapshot, " snapshot\n");
diff --git a/testbot/bin/WineRunTask.pl b/testbot/bin/WineRunTask.pl
index 22339b75..615d8986 100755
--- a/testbot/bin/WineRunTask.pl
+++ b/testbot/bin/WineRunTask.pl
@@ -217,9 +217,31 @@ sub LogTaskError($)
umask($OldUMask);
}
-sub WrapUpAndExit($;$)
+sub WrapUpAndExit($;$$)
{
- my ($Status, $TestFailures) = @_;
+ my ($Status, $TestFailures, $Retry) = @_;
+ my $NewVMStatus = $Status eq 'queued' ? 'offline' : 'dirty';
+
+ my $Tries = $Task->TestFailures || 0;
+ if ($Retry)
+ {
+ # This may be a transient error (e.g. a network glitch)
+ # so retry a few times to improve robustness
+ $Tries++;
+ if ($Task->CanRetry())
+ {
+ $Status = 'queued';
+ $TestFailures = $Tries;
+ }
+ else
+ {
+ LogTaskError("Giving up after $Tries run(s)\n");
+ }
+ }
+ elsif ($Tries >= 1)
+ {
+ LogTaskError("The previous $Tries run(s) terminated abnormally\n");
+ }
# Update the Task and Job
$Task->Status($Status);
@@ -242,7 +264,7 @@ sub WrapUpAndExit($;$)
$VM = CreateVMs()->GetItem($VM->GetKey());
if ($VM->Status eq 'running')
{
- $VM->Status($Status eq 'queued' ? 'offline' : 'dirty');
+ $VM->Status($NewVMStatus);
$VM->Save();
RescheduleJobs();
}
@@ -258,20 +280,20 @@ sub WrapUpAndExit($;$)
link($FullErrFileName, "$LatestBaseName.err") if (-f $FullErrFileName);
}
- my $Result = $VM->Name .": ". $VM->Status ." Task: $Status Failures: ". (defined $TestFailures ? $TestFailures : "unset");
+ my $Result = $VM->Name .": ". $VM->Status ." Status: $Status Failures: ". (defined $TestFailures ? $TestFailures : "unset");
LogMsg "Task $JobId/$StepNo/$TaskNo done ($Result)\n";
Debug(Elapsed($Start), " Done. $Result\n");
exit($Status eq 'completed' ? 0 : 1);
}
-sub FatalError($)
+sub FatalError($;$)
{
- my ($ErrMessage) = @_;
+ my ($ErrMessage, $Retry) = @_;
LogMsg "$JobId/$StepNo/$TaskNo $ErrMessage";
LogTaskError($ErrMessage);
- WrapUpAndExit('boterror');
+ WrapUpAndExit('boterror', undef, $Retry);
}
sub FatalTAError($$;$)
@@ -284,31 +306,33 @@ sub FatalTAError($$;$)
if (!defined $IsPoweredOn)
{
# The VM host is not accessible anymore so mark the VM as offline and
- # requeue the task.
+ # requeue the task. This does not count towards the task's tries limit
+ # since neither the VM nor the task are at fault.
Error("$ErrMessage\n");
WrapUpAndExit('queued');
}
- my $VMState;
+ my $Retry;
if ($IsPoweredOn)
{
- $VMState = "The test VM has crashed, rebooted or lost connectivity (or the TestAgent server died)\n";
+ LogMsg("$ErrMessage\n");
+ LogTaskError("$ErrMessage\n");
+ $ErrMessage = "The test VM has crashed, rebooted or lost connectivity (or the TestAgent server died)\n";
+ # Retry in case it was a temporary network glitch
+ $Retry = 1;
}
else
{
- $VMState = "The test VM is powered off! Did the test shut it down?\n";
# Ignore the TestAgent error, it's irrelevant
- $ErrMessage = "";
+ $ErrMessage = "The test VM is powered off! Did the test shut it down?\n";
}
- if ($PossibleCrash)
+ if ($PossibleCrash and !$Task->CanRetry())
{
# The test did it!
- $ErrMessage .= "\n" if ($ErrMessage);
- LogTaskError("$ErrMessage$VMState");
+ LogTaskError($ErrMessage);
WrapUpAndExit('completed', 1);
}
- $ErrMessage .= " " if ($ErrMessage);
- FatalError("$ErrMessage$VMState");
+ FatalError($ErrMessage, $Retry);
}
diff --git a/testbot/lib/WineTestBot/Config.pm b/testbot/lib/WineTestBot/Config.pm
index 99114a50..0047770c 100644
--- a/testbot/lib/WineTestBot/Config.pm
+++ b/testbot/lib/WineTestBot/Config.pm
@@ -28,7 +28,8 @@ WineTestBot::Config - Site-independent configuration settings
use vars qw (@ISA @EXPORT @EXPORT_OK $UseSSL $LogDir $DataDir $BinDir
$DbDataSource $DbUsername $DbPassword $MaxRevertingVMs
$MaxRevertsWhileRunningVMs $MaxActiveVMs $MaxVMsWhenIdle
- $SleepAfterRevert $WaitForToolsInVM $AdminEMail $RobotEMail
+ $SleepAfterRevert $WaitForToolsInVM $MaxTaskTries $AdminEMail
+ $RobotEMail
$WinePatchToOverride $WinePatchCc $SuiteTimeout $SingleTimeout
$BuildTimeout $ReconfigTimeout $TagPrefix
$ProjectName $PatchesMailingList $LDAPServer
@@ -41,7 +42,8 @@ require Exporter;
@ISA = qw(Exporter);
@EXPORT = qw($UseSSL $LogDir $DataDir $BinDir
$MaxRevertingVMs $MaxRevertsWhileRunningVMs $MaxActiveVMs
- $MaxVMsWhenIdle $SleepAfterRevert $WaitForToolsInVM $AdminEMail
+ $MaxVMsWhenIdle $SleepAfterRevert $WaitForToolsInVM
+ $MaxTaskTries $AdminEMail
$RobotEMail $WinePatchToOverride $WinePatchCc $SuiteTimeout
$SingleTimeout $BuildTimeout $ReconfigTimeout
$TagPrefix $ProjectName $PatchesMailingList
@@ -70,6 +72,8 @@ $MaxVMsWhenIdle = undef;
$SleepAfterRevert = 0;
$WaitForToolsInVM = 30;
+$MaxTaskTries = 3;
+
$SuiteTimeout = 30 * 60;
$SingleTimeout = 2 * 60;
$BuildTimeout = 5 * 60;
diff --git a/testbot/lib/WineTestBot/Tasks.pm b/testbot/lib/WineTestBot/Tasks.pm
index 34be66cc..900498a6 100644
--- a/testbot/lib/WineTestBot/Tasks.pm
+++ b/testbot/lib/WineTestBot/Tasks.pm
@@ -142,6 +142,12 @@ sub Run($$)
return $ErrMessage;
}
+sub CanRetry($)
+{
+ my ($self) = @_;
+ return ($self->TestFailures || 0) + 1 < $MaxTaskTries;
+}
+
sub UpdateStatus($$)
{
my ($self, $Skip) = @_;
--
2.11.0
More information about the wine-patches
mailing list