[Tools] testbot: Better identify and diagnose task errors.

Francois Gouget fgouget at codeweavers.com
Sun Feb 28 18:32:52 CST 2016


In particular when a TestAgent error occurs and the VM host is not accessible anymore, mark it as offline and requeue the task so it is re-run later on. Also check if the VM is still running and report accordingly.

Signed-off-by: Francois Gouget <fgouget at codeweavers.com>
---
 testbot/bin/WineRunBuild.pl    | 348 +++++++++++++++++++++----------------
 testbot/bin/WineRunReconfig.pl | 349 +++++++++++++++++++++----------------
 testbot/bin/WineRunTask.pl     | 380 ++++++++++++++++++++++++-----------------
 3 files changed, 632 insertions(+), 445 deletions(-)

diff --git a/testbot/bin/WineRunBuild.pl b/testbot/bin/WineRunBuild.pl
index 91244ad..f36e74a 100755
--- a/testbot/bin/WineRunBuild.pl
+++ b/testbot/bin/WineRunBuild.pl
@@ -4,7 +4,7 @@
 # See the bin/build/Build.pl script.
 #
 # Copyright 2009 Ge van Geldorp
-# Copyright 2013 Francois Gouget
+# Copyright 2013-2016 Francois Gouget
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -39,6 +39,8 @@ sub BEGIN
 my $Name0 = $0;
 $Name0 =~ s+^.*/++;
 
+use File::Path;
+
 use WineTestBot::Config;
 use WineTestBot::Jobs;
 use WineTestBot::VMs;
@@ -58,99 +60,14 @@ sub Error(@)
   LogMsg @_;
 }
 
-sub LogTaskError($$)
-{
-  my ($ErrMessage, $FullErrFileName) = @_;
-  Debug("$Name0:error: $ErrMessage");
-
-  my $OldUMask = umask(002);
-  if (open(my $ErrFile, ">>", $FullErrFileName))
-  {
-    umask($OldUMask);
-    print $ErrFile $ErrMessage;
-    close($ErrFile);
-  }
-  else
-  {
-    umask($OldUMask);
-    Error "Unable to open '$FullErrFileName' for writing: $!\n";
-  }
-}
-
-sub FatalError($$$$)
-{
-  my ($ErrMessage, $FullErrFileName, $Job, $Task) = @_;
-  Debug("$Name0:error: $ErrMessage");
-
-  my ($JobKey, $StepKey, $TaskKey) = @{$Task->GetMasterKey()};
-  LogMsg "$JobKey/$StepKey/$TaskKey $ErrMessage";
-
-  LogTaskError($ErrMessage, $FullErrFileName);
-  $Task->Status("boterror");
-  $Task->Ended(time);
-  $Task->Save();
-  $Job->UpdateStatus();
-
-  # Get the up-to-date VM status and update it if nobody else changed it
-  my $VM = CreateVMs()->GetItem($Task->VM->GetKey());
-  if ($VM->Status eq 'running')
-  {
-    $VM->Status('dirty');
-    $VM->Save();
-    RescheduleJobs();
-  }
-
-  exit 1;
-}
-
-sub ProcessLog($$)
-{
-  my ($FullLogFileName, $FullErrFileName) = @_;
-
-  my ($Status, $Errors);
-  if (open(my $LogFile, "<", $FullLogFileName))
-  {
-    # Collect and analyze the 'Build:' status line(s)
-    $Errors = "";
-    foreach my $Line (<$LogFile>)
-    {
-      chomp($Line);
-      next if ($Line !~ /^Build: (.*)$/);
-      if ($1 ne "ok")
-      {
-        $Errors .= "$1\n";
-        $Status = ($1 eq "Patch failed to apply") ? "badpatch" : "badbuild";
-      }
-      elsif (!defined $Status)
-      {
-        $Status = "completed";
-      }
-    }
-    close($LogFile);
-
-    if (!defined $Status)
-    {
-      $Status = "boterror";
-      $Errors = "Missing build status line\n";
-    }
-  }
-  else
-  {
-    $Status = "boterror";
-    $Errors = "Unable to open the log file\n";
-    Error "Unable to open '$FullLogFileName' for reading: $!\n";
-  }
-
-  LogTaskError($Errors, $FullErrFileName) if ($Errors);
-  return $Status;
-}
 
+#
+# Setup and command line processing
+#
 
 $ENV{PATH} = "/usr/bin:/bin";
 delete $ENV{ENV};
 
-
-# Grab the command line options
 my $Usage;
 sub ValidateNumber($$)
 {
@@ -242,23 +159,129 @@ mkdir "$DataDir/jobs/$JobId/$StepNo";
 mkdir "$DataDir/jobs/$JobId/$StepNo/$TaskNo";
 umask($OldUMask);
 
+my $VM = $Task->VM;
 my $StepDir = "$DataDir/jobs/$JobId/$StepNo";
 my $TaskDir = "$StepDir/$TaskNo";
 my $FullLogFileName = "$TaskDir/log";
 my $FullErrFileName = "$TaskDir/err";
 
-my $VM = $Task->VM;
+my $Start = Time();
+LogMsg "Task $JobId/$StepNo/$TaskNo started\n";
+
+
+#
+# Error handling helpers
+#
+
+sub LogTaskError($)
+{
+  my ($ErrMessage) = @_;
+  Debug("$Name0:error: ", $ErrMessage);
+
+  my $OldUMask = umask(002);
+  if (open(my $ErrFile, ">>", $FullErrFileName))
+  {
+    umask($OldUMask);
+    print $ErrFile $ErrMessage;
+    close($ErrFile);
+  }
+  else
+  {
+    umask($OldUMask);
+    Error "Unable to open '$FullErrFileName' for writing: $!\n";
+  }
+}
+
+sub WrapUpAndExit($)
+{
+  my ($Status) = @_;
+
+  # Update the Task and Job
+  $Task->Status($Status);
+  $Task->ChildPid(undef);
+  if ($Status eq 'queued')
+  {
+    $Task->Started(undef);
+    $Task->Ended(undef);
+    Error "Unable to delete '$TaskDir': $!\n" if (!rmtree($TaskDir));
+  }
+  else
+  {
+    $Task->Ended(time());
+  }
+  $Task->Save();
+  $Job->UpdateStatus();
+
+  # Get the up-to-date VM status and update it if nobody else changed it
+  $VM = CreateVMs()->GetItem($VM->GetKey());
+  if ($VM->Status eq 'running')
+  {
+    $VM->Status($Status eq 'queued' ? 'offline' : 'dirty');
+    $VM->Save();
+    RescheduleJobs();
+  }
+
+  my $Result = $VM->Name .": ". $VM->Status ." Task: $Status";
+  LogMsg "Task $JobId/$StepNo/$TaskNo done ($Result)\n";
+  Debug(Elapsed($Start), " Done. $Result\n");
+  exit($Status eq 'completed' ? 0 : 1);
+}
+
+sub FatalError($)
+{
+  my ($ErrMessage) = @_;
+
+  LogMsg "$JobId/$StepNo/$TaskNo $ErrMessage";
+  LogTaskError($ErrMessage);
+
+  WrapUpAndExit('boterror');
+}
+
+sub FatalTAError($$)
+{
+  my ($TA, $ErrMessage) = @_;
+  $ErrMessage .= ": ". $TA->GetLastError() if (defined $TA);
+
+  # A TestAgent operation failed, see if the VM is still accessible
+  my $IsPoweredOn = $VM->IsPoweredOn();
+  if (!defined $IsPoweredOn)
+  {
+    # The VM host is not accessible anymore so mark the VM as offline and
+    # requeue the task.
+    Error("$ErrMessage\n");
+    WrapUpAndExit('queued');
+  }
+
+  if ($IsPoweredOn)
+  {
+    $ErrMessage .= " The test VM has crashed, rebooted or lost connectivity (or the TestAgent server died)\n";
+  }
+  else
+  {
+    # Ignore the TestAgent error, it's irrelevant
+    $ErrMessage = "The test VM is powered off!\n";
+  }
+  FatalError($ErrMessage);
+}
+
+
+#
+# Check the VM
+#
+
 if (!$Debug and $VM->Status ne "running")
 {
-  FatalError "The VM is not ready for use (" . $VM->Status . ")\n",
-             $FullErrFileName, $Job, $Task;
+  FatalError("The VM is not ready for use (" . $VM->Status . ")\n");
 }
 elsif ($Debug and !$VM->IsPoweredOn)
 {
-  FatalError "The VM is not powered on\n", $FullErrFileName, $Job, $Task;
+  FatalError("The VM is not powered on\n");
 }
-my $Start = Time();
-LogMsg "Task $JobId/$StepNo/$TaskNo started\n";
+
+
+#
+# Figure out what to build
+#
 
 my ($Run64, $BaseName);
 foreach my $OtherStep (@{$Job->Steps->GetItems()})
@@ -273,26 +296,27 @@ foreach my $OtherStep (@{$Job->Steps->GetItems()})
     $OtherBaseName =~ s/\.exe$//;
     if (defined $BaseName and $BaseName ne $OtherBaseName)
     {
-      FatalError "$OtherBaseName doesn't match previously found $BaseName\n",
-                 $FullErrFileName, $Job, $Task;
+      FatalError("$OtherBaseName doesn't match previously found $BaseName\n");
     }
     $BaseName = $OtherBaseName;
   }
 }
 if (!defined $BaseName)
 {
-  FatalError "Can't determine base name\n", $FullErrFileName, $Job, $Task;
+  FatalError("Could not determine the test executable's base name\n");
 }
 
-my $ErrMessage;
+
+#
+# Run the build
+#
+
 my $FileName = $Step->FileName;
 my $TA = $VM->GetAgent();
 Debug(Elapsed($Start), " Sending '$StepDir/$FileName'\n");
 if (!$TA->SendFile("$StepDir/$FileName", "staging/patch.diff", 0))
 {
-  $ErrMessage = $TA->GetLastError();
-  FatalError "Could not copy the patch to the VM: $ErrMessage\n",
-             $FullErrFileName, $Job, $Task;
+  FatalTAError($TA, "Could not copy the patch to the VM");
 }
 my $Script = "#!/bin/sh\n" .
              "rm -f Build.log\n" .
@@ -303,44 +327,91 @@ $Script .= " >>Build.log 2>&1\n";
 Debug(Elapsed($Start), " Sending the script: [$Script]\n");
 if (!$TA->SendFileFromString($Script, "task", $TestAgent::SENDFILE_EXE))
 {
-  $ErrMessage = $TA->GetLastError();
-  FatalError "Can't send the build script to the VM: $ErrMessage\n",
-             $FullErrFileName, $Job, $Task;
+  FatalTAError($TA, "Coud not send the build script to the VM");
 }
-Debug(Elapsed($Start), " Running the script\n");
+
+Debug(Elapsed($Start), " Starting the script\n");
 my $Pid = $TA->Run(["./task"], 0);
-if (!$Pid or !defined $TA->Wait($Pid, $Task->Timeout, 60))
+if (!$Pid)
 {
-  $ErrMessage = $TA->GetLastError();
-  # Try to grab the build log before reporting the failure
-}
-my $NewStatus;
-Debug(Elapsed($Start), " Retrieving the build log '$FullLogFileName'\n");
-if ($TA->GetFile("Build.log", $FullLogFileName))
-{
-  $NewStatus = ProcessLog($FullLogFileName, $FullErrFileName);
+  FatalTAError($TA, "Failed to start the build");
 }
-elsif (!defined $ErrMessage)
+
+
+#
+# From that point on we want to at least try to grab the build
+# log before giving up
+#
+
+my ($NewStatus, $ErrMessage, $TAError);
+Debug(Elapsed($Start), " Waiting for the script (", $Task->Timeout, "s timeout)\n");
+if (!defined $TA->Wait($Pid, $Task->Timeout, 60))
 {
-  # This GetFile() error is the first one so report it
   $ErrMessage = $TA->GetLastError();
-  FatalError "Can't copy the build log from the VM: $ErrMessage\n",
-             $FullErrFileName, $Job, $Task;
-}
-if (defined $ErrMessage)
-{
-  # Now we can report the previous Run() / Wait() error
   if ($ErrMessage =~ /timed out waiting for the child process/)
   {
+    $ErrMessage = "The build timed out\n";
     $NewStatus = "badbuild";
-    LogTaskError("The build timed out\n", $FullErrFileName);
   }
   else
   {
-    FatalError "Failure running the build script in the VM: $ErrMessage\n",
-               $FullErrFileName, $Job, $Task;
+    $TAError = "An error occurred while waiting for the build to complete: $ErrMessage";
+    $ErrMessage = undef;
+  }
+}
+
+Debug(Elapsed($Start), " Retrieving the build log to '$FullLogFileName'\n");
+if ($TA->GetFile("Build.log", $FullLogFileName))
+{
+  if (open(my $LogFile, "<", $FullLogFileName))
+  {
+    # Collect and analyze the 'Build:' status line(s)
+    $ErrMessage ||= "";
+    foreach my $Line (<$LogFile>)
+    {
+      chomp($Line);
+      next if ($Line !~ /^Build: (.*)$/);
+      if ($1 eq "ok")
+      {
+        # We must have gotten the full log and the build did succeed.
+        # So forget any prior error.
+        $NewStatus = "completed";
+        $TAError = $ErrMessage = undef;
+      }
+      else
+      {
+        $NewStatus = ($1 eq "Patch failed to apply") ? "badpatch" : "badbuild";
+        # Collect all the build errors (32 bit, 64 bit, etc)
+        $ErrMessage .= "$1\n";
+      }
+    }
+    close($LogFile);
+
+    if (!defined $NewStatus)
+    {
+      $NewStatus = "badbuild";
+      $ErrMessage = "Missing build status line\n";
+    }
+  }
+  else
+  {
+    FatalError("Unable to open the build log for reading: $!\n");
   }
 }
+elsif (!defined $TAError)
+{
+  $TAError = "An error occurred while retrieving the build log: ". $TA->GetLastError();
+}
+
+# Report the build errors even though they may have been caused by
+# TestAgent trouble.
+LogTaskError($ErrMessage) if (defined $ErrMessage);
+FatalTAError(undef, $TAError) if (defined $TAError);
+
+
+#
+# Grab the executables for the next steps
+#
 
 # Don't try copying the test executables if the build step failed
 if ($NewStatus eq "completed")
@@ -365,32 +436,19 @@ if ($NewStatus eq "completed")
     {
       $TestExecutable = "build-mingw$Bits/programs/$BaseName/tests/${BaseName}.exe_test.exe";
     }
+    Debug(Elapsed($Start), " Retrieving '$OtherFileName'\n");
     if (!$TA->GetFile($TestExecutable, "$OtherStepDir/$OtherFileName"))
     {
-      $ErrMessage = $TA->GetLastError();
-      FatalError "Can't copy the generated executable from the VM: $ErrMessage\n",
-                 $FullErrFileName, $Job, $Task;
+      FatalTAError($TA, "Could not retrieve '$OtherFileName'");
     }
     chmod 0664, "$OtherStepDir/$OtherFileName";
   }
 }
 $TA->Disconnect();
 
-Debug(Elapsed($Start), " Done. New task status: $NewStatus\n");
-$Task->Status($NewStatus);
-$Task->ChildPid(undef);
-$Task->Ended(time);
-$Task->Save();
-$Job->UpdateStatus();
 
-# Get the up-to-date VM status and update it if nobody else changed it
-$VM = CreateVMs()->GetItem($VM->GetKey());
-if ($VM->Status eq 'running')
-{
-  $VM->Status('dirty');
-  $VM->Save();
-  RescheduleJobs();
-}
+#
+# Wrap up
+#
 
-LogMsg "Task $JobId/$StepNo/$TaskNo completed\n";
-exit 0;
+WrapUpAndExit($NewStatus);
diff --git a/testbot/bin/WineRunReconfig.pl b/testbot/bin/WineRunReconfig.pl
index 3af46c9..cc07359 100755
--- a/testbot/bin/WineRunReconfig.pl
+++ b/testbot/bin/WineRunReconfig.pl
@@ -4,7 +4,7 @@
 # See the bin/build/Reconfig.pl script.
 #
 # Copyright 2009 Ge van Geldorp
-# Copyright 2013 Francois Gouget
+# Copyright 2013-2016 Francois Gouget
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -39,6 +39,8 @@ sub BEGIN
 my $Name0 = $0;
 $Name0 =~ s+^.*/++;
 
+use File::Path;
+
 use WineTestBot::Config;
 use WineTestBot::Jobs;
 use WineTestBot::VMs;
@@ -58,100 +60,14 @@ sub Error(@)
   LogMsg @_;
 }
 
-sub LogTaskError($$)
-{
-  my ($ErrMessage, $FullErrFileName) = @_;
-  Debug("$Name0:error: $ErrMessage");
-
-  my $OldUMask = umask(002);
-  if (open(my $ErrFile, ">>", $FullErrFileName))
-  {
-    umask($OldUMask);
-    print $ErrFile $ErrMessage;
-    close($ErrFile);
-  }
-  else
-  {
-    umask($OldUMask);
-    Error "Unable to open '$FullErrFileName' for writing: $!\n";
-  }
-}
-
-sub FatalError($$$$)
-{
-  my ($ErrMessage, $FullErrFileName, $Job, $Task) = @_;
-  Debug("$Name0:error: $ErrMessage");
-
-  my ($JobKey, $StepKey, $TaskKey) = @{$Task->GetMasterKey()};
-  LogMsg "$JobKey/$StepKey/$TaskKey $ErrMessage";
-
-  LogTaskError($ErrMessage, $FullErrFileName);
-  $Task->Status("boterror");
-  $Task->Ended(time);
-  $Task->Save();
-  $Job->UpdateStatus();
-
-  # Get the up-to-date VM status and update it if nobody else changed it
-  my $VM = CreateVMs()->GetItem($Task->VM->GetKey());
-  if ($VM->Status eq 'running')
-  {
-    $VM->Status('dirty');
-    $VM->Save();
-    RescheduleJobs();
-  }
-
-  exit 1;
-}
-
-sub ProcessLog($$)
-{
-  my ($FullLogFileName, $FullErrFileName) = @_;
-
-  my ($Status, $Errors);
-  if (open(my $LogFile, "<", $FullLogFileName))
-  {
-    # Collect and analyze the 'Reconfig:' status line(s)
-    $Errors = "";
-    foreach my $Line (<$LogFile>)
-    {
-      chomp($Line);
-      next if ($Line !~ /^Reconfig: (.*)$/);
-      if ($1 ne "ok")
-      {
-        $Errors .= "$1\n";
-        $Status = "badbuild";
-      }
-      elsif (!defined $Status)
-      {
-        $Status = "completed";
-      }
-    }
-    close($LogFile);
-
-    if (!defined $Status)
-    {
-      $Status = "boterror";
-      $Errors = "Missing reconfig status line\n";
-    }
-  }
-  else
-  {
-    $Status = "boterror";
-    $Errors = "Unable to open the log file\n";
-    Error "Unable to open '$FullLogFileName' for reading: $!\n";
-  }
-
-  LogTaskError($Errors, $FullErrFileName) if ($Errors);
-  return $Status;
-}
 
+#
+# Setup and command line processing
+#
 
 $ENV{PATH} = "/usr/bin:/bin";
 delete $ENV{ENV};
 
-
-
-# Grab the command line options
 my $Usage;
 sub ValidateNumber($$)
 {
@@ -243,25 +159,130 @@ mkdir "$DataDir/jobs/$JobId/$StepNo";
 mkdir "$DataDir/jobs/$JobId/$StepNo/$TaskNo";
 umask($OldUMask);
 
+my $VM = $Task->VM;
 my $StepDir = "$DataDir/jobs/$JobId/$StepNo";
 my $TaskDir = "$StepDir/$TaskNo";
 my $FullLogFileName = "$TaskDir/log";
 my $FullErrFileName = "$TaskDir/err";
 
-my $VM = $Task->VM;
+my $Start = Time();
+LogMsg "Task $JobId/$StepNo/$TaskNo started\n";
+
+
+#
+# Error handling helpers
+#
+
+sub LogTaskError($)
+{
+  my ($ErrMessage) = @_;
+  Debug("$Name0:error: ", $ErrMessage);
+
+  my $OldUMask = umask(002);
+  if (open(my $ErrFile, ">>", $FullErrFileName))
+  {
+    umask($OldUMask);
+    print $ErrFile $ErrMessage;
+    close($ErrFile);
+  }
+  else
+  {
+    umask($OldUMask);
+    Error "Unable to open '$FullErrFileName' for writing: $!\n";
+  }
+}
+
+sub WrapUpAndExit($)
+{
+  my ($Status) = @_;
+
+  # Update the Task and Job
+  $Task->Status($Status);
+  $Task->ChildPid(undef);
+  if ($Status eq 'queued')
+  {
+    $Task->Started(undef);
+    $Task->Ended(undef);
+    Error "Unable to delete '$TaskDir': $!\n" if (!rmtree($TaskDir));
+  }
+  else
+  {
+    $Task->Ended(time());
+  }
+  $Task->Save();
+  $Job->UpdateStatus();
+
+  # Get the up-to-date VM status and update it if nobody else changed it
+  $VM = CreateVMs()->GetItem($VM->GetKey());
+  if ($VM->Status eq 'running')
+  {
+    $VM->Status($Status eq 'queued' ? 'offline' :
+                $Status eq 'completed' ? 'idle' : 'dirty');
+    $VM->Save();
+    RescheduleJobs();
+  }
+
+  my $Result = $VM->Name .": ". $VM->Status ." Task: $Status";
+  LogMsg "Task $JobId/$StepNo/$TaskNo done ($Result)\n";
+  Debug(Elapsed($Start), " Done. $Result\n");
+  exit($Status eq 'completed' ? 0 : 1);}
+
+sub FatalError($)
+{
+  my ($ErrMessage) = @_;
+
+  LogMsg "$JobId/$StepNo/$TaskNo $ErrMessage";
+  LogTaskError($ErrMessage);
+
+  WrapUpAndExit('boterror');
+}
+
+sub FatalTAError($$)
+{
+  my ($TA, $ErrMessage) = @_;
+  $ErrMessage .= ": ". $TA->GetLastError() if (defined $TA);
+
+  # A TestAgent operation failed, see if the VM is still accessible
+  my $IsPoweredOn = $VM->IsPoweredOn();
+  if (!defined $IsPoweredOn)
+  {
+    # The VM host is not accessible anymore so mark the VM as offline and
+    # requeue the task.
+    Error("$ErrMessage\n");
+    WrapUpAndExit('queued');
+  }
+
+  if ($IsPoweredOn)
+  {
+    $ErrMessage .= " The test VM has crashed, rebooted or lost connectivity (or the TestAgent server died)\n";
+  }
+  else
+  {
+    # Ignore the TestAgent error, it's irrelevant
+    $ErrMessage = "The test VM is powered off!\n";
+  }
+  FatalError($ErrMessage);
+}
+
+
+#
+# Check the VM
+#
+
 if (!$Debug and $VM->Status ne "running")
 {
-  FatalError "The VM is not ready for use (" . $VM->Status . ")\n",
-             $FullErrFileName, $Job, $Task;
+  FatalError("The VM is not ready for use (" . $VM->Status . ")\n");
 }
 elsif ($Debug and !$VM->IsPoweredOn)
 {
-  FatalError "The VM is not powered on\n", $FullErrFileName, $Job, $Task;
+  FatalError("The VM is not powered on\n");
 }
-my $Start = Time();
-LogMsg "Task $JobId/$StepNo/$TaskNo started\n";
 
-my $ErrMessage;
+
+#
+# Run the build
+#
+
 my $Script = "#!/bin/sh\n" .
              "rm -f Reconfig.log\n" .
              "../bin/build/Reconfig.pl >>Reconfig.log 2>&1\n";
@@ -269,84 +290,124 @@ my $TA = $VM->GetAgent();
 Debug(Elapsed($Start), " Sending the script: [$Script]\n");
 if (!$TA->SendFileFromString($Script, "task", $TestAgent::SENDFILE_EXE))
 {
-  $ErrMessage = $TA->GetLastError();
-  FatalError "Can't send the script to the VM: $ErrMessage\n",
-             $FullErrFileName, $Job, $Task;
+  FatalTAError($TA, "Could not send the reconfig script to the VM");
 }
-Debug(Elapsed($Start), " Running the script\n");
+
+Debug(Elapsed($Start), " Starting the script\n");
 my $Pid = $TA->Run(["./task"], 0);
-if (!$Pid or !defined $TA->Wait($Pid, $Task->Timeout, 60))
+if (!$Pid)
 {
-  $ErrMessage = $TA->GetLastError();
-  # Try to grab the reconfig log before reporting the failure
+  FatalTAError($TA, "Failed to start the build VM update");
 }
-my $NewStatus;
-Debug(Elapsed($Start), " Retrieving the reconfig log '$FullLogFileName'\n");
-if ($TA->GetFile("Reconfig.log", $FullLogFileName))
-{
-  $NewStatus = ProcessLog($FullLogFileName, $FullErrFileName);
-}
-elsif (!defined $ErrMessage)
+
+
+#
+# From that point on we want to at least try to grab the build
+# log before giving up
+#
+
+my ($NewStatus, $ErrMessage, $TAError);
+Debug(Elapsed($Start), " Waiting for the script (", $Task->Timeout, "s timeout)\n");
+if (!defined $TA->Wait($Pid, $Task->Timeout, 60))
 {
-  # This GetFile() error is the first one so report it
   $ErrMessage = $TA->GetLastError();
-  FatalError "Can't copy the reconfig log from the VM: $ErrMessage\n",
-             $FullErrFileName, $Job, $Task;
-}
-if (defined $ErrMessage)
-{
-  # Now we can report the previous Run() / Wait() error
   if ($ErrMessage =~ /timed out waiting for the child process/)
   {
+    $ErrMessage = "The build timed out\n";
     $NewStatus = "badbuild";
-    LogTaskError("The reconfig timed out\n", $FullErrFileName);
   }
   else
   {
-    FatalError "Could not run the reconfig script in the VM: $ErrMessage\n",
-               $FullErrFileName, $Job, $Task;
+    $TAError = "An error occurred while waiting for the build to complete: $ErrMessage";
+    $ErrMessage = undef;
   }
 }
 
+Debug(Elapsed($Start), " Retrieving the reconfig log to '$FullLogFileName'\n");
+if ($TA->GetFile("Reconfig.log", $FullLogFileName))
+{
+  if (open(my $LogFile, "<", $FullLogFileName))
+  {
+    # Collect and analyze the 'Reconfig:' status line(s).
+    my $LogErrors;
+    foreach my $Line (<$LogFile>)
+    {
+      chomp($Line);
+      next if ($Line !~ /^Reconfig: (.*)$/);
+      # Add the error message or an empty string for 'ok'
+      $LogErrors = ($LogErrors || "") . ($1 ne "ok" ? "$1\n" : "");
+    }
+    close($LogFile);
+
+    if (!defined $LogErrors)
+    {
+      if (!defined $ErrMessage)
+      {
+        $NewStatus = "badbuild";
+        $ErrMessage = "Missing reconfig status line\n";
+      }
+      # otherwise $ErrMessage probably already explains why the reconfig
+      # status line is missing
+    }
+    elsif ($LogErrors eq "")
+    {
+      # We must have gotten the full log and the build did succeed.
+      # So forget any prior error.
+      $NewStatus = "completed";
+      $TAError = $ErrMessage = undef;
+    }
+    else
+    {
+      $NewStatus = "badbuild";
+      $ErrMessage = $LogErrors . ($ErrMessage || "");
+    }
+  }
+  else
+  {
+    FatalError("Unable to open the build log for reading: $!\n");
+  }
+}
+elsif (!defined $TAError)
+{
+  $TAError = "An error occurred while retrieving the reconfig log: ". $TA->GetLastError();
+}
 $TA->Disconnect();
 
-if ($NewStatus eq "completed")
+# Report the reconfig errors even though they may have been caused by
+# TestAgent trouble.
+LogTaskError($ErrMessage) if (defined $ErrMessage);
+FatalTAError(undef, $TAError) if (defined $TAError);
+
+
+#
+# Update the build VM's snapshot
+#
+
+if ($NewStatus eq 'completed')
 {
   Debug(Elapsed($Start), " Deleting the old ", $VM->IdleSnapshot, " snapshot\n");
   $ErrMessage = $VM->RemoveSnapshot($VM->IdleSnapshot);
-  if (defined($ErrMessage))
+  if (defined $ErrMessage)
   {
-    FatalError "Can't remove snapshot: $ErrMessage\n",
-               $FullErrFileName, $Job, $Task;
+    # It's not clear if the snapshot is still usable. Rather than try to figure
+    # it out now, let the next task deal with it.
+    FatalError("Could not remove the ". $VM->IdleSnapshot ." snapshot: $ErrMessage\n");
   }
 
   Debug(Elapsed($Start), " Recreating the ", $VM->IdleSnapshot, " snapshot\n");
   $ErrMessage = $VM->CreateSnapshot($VM->IdleSnapshot);
-  if (defined($ErrMessage))
+  if (defined $ErrMessage)
   {
     # Without the snapshot the VM is not usable anymore but FatalError() will
     # just mark it as 'dirty'. It's only the next time it is used that the
     # problem will be noticed and that it will be taken offline.
-    FatalError "Can't take snapshot: $ErrMessage\n",
-               $FullErrFileName, $Job, $Task;
+    FatalError("Could not recreate the ". $VM->IdleSnapshot ." snapshot: $ErrMessage\n");
   }
 }
 
-Debug(Elapsed($Start), " Done. New task status: $NewStatus\n");
-$Task->Status($NewStatus);
-$Task->ChildPid(undef);
-$Task->Ended(time);
-$Task->Save();
-$Job->UpdateStatus();
 
-# Get the up-to-date VM status and update it if nobody else changed it
-$VM = CreateVMs()->GetItem($VM->GetKey());
-if ($VM->Status eq 'running')
-{
-  $VM->Status($NewStatus eq 'completed' ? 'idle' : 'dirty');
-  $VM->Save();
-  RescheduleJobs();
-}
+#
+# Wrap up
+#
 
-LogMsg "Task $JobId/$StepNo/$TaskNo completed\n";
-exit 0;
+WrapUpAndExit($NewStatus);
diff --git a/testbot/bin/WineRunTask.pl b/testbot/bin/WineRunTask.pl
index d82a156..d703d39 100755
--- a/testbot/bin/WineRunTask.pl
+++ b/testbot/bin/WineRunTask.pl
@@ -3,6 +3,7 @@
 # Sends and runs the tasks in the Windows test VMs.
 #
 # Copyright 2009 Ge van Geldorp
+# Copyright 2013-2016 Francois Gouget
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -37,6 +38,8 @@ sub BEGIN
 my $Name0 = $0;
 $Name0 =~ s+^.*/++;
 
+use File::Path;
+
 use WineTestBot::Config;
 use WineTestBot::Jobs;
 use WineTestBot::VMs;
@@ -56,60 +59,6 @@ sub Error(@)
   LogMsg @_;
 }
 
-sub LogTaskError($$)
-{
-  my ($ErrMessage, $FullErrFileName) = @_;
-  Debug("$Name0:error: $ErrMessage");
-
-  my $OldUMask = umask(002);
-  if (open(my $ErrFile, ">>", $FullErrFileName))
-  {
-    umask($OldUMask);
-    print $ErrFile $ErrMessage;
-    close($ErrFile);
-  }
-  else
-  {
-    umask($OldUMask);
-    Error "Unable to open '$FullErrFileName' for writing: $!\n";
-  }
-}
-
-sub FatalError($$$$$)
-{
-  my ($ErrMessage, $FullErrFileName, $Job, $Step, $Task) = @_;
-  Debug("$Name0:error: $ErrMessage");
-
-  my ($JobKey, $StepKey, $TaskKey) = @{$Task->GetMasterKey()};
-  LogMsg "$JobKey/$StepKey/$TaskKey $ErrMessage";
-
-  LogTaskError($ErrMessage, $FullErrFileName);
-  if ($Step->Type eq "suite")
-  {
-    # Link the test suite's results for future use in WineSendLog.pl.
-    my $LatestName = "$DataDir/latest/" . $Task->VM->Name . "_" .
-                     ($Step->FileType eq "exe64" ? "64" : "32") . ".err";
-    unlink($LatestName);
-    link($FullErrFileName, $LatestName);
-  }
-
-  $Task->Status("boterror");
-  $Task->Ended(time);
-  $Task->Save();
-  $Job->UpdateStatus();
-
-  # Get the up-to-date VM status and update it if nobody else changed it
-  my $VM = CreateVMs()->GetItem($Task->VM->GetKey());
-  if ($VM->Status eq 'running')
-  {
-    $VM->Status('dirty');
-    $VM->Save();
-    RescheduleJobs();
-  }
-
-  exit 1;
-}
-
 sub TakeScreenshot($$)
 {
   my ($VM, $FullScreenshotFileName) = @_;
@@ -136,38 +85,14 @@ sub TakeScreenshot($$)
   }
 }
 
-sub CountFailures($)
-{
-  my ($ReportFileName) = @_;
-
-  if (! open REPORTFILE, "<$ReportFileName")
-  {
-    return undef;
-  }
-
-  my $Failures;
-  my $Line;
-  while (defined($Line = <REPORTFILE>))
-  {
-    if ($Line =~ m/: \d+ tests? executed \(\d+ marked as todo, (\d+) failures?\), \d+ skipped\./)
-    {
-      $Failures += $1;
-    }
-    elsif ($Line =~ m/ done \(258\)/ ||
-           $Line =~ m/: unhandled exception [0-9a-fA-F]{8} at /)
-    {
-      $Failures++;
-    }
-  }
-  close REPORTFILE;
 
-  return $Failures;
-}
+#
+# Setup and command line processing
+#
 
 $ENV{PATH} = "/usr/bin:/bin";
 delete $ENV{ENV};
 
-# Grab the command line options
 my $Usage;
 sub ValidateNumber($$)
 {
@@ -267,50 +192,164 @@ my $FullLogFileName = "$TaskDir/log";
 my $FullErrFileName = "$TaskDir/err";
 my $FullScreenshotFileName = "$TaskDir/screenshot.png";
 
+my $Start = Time();
+LogMsg "Task $JobId/$StepNo/$TaskNo started\n";
+
+
+#
+# Error handling helpers
+#
+
+sub LogTaskError($)
+{
+  my ($ErrMessage) = @_;
+  Debug("$Name0:error: ", $ErrMessage);
+
+  my $OldUMask = umask(002);
+  if (open(my $ErrFile, ">>", $FullErrFileName))
+  {
+    umask($OldUMask);
+    print $ErrFile $ErrMessage;
+    close($ErrFile);
+  }
+  else
+  {
+    umask($OldUMask);
+    Error "Unable to open '$FullErrFileName' for writing: $!\n";
+  }
+}
+
+sub WrapUpAndExit($;$)
+{
+  my ($Status, $TestFailures) = @_;
+
+  # Update the Task and Job
+  $Task->Status($Status);
+  $Task->TestFailures($TestFailures);
+  $Task->ChildPid(undef);
+  if ($Status eq 'queued')
+  {
+    $Task->Started(undef);
+    $Task->Ended(undef);
+    Error "Unable to delete '$TaskDir': $!\n" if (!rmtree($TaskDir));
+  }
+  else
+  {
+    $Task->Ended(time());
+  }
+  $Task->Save();
+  $Job->UpdateStatus();
+
+  # Get the up-to-date VM status and update it if nobody else changed it
+  $VM = CreateVMs()->GetItem($VM->GetKey());
+  if ($VM->Status eq 'running')
+  {
+    $VM->Status($Status eq 'queued' ? 'offline' : 'dirty');
+    $VM->Save();
+    RescheduleJobs();
+  }
+
+  if ($Status eq 'completed' and $Step->Type eq 'suite')
+  {
+    # Update the reference VM suite results for WineSendLog.pl
+    my $LatestBaseName = join("", "$DataDir/latest/", $Task->VM->Name, "_",
+                              $Step->FileType eq "exe64" ? "64" : "32");
+    unlink("$LatestBaseName.log");
+    link($FullLogFileName, "$LatestBaseName.log") if (-f $FullLogFileName);
+    unlink("$LatestBaseName.err");
+    link($FullErrFileName, "$LatestBaseName.err") if (-f $FullErrFileName);
+  }
+
+  my $Result = $VM->Name .": ". $VM->Status ." Task: $Status Failures: ". (defined $TestFailures ? $TestFailures : "unset");
+  LogMsg "Task $JobId/$StepNo/$TaskNo done ($Result)\n";
+  Debug(Elapsed($Start), " Done. $Result\n");
+  exit($Status eq 'completed' ? 0 : 1);
+}
+
+sub FatalError($)
+{
+  my ($ErrMessage) = @_;
+
+  LogMsg "$JobId/$StepNo/$TaskNo $ErrMessage";
+  LogTaskError($ErrMessage);
+
+  WrapUpAndExit('boterror');
+}
+
+sub FatalTAError($$;$)
+{
+  my ($TA, $ErrMessage, $PossibleCrash) = @_;
+  $ErrMessage .= ": ". $TA->GetLastError() if (defined $TA);
+
+  # A TestAgent operation failed, see if the VM is still accessible
+  my $IsPoweredOn = $VM->IsPoweredOn();
+  if (!defined $IsPoweredOn)
+  {
+    # The VM host is not accessible anymore so mark the VM as offline and
+    # requeue the task.
+    Error("$ErrMessage\n");
+    WrapUpAndExit('queued');
+  }
+
+  my $VMState;
+  if ($IsPoweredOn)
+  {
+    $VMState = "The test VM has crashed, rebooted or lost connectivity (or the TestAgent server died)\n";
+  }
+  else
+  {
+    $VMState = "The test VM is powered off! Did the test shut it down?\n";
+    # Ignore the TestAgent error, it's irrelevant
+    $ErrMessage = "";
+  }
+  if ($PossibleCrash)
+  {
+    # The test did it!
+    $ErrMessage .= "\n" if ($ErrMessage);
+    LogTaskError("$ErrMessage$VMState");
+    WrapUpAndExit('completed', 1);
+  }
+  $ErrMessage .= " " if ($ErrMessage);
+  FatalError("$ErrMessage$VMState");
+}
+
+
+#
+# Check the VM
+#
+
 if (!$Debug and $VM->Status ne "running")
 {
-  FatalError "The VM is not ready for use (" . $VM->Status . ")\n",
-             $FullErrFileName, $Job, $Step, $Task;
+  FatalError("The VM is not ready for use (" . $VM->Status . ")\n");
 }
 elsif ($Debug and !$VM->IsPoweredOn)
 {
-  FatalError "The VM is not powered on\n", $FullErrFileName, $Job, $Step, $Task;
+  FatalError("The VM is not powered on\n");
 }
-LogMsg "Task $JobId/$StepNo/$TaskNo (" . $VM->Name . ") started\n";
 
-my $Start = Time();
-Debug("0.00 Setting the time\n");
+
+#
+# Setup the VM
+#
+
 my $TA = $VM->GetAgent();
+Debug(Elapsed($Start), " Setting the time\n");
 if (!$TA->SetTime())
 {
   # Not a fatal error
-  LogTaskError("Unable to set the VM system time: ". $TA->GetLastError() ."\n", $FullErrFileName);
+  LogTaskError("Unable to set the VM system time: ". $TA->GetLastError() .". Maybe the TestAgentd process is missing the required privileges.\n");
 }
 
-my $ErrMessage;
 my $FileType = $Step->FileType;
 if ($FileType ne "exe32" && $FileType ne "exe64")
 {
-  FatalError "Unexpected file type $FileType found\n",
-             $FullErrFileName, $Job, $Step, $Task;
+  FatalError("Unexpected file type $FileType found\n");
 }
 my $FileName = $Step->FileName;
 Debug(Elapsed($Start), " Sending '$StepDir/$FileName'\n");
 if (!$TA->SendFile("$StepDir/$FileName", $FileName, 0))
 {
-  $ErrMessage = $TA->GetLastError();
-  FatalError "Can't copy exe to VM: $ErrMessage\n",
-             $FullErrFileName, $Job, $Step, $Task;
-}
-my $TestLauncher = "TestLauncher" . 
-                   ($FileType eq "exe64" ? "64" : "32") .
-                   ".exe";
-Debug(Elapsed($Start), " Sending '$BinDir/windows/$TestLauncher'\n");
-if (!$TA->SendFile("$BinDir/windows/$TestLauncher", $TestLauncher, 0))
-{
-  $ErrMessage = $TA->GetLastError();
-  FatalError "Can't copy TestLauncher to VM: $ErrMessage\n",
-             $FullErrFileName, $Job, $Step, $Task;
+  FatalTAError($TA, "Could not copy the test executable to the VM");
 }
 
 my $Keepalive;
@@ -323,6 +362,13 @@ if ($Step->ReportSuccessfulTests)
 }
 if ($Step->Type eq "single")
 {
+  my $TestLauncher = "TestLauncher" . ($FileType eq "exe64" ? "64" : "32") . ".exe";
+  Debug(Elapsed($Start), " Sending '$BinDir/windows/$TestLauncher'\n");
+  if (!$TA->SendFile("$BinDir/windows/$TestLauncher", $TestLauncher, 0))
+  {
+    FatalTAError($TA, "Could not copy TestLauncher to the VM");
+  }
+
   $Script .= "$TestLauncher -t $Timeout $FileName ";
   # Add 1 second to the timeout so the client-side Wait() does not time out
   # right before $TestLauncher does.
@@ -371,80 +417,102 @@ elsif ($Step->Type eq "suite")
 Debug(Elapsed($Start), " Sending the script: [$Script]\n");
 if (!$TA->SendFileFromString($Script, "script.bat", $TestAgent::SENDFILE_EXE))
 {
-  $ErrMessage = $TA->GetLastError();
-  FatalError "Can't send the script to VM: $ErrMessage\n",
-             $FullErrFileName, $Job, $Step, $Task;
+  FatalTAError($TA, "Could not send the script to the VM");
 }
 
-Debug(Elapsed($Start), " Running the script\n");
+
+#
+# Run the test
+#
+
+Debug(Elapsed($Start), " Starting the script\n");
 my $Pid = $TA->Run(["./script.bat"], 0);
-if (!$Pid or !defined $TA->Wait($Pid, $Timeout, $Keepalive))
+if (!$Pid)
+{
+  FatalTAError($TA, "Failed to start the test");
+}
+
+
+#
+# From that point on we want to at least try to grab the test
+# log and a screenshot before giving up
+#
+
+my $NewStatus = 'completed';
+my ($TestFailures, $TAError, $PossibleCrash);
+Debug(Elapsed($Start), " Waiting for the script (", $Task->Timeout, "s timeout)\n");
+if (!defined $TA->Wait($Pid, $Timeout, $Keepalive))
 {
-  $ErrMessage = "Failure running script in VM: " . $TA->GetLastError();
+  my $ErrMessage = $TA->GetLastError();
+  if ($ErrMessage =~ /timed out waiting for the child process/)
+  {
+    LogTaskError("The test timed out\n");
+    $TestFailures = 1;
+  }
+  else
+  {
+    $PossibleCrash = 1;
+    $TAError = "An error occurred while waiting for the test to complete: $ErrMessage";
+  }
 }
 
-my $NewStatus = "boterror";
-Debug(Elapsed($Start), " Retrieving the report file '$FullLogFileName'\n");
+Debug(Elapsed($Start), " Retrieving the report file to '$FullLogFileName'\n");
 if ($TA->GetFile($RptFileName, $FullLogFileName))
 {
-  my $TestFailures = CountFailures($FullLogFileName);
-  if (!defined $TestFailures)
+  chmod 0664, $FullLogFileName;
+  if (open(my $LogFile, "<", $FullLogFileName))
   {
-    if (($ErrMessage || "") =~ /timed out waiting for the child process/)
+    my $LogFailures;
+    foreach my $Line (<$LogFile>)
     {
-      LogTaskError("The test timed out\n", $FullErrFileName);
-      $ErrMessage = undef;
+      # There may be more than one result line due to child processes
+      if ($Line =~ /: \d+ tests? executed \(\d+ marked as todo, (\d+) failures?\), \d+ skipped\./)
+      {
+        $LogFailures += $1;
+      }
+      elsif ($Line =~ / done \(258\)/ or
+             $Line =~ /: unhandled exception [0-9a-fA-F]{8} at /)
+      {
+        $LogFailures++;
+      }
     }
-    else
+    close($LogFile);
+    if (defined $LogFailures)
     {
-      LogTaskError("No test summary line found\n", $FullErrFileName);
+      $TestFailures += $LogFailures;
+      # The log file looks good so ignore the TestAgent error
+      # but still log it in case the log was in fact truncated.
+      LogTaskError($TAError) if (defined $TAError);
+      $TAError = undef;
+    }
+    elsif (!defined $LogFailures and !defined $TestFailures)
+    {
+      LogTaskError("Found no trace of the test summary line or of a crash\n");
+      $TestFailures = 1;
     }
-    $TestFailures = 1;
   }
-  $Task->TestFailures($TestFailures);
-  $NewStatus = "completed";
-
-  chmod 0664, $FullLogFileName;
-  if ($Step->Type eq "suite")
+  else
   {
-    # Link the test suite's results for future use in WineSendLog.pl.
-    my $LatestNameBase = "$DataDir/latest/" . $VM->Name . "_" .
-                         ($Step->FileType eq "exe64" ? "64" : "32");
-    unlink("${LatestNameBase}.log");
-    unlink("${LatestNameBase}.err");
-    link("$DataDir/jobs/" . $Job->Id . "/" . $Step->No . "/" . $Task->No . "/log",
-         "${LatestNameBase}.log");
+    $NewStatus = 'boterror';
+    Error "Unable to open '$FullLogFileName' for reading: $!\n";
+    LogTaskError("Unable to open the log file for reading: $!\n");
   }
 }
-elsif (!defined $ErrMessage)
+elsif (!defined $TAError)
 {
-  $ErrMessage = "Can't copy log from VM: " . $TA->GetLastError();
+  $TAError = "An error occurred while retrieving the test report: ". $TA->GetLastError();
 }
+$TA->Disconnect();
+
 
 Debug(Elapsed($Start), " Taking a screenshot\n");
-TakeScreenshot $VM, $FullScreenshotFileName;
-if (defined $ErrMessage)
-{
-  FatalError "$ErrMessage\n", $FullErrFileName, $Job, $Step, $Task;
-}
-$TA->Disconnect();
+TakeScreenshot($VM, $FullScreenshotFileName);
 
-Debug(Elapsed($Start), " Done. New task status: $NewStatus\n");
-$Task->Status($NewStatus);
-$Task->ChildPid(undef);
-$Task->Ended(time);
+FatalTAError(undef, $TAError, $PossibleCrash) if (defined $TAError);
 
-$Task->Save();
-$Job->UpdateStatus();
 
-# Get the up-to-date VM status and update it if nobody else changed it
-$VM = CreateVMs()->GetItem($VM->GetKey());
-if ($VM->Status eq 'running')
-{
-  $VM->Status('dirty');
-  $VM->Save();
-  RescheduleJobs();
-}
+#
+# Wrap up
+#
 
-LogMsg "Task $JobId/$StepNo/$TaskNo (" . $VM->Name . ") completed\n";
-exit 0;
+WrapUpAndExit($NewStatus, $TestFailures);
-- 
2.7.0



More information about the wine-patches mailing list