[4/5] testbot/TestAgent: Better handle connection timeouts.

Francois Gouget fgouget at codeweavers.com
Wed Mar 12 08:45:54 CDT 2014


The timeout can and does sometimes happen when connecting or authenticating to SSH or starting netcat, not just when creating the initial socket connection.
So apply the timeout and connection retries to the whole process.
This means RevertVM no longer needs its own attempt loop and the other scripts should no longer fail to connect where RevertVM succeeded.
Also let clients specify how many attempts to make in case they don't like the default of 3.
---

In my development environment the VMs and WineTestBot Engine are on the 
same machine so of course I never get any network issue. Even simple 
tests accessing the WineTestBot VMs over the Internet worked fine.

However, if I create a bit of traffic by sending a file to a VM over an 
SSH tunnel while attempting connections also over an SSH tunnel, then I 
start to see connection failures.

Specifically out of 600 connection attempts I got 45 connection 
timeouts, 43 of which happened while attempting to connect to SSH, 1 
while authenticating to SSH and 1 while starting netcat. All second 
attempts were successful in under 3 seconds (but probably not by much). 
So a 20 second connection timeout with three attempts should be 
reliable.

This should fix errors such as (from job 5610):

Unable to set the VM system time (): the "nc -q0 '10.42.42.238' '4242'" command returned 1
Can't copy exe to VM: the "nc -q0 '10.42.42.238' '4242'" command returned 1


Hopefully once the tunnel has been established the remaining 
transfers will follow more reliably.


 testbot/bin/RevertVM.pl              |   9 +-
 testbot/lib/WineTestBot/TestAgent.pm | 239 +++++++++++++++++++----------------
 2 files changed, 134 insertions(+), 114 deletions(-)

diff --git a/testbot/bin/RevertVM.pl b/testbot/bin/RevertVM.pl
index 2ce3be1..60a13f8 100755
--- a/testbot/bin/RevertVM.pl
+++ b/testbot/bin/RevertVM.pl
@@ -119,15 +119,10 @@ if (defined($ErrMessage))
   FatalError "Can't change status for VM $VMKey: $ErrMessage", $VM;
 }
 
-my $Success;
+LogMsg "Waiting for ", $VM->Name, " (up to ${WaitForToolsInVM}s per attempt)\n";
 my $TA = $VM->GetAgent();
 $TA->SetConnectTimeout($WaitForToolsInVM);
-foreach my $WaitCount (1..3)
-{
-  LogMsg "Waiting for ", $VM->Name, " (up to ${WaitForToolsInVM}s)\n";
-  $Success = $TA->Ping();
-  last if ($Success);
-}
+my $Success = $TA->Ping();
 $TA->Disconnect();
 if (!$Success)
 {
diff --git a/testbot/lib/WineTestBot/TestAgent.pm b/testbot/lib/WineTestBot/TestAgent.pm
index f810f2b..d17336e 100644
--- a/testbot/lib/WineTestBot/TestAgent.pm
+++ b/testbot/lib/WineTestBot/TestAgent.pm
@@ -95,7 +95,8 @@ sub new($$$;$)
     agentport  => $Port,
     port       => $Port,
     connection => "$Hostname:$Port",
-    ctimeout   => 30,
+    ctimeout   => 20,
+    cattempts  => 3,
     timeout    => 0,
     fd         => undef,
     deadline   => undef,
@@ -137,12 +138,21 @@ sub Disconnect($)
   $self->{agentversion} = undef;
 }
 
-sub SetConnectTimeout($$)
+sub SetConnectTimeout($$;$)
 {
-  my ($self, $Timeout) = @_;
-  my $OldTimeout = $self->{ctimeout};
-  $self->{ctimeout} = $Timeout;
-  return $OldTimeout;
+  my ($self, $Timeout, $Attempts) = @_;
+  my @Ret;
+  if (defined $Timeout)
+  {
+    push @Ret, $self->{ctimeout};
+    $self->{ctimeout} = $Timeout;
+  }
+  if (defined $Attempts)
+  {
+    push @Ret, $self->{cattempts};
+    $self->{cattempts} = $Attempts;
+  }
+  return @Ret;
 }
 
 sub SetTimeout($$)
@@ -820,125 +830,140 @@ sub _Connect($)
 {
   my ($self) = @_;
 
-  my $Err;
-  eval
+  my $Step;
+  foreach my $Dummy (1..$self->{cattempts})
   {
-    local $SIG{ALRM} = sub { die "timeout" };
-    $self->{deadline} = $self->{ctimeout} ? time() + $self->{ctimeout} : undef;
-    $self->_SetAlarm();
-
-    while (1)
+    eval
     {
+      local $SIG{ALRM} = sub { die "timeout" };
+      $self->{deadline} = $self->{ctimeout} ? time() + $self->{ctimeout} : undef;
+      $self->_SetAlarm();
+
+      $Step = "create_socket";
       $self->{fd} = &$create_socket(PeerHost => $self->{host},
                                     PeerPort => $self->{port},
                                     Type => SOCK_STREAM);
-      last if ($self->{fd});
-      $Err = $!;
-      # Ideally we should probably not retry on errors that are likely
-      # permanent, like a hostname that does not resolve. Instead we just
-      # rate-limit our connection attempts.
-      sleep(1);
-    }
-    alarm(0);
-  };
-  if (!$self->{fd})
-  {
-    $Err ||= $@;
-    $Err = "connection timed out" if ($Err =~ /^timeout /);
-    $self->_SetError($FATAL, "Unable to connect to $self->{connection}: $Err");
-    return undef;
-  }
+      if (!$self->{fd})
+      {
+        alarm(0);
+        $self->_SetError($FATAL, $!);
+        return; # out of eval
+      }
 
-  if ($self->{tunnel})
-  {
-    # We are in fact connected to the SSH server.
-    # Now forward that connection to the TestAgent server.
-    $self->{sshfd} = $self->{fd};
-    $self->{fd} = undef;
+      if ($self->{tunnel})
+      {
+        # We are in fact connected to the SSH server.
+        # Now forward that connection to the TestAgent server.
+        $Step = "tunnel_connect";
+        $self->{sshfd} = $self->{fd};
+        $self->{fd} = undef;
 
-    require Net::SSH2;
-    $self->{ssh} = Net::SSH2->new();
-    $self->{ssh}->debug(1) if ($Debug > 1);
+        require Net::SSH2;
+        $self->{ssh} = Net::SSH2->new();
+        $self->{ssh}->debug(1) if ($Debug > 1);
 
-    # Set up compression
-    $self->{ssh}->method('COMP_CS', 'zlib', 'none');
-    $self->{ssh}->method('COMP_SC', 'zlib', 'none');
+        # Set up compression
+        $self->{ssh}->method('COMP_CS', 'zlib', 'none');
+        $self->{ssh}->method('COMP_SC', 'zlib', 'none');
 
-    if (!$self->{ssh}->connect($self->{sshfd}))
-    {
-      $self->_SetError($FATAL, "Unable to connect to the SSH server: " . $self->_ssherror());
-      return undef;
-    }
+        if (!$self->{ssh}->connect($self->{sshfd}))
+        {
+          alarm(0);
+          $self->_SetError($FATAL, "Unable to connect to the SSH server: " . $self->_ssherror());
+          return; # out of eval
+        }
 
-    # Authenticate ourselves
-    my $Tunnel = $self->{tunnel};
-    my %AuthOptions=(username => $Tunnel->{username} || $ENV{USER});
-    foreach my $Key ("username", "password", "publickey", "privatekey",
-                     "hostname", "local_username")
-    {
-      $AuthOptions{$Key} = $Tunnel->{$Key} if (defined $Tunnel->{$Key});
-    }
-    # Old versions of Net::SSH2 won't automatically find DSA keys, and new ones
-    # still won't automatically find RSA ones.
-    if (defined $ENV{HOME} and !exists $AuthOptions{"privatekey"} and
-        !exists $AuthOptions{"publickey"})
-    {
-      foreach my $key ("id_dsa", "id_rsa")
-      {
-        if (-f "$ENV{HOME}/.ssh/$key" and -f "$ENV{HOME}/.ssh/$key.pub")
+        # Authenticate ourselves
+        $Step = "tunnel_auth";
+        my $Tunnel = $self->{tunnel};
+        my %AuthOptions=(username => $Tunnel->{username} || $ENV{USER});
+        foreach my $Key ("username", "password", "publickey", "privatekey",
+                         "hostname", "local_username")
         {
-          $AuthOptions{"privatekey"} = "$ENV{HOME}/.ssh/$key";
-          $AuthOptions{"publickey"} = "$ENV{HOME}/.ssh/$key.pub";
-          last;
+          $AuthOptions{$Key} = $Tunnel->{$Key} if (defined $Tunnel->{$Key});
+        }
+        # Old versions of Net::SSH2 won't automatically find DSA keys,
+        # and new ones still won't automatically find RSA ones.
+        if (defined $ENV{HOME} and !exists $AuthOptions{"privatekey"} and
+            !exists $AuthOptions{"publickey"})
+        {
+          foreach my $key ("id_dsa", "id_rsa")
+          {
+            if (-f "$ENV{HOME}/.ssh/$key" and -f "$ENV{HOME}/.ssh/$key.pub")
+            {
+              $AuthOptions{"privatekey"} = "$ENV{HOME}/.ssh/$key";
+              $AuthOptions{"publickey"} = "$ENV{HOME}/.ssh/$key.pub";
+              last;
+            }
+          }
+        }
+        # Interactive authentication makes no sense with automatic reconnects
+        $AuthOptions{interact} = 0;
+        if (!$self->{ssh}->auth(%AuthOptions))
+        {
+          alarm(0);
+          # auth() returns no error of any sort :-(
+          $self->_SetError($FATAL, "Unable to authenticate to the SSH server");
+          return; # out of eval
         }
-      }
-    }
-    # Interactive authentication makes no sense with automatic reconnects
-    $AuthOptions{interact} = 0;
-    if (!$self->{ssh}->auth(%AuthOptions))
-    {
-      # auth() returns no error of any sort :-(
-      $self->_SetError($FATAL, "Unable to authenticate to the SSH server");
-      return undef;
-    }
 
-    $self->{fd} = $self->{ssh}->channel();
-    if (!$self->{fd})
-    {
-      $self->_SetError($FATAL, "Unable to create the SSH channel: " . $self->_ssherror());
-      return undef;
-    }
+        $Step = "tunnel_channel";
+        $self->{fd} = $self->{ssh}->channel();
+        if (!$self->{fd})
+        {
+          alarm(0);
+          $self->_SetError($FATAL, "Unable to create the SSH channel: " . $self->_ssherror());
+          return; # out of eval
+        }
 
-    # Check that the agent hostname and port won't mess with quoting.
-    if ($self->{agenthost} !~ /^[-a-zA-Z0-9.]*$/ or
-        $self->{agentport} !~ /^[a-zA-Z0-9]*$/)
-    {
-      $self->_SetError($FATAL, "The agent hostname or port is invalid");
-      return undef;
-    }
+        # Check that the agent hostname and port won't mess with quoting.
+        if ($self->{agenthost} !~ /^[-a-zA-Z0-9.]*$/ or
+            $self->{agentport} !~ /^[a-zA-Z0-9]*$/)
+        {
+          alarm(0);
+          $self->_SetError($FATAL, "The agent hostname or port is invalid");
+          return; # out of eval
+        }
+
+        # Use netcat to forward the connection from the SSH server to the
+        # TestAgent server. Note that we won't know about netcat errors at
+        # this point.
+        $Step = "tunnel_netcat";
+        $self->{nc} = "nc -q0 '$self->{agenthost}' '$self->{agentport}'";
+        debug("Tunneling command: $self->{nc}\n");
+        if (!$self->{fd}->exec($self->{nc}))
+        {
+          alarm(0);
+          $self->_SetError($FATAL, "Unable to start netcat: " . $self->_ssherror());
+          return; # out of eval
+        }
+      }
+
+      # Get the protocol version supported by the server.
+      # This also lets us verify that the connection really works.
+      $Step = "agent_version";
+      $self->{agentversion} = $self->_RecvString();
+      if (!defined $self->{agentversion})
+      {
+        alarm(0);
+        # We have already been disconnected at this point
+        debug("could not get the protocol version spoken by the server\n");
+        return; # out of eval
+      }
 
-    # Use netcat to forward the connection from the SSH server to the TestAgent
-    # server. Note that we won't know about netcat errors at this point.
-    $self->{nc} = "nc -q0 '$self->{agenthost}' '$self->{agentport}'";
-    debug("Tunneling command: $self->{nc}\n");
-    if (!$self->{fd}->exec($self->{nc}))
+      alarm(0);
+      $Step = "done";
+    };
+    return 1 if ($Step eq "done");
+    if ($@)
     {
-      $self->_SetError($FATAL, "Unable to start netcat: " . $self->_ssherror());
-      return undef;
+      $self->_SetError($FATAL, "Timed out in $Step while connecting to $self->{connection}");
     }
+    # Ideally we should probably check the error and not retry if it is likely
+    # permanent, like a hostname that does not resolve.
+    sleep(1);
   }
-
-  # Get the protocol version supported by the server.
-  # This also lets us verify that the connection really works.
-  $self->{agentversion} = $self->_RecvString();
-  if (!defined $self->{agentversion})
-  {
-    # We have already been disconnected at this point
-    debug("could not get the protocol version spoken by the server\n");
-    return undef;
-  }
-
-  return 1;
+  return undef;
 }
 
 sub _StartRPC($$)
-- 
1.8.5.3




More information about the wine-patches mailing list