using Renci.SshNet;
using OTSSignsOrchestrator.Server.Data.Entities;
namespace OTSSignsOrchestrator.Server.Health.Checks;
/// <summary>
/// Verifies the Docker stack is healthy by running <c>docker stack ps {stackName}</c>
/// via SSH and checking that every task whose desired state is Running is
/// actually reported as Running.
/// </summary>
public sealed class StackHealthCheck : IHealthCheck
{
    private readonly IServiceProvider _services;
    private readonly ILogger<StackHealthCheck> _logger;

    /// <summary>Identifier under which this check reports its results.</summary>
    public string CheckName => "StackHealth";

    /// <summary>This check only reports; it never attempts remediation.</summary>
    public bool AutoRemediate => false;

    /// <summary>Creates the health check.</summary>
    /// <param name="services">Provider used to resolve the settings service on each run.</param>
    /// <param name="logger">Logger that receives details of failed checks.</param>
    public StackHealthCheck(
        IServiceProvider services,
        ILogger<StackHealthCheck> logger)
    {
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// Connects to the configured Swarm host over SSH, lists the tasks of the
    /// instance's Docker stack and reports any task that should be running but is not.
    /// </summary>
    /// <param name="instance">Instance whose <c>DockerStackName</c> is inspected.</param>
    /// <param name="ct">Token checked before the (blocking) SSH work is started.</param>
    /// <returns>
    /// Healthy when no task with desired state Running is in another state;
    /// Critical when the stack name is missing, the SSH check fails, or at least
    /// one such task is not running.
    /// </returns>
    /// <exception cref="OperationCanceledException">
    /// Thrown when <paramref name="ct"/> was cancelled by the caller.
    /// </exception>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        var stackName = instance.DockerStackName;
        if (string.IsNullOrEmpty(stackName))
        {
            return new HealthCheckResult(HealthStatus.Critical, "No Docker stack name configured");
        }

        // Key files and authentication methods are IDisposable but are not owned
        // by the SshClient, so they are tracked here and released explicitly.
        var ownedResources = new List<IDisposable>();
        try
        {
            ct.ThrowIfCancellationRequested();

            var settings = _services.GetRequiredService<Core.Services.SettingsService>();
            var sshInfo = await GetSwarmSshHostAsync(settings);

            // The SSH calls below are synchronous; this is the last point at
            // which cancellation can be observed cheaply.
            ct.ThrowIfCancellationRequested();

            using var sshClient = CreateSshClient(sshInfo, ownedResources);
            sshClient.Connect();
            try
            {
                // Get task status for all services in the stack. The stack name is
                // shell-quoted so that it can never be interpreted as shell syntax.
                var output = RunSshCommand(sshClient,
                    $"docker stack ps {QuoteForShell(stackName)} --format '{{{{.Name}}}}|{{{{.CurrentState}}}}|{{{{.DesiredState}}}}'");

                var notRunning = FindTasksNotRunning(output);
                if (notRunning.Count == 0)
                {
                    return new HealthCheckResult(HealthStatus.Healthy,
                        $"All services in {stackName} are Running");
                }

                return new HealthCheckResult(HealthStatus.Critical,
                    $"{notRunning.Count} service(s) not running in {stackName}",
                    string.Join("\n", notRunning));
            }
            finally
            {
                sshClient.Disconnect();
            }
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // Caller-requested cancellation is not a health failure.
            throw;
        }
        catch (Exception ex)
        {
            // Keep the full exception in the log; the result only carries the message.
            _logger.LogWarning(ex, "Stack health check failed for {StackName}", stackName);
            return new HealthCheckResult(HealthStatus.Critical,
                $"SSH check failed for {stackName}: {ex.Message}");
        }
        finally
        {
            foreach (var resource in ownedResources)
            {
                resource.Dispose();
            }
        }
    }

    /// <summary>
    /// Parses <c>Name|CurrentState|DesiredState</c> lines and collects the tasks
    /// whose desired state is Running but whose current state does not start with Running.
    /// </summary>
    /// <param name="output">Raw command output, one task per line.</param>
    /// <returns>One <c>"name: currentState"</c> entry per offending task.</returns>
    private static List<string> FindTasksNotRunning(string output)
    {
        var notRunning = new List<string>();
        foreach (var line in output.Split('\n', StringSplitOptions.RemoveEmptyEntries))
        {
            var parts = line.Split('|');
            if (parts.Length < 3)
            {
                continue; // not a task line in the expected format
            }

            var name = parts[0].Trim();
            var currentState = parts[1].Trim();
            var desiredState = parts[2].Trim();

            // Only check tasks whose desired state is Running
            if (desiredState.Equals("Running", StringComparison.OrdinalIgnoreCase) &&
                !currentState.StartsWith("Running", StringComparison.OrdinalIgnoreCase))
            {
                notRunning.Add($"{name}: {currentState}");
            }
        }

        return notRunning;
    }

    /// <summary>
    /// Wraps a value in single quotes for a POSIX shell, escaping embedded single quotes.
    /// </summary>
    private static string QuoteForShell(string value)
        => "'" + value.Replace("'", "'\\''") + "'";

    /// <summary>Reads the Swarm SSH connection settings.</summary>
    /// <param name="settings">Settings service queried for the <c>Ssh.Swarm*</c> keys.</param>
    /// <returns>Connection details; the port falls back to 22 when it cannot be parsed.</returns>
    /// <exception cref="InvalidOperationException">No Swarm host is configured.</exception>
    private static async Task<SshConnectionInfo> GetSwarmSshHostAsync(Core.Services.SettingsService settings)
    {
        var host = await settings.GetAsync("Ssh.SwarmHost")
            ?? throw new InvalidOperationException("SSH Swarm host not configured.");
        var portStr = await settings.GetAsync("Ssh.SwarmPort", "22");
        var user = await settings.GetAsync("Ssh.SwarmUser", "root");
        var keyPath = await settings.GetAsync("Ssh.SwarmKeyPath");
        var password = await settings.GetAsync("Ssh.SwarmPassword");

        // The port is machine-readable configuration, so parse it culture-independently.
        if (!int.TryParse(portStr, System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture, out var port))
        {
            port = 22;
        }

        return new SshConnectionInfo(host, port, user, keyPath, password);
    }

    /// <summary>
    /// Builds an SSH client using, in order, the configured private key, the
    /// configured password, or (when neither is set) <c>~/.ssh/id_rsa</c>.
    /// </summary>
    /// <param name="info">Connection details.</param>
    /// <param name="ownedResources">
    /// Receives every disposable key file and authentication method created here;
    /// the caller disposes them once the client is no longer needed.
    /// </param>
    /// <exception cref="InvalidOperationException">No authentication method is available.</exception>
    private static SshClient CreateSshClient(SshConnectionInfo info, List<IDisposable> ownedResources)
    {
        var authMethods = new List<AuthenticationMethod>();

        if (!string.IsNullOrEmpty(info.KeyPath))
        {
            authMethods.Add(CreateKeyAuth(info.Username, info.KeyPath, ownedResources));
        }

        if (!string.IsNullOrEmpty(info.Password))
        {
            var passwordAuth = new PasswordAuthenticationMethod(info.Username, info.Password);
            ownedResources.Add(passwordAuth);
            authMethods.Add(passwordAuth);
        }

        if (authMethods.Count == 0)
        {
            var defaultKeyPath = Path.Combine(
                Environment.GetFolderPath(Environment.SpecialFolder.UserProfile), ".ssh", "id_rsa");
            if (!File.Exists(defaultKeyPath))
            {
                throw new InvalidOperationException($"No SSH auth method for {info.Host}:{info.Port}.");
            }

            authMethods.Add(CreateKeyAuth(info.Username, defaultKeyPath, ownedResources));
        }

        var connInfo = new Renci.SshNet.ConnectionInfo(info.Host, info.Port, info.Username, authMethods.ToArray());
        return new SshClient(connInfo);
    }

    /// <summary>
    /// Creates a private-key authentication method and registers it and its key
    /// file for later disposal.
    /// </summary>
    private static PrivateKeyAuthenticationMethod CreateKeyAuth(
        string username, string keyPath, List<IDisposable> ownedResources)
    {
        var keyFile = new PrivateKeyFile(keyPath);
        ownedResources.Add(keyFile);

        var keyAuth = new PrivateKeyAuthenticationMethod(username, keyFile);
        ownedResources.Add(keyAuth);
        return keyAuth;
    }

    /// <summary>Runs a command on the connected client and returns its standard output.</summary>
    /// <exception cref="InvalidOperationException">The command exited with a non-zero status.</exception>
    private static string RunSshCommand(SshClient client, string command)
    {
        using var cmd = client.RunCommand(command);
        if (cmd.ExitStatus != 0)
        {
            throw new InvalidOperationException($"SSH command failed (exit {cmd.ExitStatus}): {cmd.Error}");
        }

        return cmd.Result;
    }

    /// <summary>SSH connection details read from settings.</summary>
    internal sealed record SshConnectionInfo(string Host, int Port, string Username, string? KeyPath, string? Password);
}