using Renci.SshNet;
using OTSSignsOrchestrator.Server.Data.Entities;

namespace OTSSignsOrchestrator.Server.Health.Checks;

/// <summary>
/// Verifies the Docker stack is healthy by running <c>docker stack ps {stackName}</c>
/// via SSH and checking that every task whose desired state is Running also
/// reports a Running current state.
/// </summary>
/// <remarks>
/// Tasks whose desired state is anything other than Running are ignored, so a stack
/// with no such tasks in the command output is reported as healthy.
/// </remarks>
public sealed class StackHealthCheck : IHealthCheck
{
    // Go template handed to `docker stack ps --format`; yields one
    // "name|current state|desired state" line per task.
    private const string TaskFormat = "{{.Name}}|{{.CurrentState}}|{{.DesiredState}}";

    private readonly IServiceProvider _services;
    private readonly ILogger<StackHealthCheck> _logger;

    /// <summary>Identifier of this check.</summary>
    public string CheckName => "StackHealth";

    /// <summary>Always <c>false</c> for this check.</summary>
    public bool AutoRemediate => false;

    /// <summary>Creates the check.</summary>
    /// <param name="services">Provider used to resolve the settings service on each run.</param>
    /// <param name="logger">Logger that receives details of failed checks.</param>
    public StackHealthCheck(
        IServiceProvider services,
        ILogger<StackHealthCheck> logger)
    {
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// Connects to the configured Swarm host over SSH and inspects the task list of the
    /// instance's Docker stack.
    /// </summary>
    /// <param name="instance">Instance whose <c>DockerStackName</c> is inspected.</param>
    /// <param name="ct">
    /// Cancellation token. It is not observed: the SSH connect and command calls used
    /// here are synchronous.
    /// </param>
    /// <returns>
    /// Healthy when no task with desired state Running is in another state; Critical when
    /// the stack name is missing, when at least one such task is not running (the detail
    /// text lists them one per line), or when anything in the SSH round trip throws.
    /// </returns>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        var stackName = instance.DockerStackName;
        if (string.IsNullOrEmpty(stackName))
        {
            return new HealthCheckResult(HealthStatus.Critical, "No Docker stack name configured");
        }

        try
        {
            var settings = _services.GetRequiredService<Core.Services.SettingsService>();
            var sshInfo = await GetSwarmSshHostAsync(settings);

            using var sshClient = CreateSshClient(sshInfo);
            sshClient.Connect();

            try
            {
                // The stack name is quoted so that shell metacharacters in it cannot
                // alter the command executed on the Swarm host.
                var output = RunSshCommand(
                    sshClient,
                    $"docker stack ps {ShellQuote(stackName)} --format '{TaskFormat}'");

                var notRunning = FindTasksNotRunning(output);

                if (notRunning.Count == 0)
                {
                    return new HealthCheckResult(
                        HealthStatus.Healthy,
                        $"All services in {stackName} are Running");
                }

                return new HealthCheckResult(
                    HealthStatus.Critical,
                    $"{notRunning.Count} service(s) not running in {stackName}",
                    string.Join("\n", notRunning));
            }
            finally
            {
                sshClient.Disconnect();
            }
        }
        catch (Exception ex)
        {
            // The result only carries ex.Message; log the full exception so the
            // stack trace is not lost.
            _logger.LogWarning(ex, "Stack health check failed for {StackName}", stackName);

            return new HealthCheckResult(
                HealthStatus.Critical,
                $"SSH check failed for {stackName}: {ex.Message}");
        }
    }

    /// <summary>
    /// Parses <c>name|current|desired</c> lines and returns a <c>"name: current"</c> entry
    /// for each task that should be running but is not.
    /// </summary>
    /// <param name="output">Raw standard output of the <c>docker stack ps</c> command.</param>
    private static List<string> FindTasksNotRunning(string output)
    {
        var notRunning = new List<string>();

        foreach (var line in output.Split('\n', StringSplitOptions.RemoveEmptyEntries))
        {
            var parts = line.Split('|');
            if (parts.Length < 3)
            {
                // Not a task line in the expected three-field shape; skip it.
                continue;
            }

            var name = parts[0].Trim();
            var currentState = parts[1].Trim();
            var desiredState = parts[2].Trim();

            // Only check tasks whose desired state is Running. The current state is
            // matched by prefix, so any text following "Running" is accepted.
            if (desiredState.Equals("Running", StringComparison.OrdinalIgnoreCase)
                && !currentState.StartsWith("Running", StringComparison.OrdinalIgnoreCase))
            {
                notRunning.Add($"{name}: {currentState}");
            }
        }

        return notRunning;
    }

    /// <summary>
    /// Wraps <paramref name="value"/> in single quotes for a POSIX shell, escaping any
    /// embedded single quote as <c>'\''</c>.
    /// </summary>
    private static string ShellQuote(string value) =>
        "'" + value.Replace("'", "'\\''") + "'";

    /// <summary>
    /// Reads the <c>Ssh.Swarm*</c> settings and builds the connection description.
    /// </summary>
    /// <param name="settings">Settings service queried for host, port, user, key path and password.</param>
    /// <returns>Connection details; the port falls back to 22 when the stored value is not an integer.</returns>
    /// <exception cref="InvalidOperationException">The <c>Ssh.SwarmHost</c> setting is missing.</exception>
    private static async Task<SshConnectionInfo> GetSwarmSshHostAsync(Core.Services.SettingsService settings)
    {
        var host = await settings.GetAsync("Ssh.SwarmHost")
            ?? throw new InvalidOperationException("SSH Swarm host not configured.");
        var portStr = await settings.GetAsync("Ssh.SwarmPort", "22");
        var user = await settings.GetAsync("Ssh.SwarmUser", "root");
        var keyPath = await settings.GetAsync("Ssh.SwarmKeyPath");
        var password = await settings.GetAsync("Ssh.SwarmPassword");

        if (!int.TryParse(portStr, out var port))
        {
            port = 22;
        }

        return new SshConnectionInfo(host, port, user, keyPath, password);
    }

    /// <summary>
    /// Builds an unconnected <see cref="SshClient"/> for <paramref name="info"/>.
    /// </summary>
    /// <remarks>
    /// The configured key file and password are both offered when present. When neither
    /// is configured, <c>~/.ssh/id_rsa</c> of the current user profile is used if it exists.
    /// </remarks>
    /// <exception cref="InvalidOperationException">No authentication method could be built.</exception>
    private static SshClient CreateSshClient(SshConnectionInfo info)
    {
        var authMethods = new List<AuthenticationMethod>();

        if (!string.IsNullOrEmpty(info.KeyPath))
        {
            authMethods.Add(new PrivateKeyAuthenticationMethod(info.Username, new PrivateKeyFile(info.KeyPath)));
        }

        if (!string.IsNullOrEmpty(info.Password))
        {
            authMethods.Add(new PasswordAuthenticationMethod(info.Username, info.Password));
        }

        if (authMethods.Count == 0)
        {
            var defaultKeyPath = Path.Combine(
                Environment.GetFolderPath(Environment.SpecialFolder.UserProfile), ".ssh", "id_rsa");

            if (File.Exists(defaultKeyPath))
            {
                authMethods.Add(new PrivateKeyAuthenticationMethod(info.Username, new PrivateKeyFile(defaultKeyPath)));
            }
            else
            {
                throw new InvalidOperationException($"No SSH auth method for {info.Host}:{info.Port}.");
            }
        }

        var connInfo = new Renci.SshNet.ConnectionInfo(info.Host, info.Port, info.Username, authMethods.ToArray());
        return new SshClient(connInfo);
    }

    /// <summary>
    /// Runs <paramref name="command"/> on the connected client and returns its standard output.
    /// </summary>
    /// <exception cref="InvalidOperationException">The command exited with a non-zero status.</exception>
    private static string RunSshCommand(SshClient client, string command)
    {
        using var cmd = client.RunCommand(command);
        if (cmd.ExitStatus != 0)
        {
            throw new InvalidOperationException($"SSH command failed (exit {cmd.ExitStatus}): {cmd.Error}");
        }

        return cmd.Result;
    }

    /// <summary>Connection details for the Swarm host, as read from settings.</summary>
    internal sealed record SshConnectionInfo(string Host, int Port, string Username, string? KeyPath, string? Password);
}