Files
OTSSignsOrchestrator/OTSSignsOrchestrator.Desktop/Services/SshDockerCliService.cs
Matt Batchelder adf1a2e4db
Some checks failed
Build and Publish Docker Image / build-and-push (push) Has been cancelled
Add WAL file for database and log instance deployment failures
2026-02-19 08:27:54 -05:00

513 lines
20 KiB
C#

using System.Diagnostics;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using MySqlConnector;
using OTSSignsOrchestrator.Core.Configuration;
using OTSSignsOrchestrator.Core.Models.DTOs;
using OTSSignsOrchestrator.Core.Models.Entities;
using OTSSignsOrchestrator.Core.Services;
namespace OTSSignsOrchestrator.Desktop.Services;
/// <summary>
/// Docker CLI service that executes docker commands on a remote host over SSH.
/// Requires an SshHost to be set before use via SetHost().
/// </summary>
public class SshDockerCliService : IDockerCliService
{
private readonly SshConnectionService _ssh;
private readonly DockerOptions _options;
private readonly ILogger<SshDockerCliService> _logger;
private SshHost? _currentHost;
public SshDockerCliService(
SshConnectionService ssh,
IOptions<DockerOptions> options,
ILogger<SshDockerCliService> logger)
{
_ssh = ssh;
_options = options.Value;
_logger = logger;
}
/// <summary>
/// Set the SSH host to use for Docker commands.
/// </summary>
public void SetHost(SshHost host)
{
_currentHost = host;
}
public SshHost? CurrentHost => _currentHost;
public async Task<DeploymentResultDto> DeployStackAsync(string stackName, string composeYaml, bool resolveImage = false)
{
EnsureHost();
var sw = Stopwatch.StartNew();
var args = "docker stack deploy --compose-file -";
if (resolveImage)
args += " --resolve-image changed";
args += $" {stackName}";
_logger.LogInformation("Deploying stack via SSH: {StackName} on {Host}", stackName, _currentHost!.Label);
var (exitCode, stdout, stderr) = await _ssh.RunCommandWithStdinAsync(_currentHost, args, composeYaml);
sw.Stop();
var result = new DeploymentResultDto
{
StackName = stackName,
Success = exitCode == 0,
ExitCode = exitCode,
Output = stdout,
ErrorMessage = stderr,
Message = exitCode == 0 ? "Success" : "Failed",
DurationMs = sw.ElapsedMilliseconds
};
if (result.Success)
_logger.LogInformation("Stack deployed via SSH: {StackName} | duration={DurationMs}ms", stackName, result.DurationMs);
else
_logger.LogError("Stack deploy failed via SSH: {StackName} | error={Error}", stackName, result.ErrorMessage);
return result;
}
public async Task<DeploymentResultDto> RemoveStackAsync(string stackName)
{
EnsureHost();
var sw = Stopwatch.StartNew();
_logger.LogInformation("Removing stack via SSH: {StackName} on {Host}", stackName, _currentHost!.Label);
var (exitCode, stdout, stderr) = await _ssh.RunCommandAsync(_currentHost, $"docker stack rm {stackName}");
sw.Stop();
var result = new DeploymentResultDto
{
StackName = stackName,
Success = exitCode == 0,
ExitCode = exitCode,
Output = stdout,
ErrorMessage = stderr,
Message = exitCode == 0 ? "Success" : "Failed",
DurationMs = sw.ElapsedMilliseconds
};
if (result.Success)
_logger.LogInformation("Stack removed via SSH: {StackName}", stackName);
else
_logger.LogError("Stack remove failed via SSH: {StackName} | error={Error}", stackName, result.ErrorMessage);
return result;
}
public async Task<List<StackInfo>> ListStacksAsync()
{
EnsureHost();
var (exitCode, stdout, _) = await _ssh.RunCommandAsync(
_currentHost!, "docker stack ls --format '{{.Name}}\\t{{.Services}}'");
if (exitCode != 0 || string.IsNullOrWhiteSpace(stdout))
return new List<StackInfo>();
return stdout
.Split('\n', StringSplitOptions.RemoveEmptyEntries)
.Select(line =>
{
var parts = line.Split('\t', 2);
return new StackInfo
{
Name = parts[0].Trim(),
ServiceCount = parts.Length > 1 && int.TryParse(parts[1].Trim(), out var c) ? c : 0
};
})
.ToList();
}
public async Task<List<ServiceInfo>> InspectStackServicesAsync(string stackName)
{
EnsureHost();
var (exitCode, stdout, _) = await _ssh.RunCommandAsync(
_currentHost!, $"docker stack services {stackName} --format '{{{{.Name}}}}\\t{{{{.Image}}}}\\t{{{{.Replicas}}}}'");
if (exitCode != 0 || string.IsNullOrWhiteSpace(stdout))
return new List<ServiceInfo>();
return stdout
.Split('\n', StringSplitOptions.RemoveEmptyEntries)
.Select(line =>
{
var parts = line.Split('\t', 3);
return new ServiceInfo
{
Name = parts.Length > 0 ? parts[0].Trim() : "",
Image = parts.Length > 1 ? parts[1].Trim() : "",
Replicas = parts.Length > 2 ? parts[2].Trim() : ""
};
})
.ToList();
}
public async Task<bool> EnsureDirectoryAsync(string path)
{
EnsureHost();
var (exitCode, _, stderr) = await _ssh.RunCommandAsync(_currentHost!, $"mkdir -p {path}");
if (exitCode != 0)
_logger.LogWarning("Failed to create directory {Path} on {Host}: {Error}", path, _currentHost!.Label, stderr);
else
_logger.LogInformation("Ensured directory exists on {Host}: {Path}", _currentHost!.Label, path);
return exitCode == 0;
}
public async Task<bool> EnsureNfsFoldersAsync(
string nfsServer,
string nfsExport,
IEnumerable<string> folderNames,
string? nfsExportFolder = null)
{
EnsureHost();
var exportPath = (nfsExport ?? string.Empty).Trim('/');
var subFolder = (nfsExportFolder ?? string.Empty).Trim('/');
// Build the sub-path beneath the mount point where volume folders will be created
var subPath = string.IsNullOrEmpty(subFolder) ? string.Empty : $"/{subFolder}";
// Build mkdir targets relative to the temporary mount point
var folderList = folderNames.Select(f => $"\"$MNT{subPath}/{f}\"").ToList();
var mkdirTargets = string.Join(" ", folderList);
// Single SSH command: create temp dir, mount NFS, mkdir -p all folders, unmount, cleanup
// Use addr= to pin the server IP — avoids "Server address does not match proto= option"
// errors when the hostname resolves to IPv6 but proto=tcp implies IPv4.
var script = $"""
set -e
MNT=$(mktemp -d)
sudo mount -t nfs -o addr={nfsServer},nfsvers=4,proto=tcp,soft,timeo=50,retrans=2 {nfsServer}:/{exportPath} "$MNT"
sudo mkdir -p {mkdirTargets}
sudo umount "$MNT"
rmdir "$MNT"
""";
_logger.LogInformation(
"Mounting NFS export {Server}:/{Export} on Docker host {Host} to create {Count} folders",
nfsServer, exportPath, _currentHost!.Label, folderList.Count);
var (exitCode, stdout, stderr) = await _ssh.RunCommandAsync(_currentHost!, script, TimeSpan.FromSeconds(30));
if (exitCode == 0)
{
_logger.LogInformation(
"NFS export folders ensured via mount on {Host}: {Server}:/{Export}{Sub} ({Count} folders)",
_currentHost.Label, nfsServer, exportPath, subPath, folderList.Count);
}
else
{
_logger.LogWarning(
"Failed to create NFS export folders on {Host}: {Error}",
_currentHost.Label, (stderr ?? stdout ?? "unknown error").Trim());
return false;
}
return true;
}
public async Task<(bool Success, string? Error)> EnsureNfsFoldersWithErrorAsync(
string nfsServer,
string nfsExport,
IEnumerable<string> folderNames,
string? nfsExportFolder = null)
{
EnsureHost();
var exportPath = (nfsExport ?? string.Empty).Trim('/');
var subFolder = (nfsExportFolder ?? string.Empty).Trim('/');
var subPath = string.IsNullOrEmpty(subFolder) ? string.Empty : $"/{subFolder}";
var folderList = folderNames.Select(f => $"\"$MNT{subPath}/{f}\"").ToList();
var mkdirTargets = string.Join(" ", folderList);
var script = $"""
set -e
MNT=$(mktemp -d)
sudo mount -t nfs -o addr={nfsServer},nfsvers=4,proto=tcp,soft,timeo=50,retrans=2 {nfsServer}:/{exportPath} "$MNT"
sudo mkdir -p {mkdirTargets}
sudo umount "$MNT"
rmdir "$MNT"
""";
_logger.LogInformation(
"Mounting NFS export {Server}:/{Export} on Docker host {Host} to create {Count} folders",
nfsServer, exportPath, _currentHost!.Label, folderList.Count);
var (exitCode, stdout, stderr) = await _ssh.RunCommandAsync(_currentHost!, script, TimeSpan.FromSeconds(30));
if (exitCode == 0)
{
_logger.LogInformation(
"NFS export folders ensured via mount on {Host}: {Server}:/{Export}{Sub} ({Count} folders)",
_currentHost.Label, nfsServer, exportPath, subPath, folderList.Count);
return (true, null);
}
var error = (stderr ?? stdout ?? "unknown error").Trim();
_logger.LogWarning(
"Failed to create NFS export folders on {Host}: {Error}",
_currentHost.Label, error);
return (false, error);
}
public async Task<bool> ForceUpdateServiceAsync(string serviceName)
{
EnsureHost();
_logger.LogInformation("Force-updating service {ServiceName} on {Host}", serviceName, _currentHost!.Label);
var (exitCode, _, stderr) = await _ssh.RunCommandAsync(_currentHost!, $"docker service update --force {serviceName}");
if (exitCode != 0)
_logger.LogWarning("Force-update failed for {ServiceName}: {Error}", serviceName, stderr);
return exitCode == 0;
}
public async Task<(MySqlConnection Connection, IDisposable Tunnel)> OpenMySqlConnectionAsync(
string mysqlHost, int port,
string adminUser, string adminPassword)
{
EnsureHost();
_logger.LogInformation(
"Opening tunnelled MySQL connection to {MysqlHost}:{Port} via SSH",
mysqlHost, port);
var tunnel = _ssh.OpenForwardedPort(_currentHost!, mysqlHost, (uint)port);
var localPort = (int)tunnel.BoundPort;
var csb = new MySqlConnectionStringBuilder
{
Server = "127.0.0.1",
Port = (uint)localPort,
UserID = adminUser,
Password = adminPassword,
ConnectionTimeout = 15,
SslMode = MySqlSslMode.Disabled,
};
var connection = new MySqlConnection(csb.ConnectionString);
try
{
await connection.OpenAsync();
return (connection, tunnel);
}
catch
{
await connection.DisposeAsync();
tunnel.Dispose();
throw;
}
}
public async Task<(bool Success, string Error)> AlterMySqlUserPasswordAsync(
string mysqlHost, int port,
string adminUser, string adminPassword,
string targetUser, string newPassword)
{
_logger.LogInformation(
"Altering MySQL password for user {User} on {MysqlHost}:{Port} via SSH tunnel",
targetUser, mysqlHost, port);
try
{
var (connection, tunnel) = await OpenMySqlConnectionAsync(mysqlHost, port, adminUser, adminPassword);
await using (connection)
using (tunnel)
{
var escapedUser = targetUser.Replace("'", "''");
await using var cmd = connection.CreateCommand();
cmd.CommandText = $"ALTER USER '{escapedUser}'@'%' IDENTIFIED BY @pwd";
cmd.Parameters.AddWithValue("@pwd", newPassword);
await cmd.ExecuteNonQueryAsync();
}
_logger.LogInformation("MySQL password updated for user {User} via SSH tunnel", targetUser);
return (true, string.Empty);
}
catch (MySqlException ex)
{
_logger.LogError(ex, "MySQL ALTER USER failed via SSH tunnel for user {User}", targetUser);
return (false, ex.Message);
}
}
public async Task<bool> ServiceSwapSecretAsync(string serviceName, string oldSecretName, string newSecretName, string? targetAlias = null)
{
EnsureHost();
var target = targetAlias ?? oldSecretName;
var cmd = $"docker service update --secret-rm {oldSecretName} --secret-add \"source={newSecretName},target={target}\" {serviceName}";
_logger.LogInformation(
"Swapping secret on {ServiceName}: {OldSecret} → {NewSecret} (target={Target})",
serviceName, oldSecretName, newSecretName, target);
var (exitCode, _, stderr) = await _ssh.RunCommandAsync(_currentHost!, cmd);
if (exitCode != 0)
_logger.LogError("Secret swap failed for {ServiceName}: {Error}", serviceName, stderr);
return exitCode == 0;
}
public async Task<List<NodeInfo>> ListNodesAsync()
{
EnsureHost();
_logger.LogInformation("Listing swarm nodes via SSH on {Host}", _currentHost!.Label);
// Use docker node inspect on all nodes to get IP addresses (Status.Addr)
// that are not available from 'docker node ls'.
// First, get all node IDs.
var (lsExit, lsOut, lsErr) = await _ssh.RunCommandAsync(
_currentHost!, "docker node ls --format '{{.ID}}'");
if (lsExit != 0)
{
var msg = (lsErr ?? lsOut ?? "unknown error").Trim();
_logger.LogWarning("docker node ls failed on {Host} (exit {Code}): {Error}",
_currentHost.Label, lsExit, msg);
throw new InvalidOperationException(
$"Failed to list swarm nodes on {_currentHost.Label}: {msg}");
}
if (string.IsNullOrWhiteSpace(lsOut))
return new List<NodeInfo>();
var nodeIds = lsOut.Split('\n', StringSplitOptions.RemoveEmptyEntries)
.Select(id => id.Trim())
.Where(id => !string.IsNullOrEmpty(id))
.ToList();
if (nodeIds.Count == 0)
return new List<NodeInfo>();
// Inspect all nodes in a single call to get full details including IP address
var ids = string.Join(" ", nodeIds);
var format = "'{{.ID}}\t{{.Description.Hostname}}\t{{.Status.State}}\t{{.Spec.Availability}}\t{{.ManagerStatus.Addr}}\t{{.Status.Addr}}\t{{.Description.Engine.EngineVersion}}\t{{.Spec.Role}}'";
var (exitCode, stdout, stderr) = await _ssh.RunCommandAsync(
_currentHost!, $"docker node inspect --format {format} {ids}");
if (exitCode != 0)
{
var msg = (stderr ?? stdout ?? "unknown error").Trim();
_logger.LogWarning("docker node inspect failed on {Host} (exit {Code}): {Error}",
_currentHost.Label, exitCode, msg);
throw new InvalidOperationException(
$"Failed to inspect swarm nodes on {_currentHost.Label}: {msg}");
}
if (string.IsNullOrWhiteSpace(stdout))
return new List<NodeInfo>();
return stdout
.Split('\n', StringSplitOptions.RemoveEmptyEntries)
.Select(line =>
{
var parts = line.Split('\t', 8);
// ManagerStatus.Addr includes port (e.g. "10.0.0.1:2377"); Status.Addr is just the IP.
// Prefer Status.Addr; fall back to ManagerStatus.Addr (strip port) if Status.Addr is empty/template-error.
var statusAddr = parts.Length > 5 ? parts[5].Trim() : "";
var managerAddr = parts.Length > 4 ? parts[4].Trim() : "";
var ip = statusAddr;
if (string.IsNullOrEmpty(ip) || ip.StartsWith("<") || ip.StartsWith("{"))
{
// managerAddr may be "10.0.0.1:2377"
ip = managerAddr.Contains(':') ? managerAddr[..managerAddr.LastIndexOf(':')] : managerAddr;
}
// Clean up template rendering artefacts like "<no value>"
if (ip.StartsWith("<") || ip.StartsWith("{"))
ip = "";
var role = parts.Length > 7 ? parts[7].Trim() : "";
var managerStatus = "";
if (string.Equals(role, "manager", StringComparison.OrdinalIgnoreCase))
{
// Determine if this is the leader by checking if ManagerStatus.Addr is non-empty
managerStatus = !string.IsNullOrEmpty(managerAddr) && !managerAddr.StartsWith("<") ? "Reachable" : "";
}
return new NodeInfo
{
Id = parts.Length > 0 ? parts[0].Trim() : "",
Hostname = parts.Length > 1 ? parts[1].Trim() : "",
Status = parts.Length > 2 ? parts[2].Trim() : "",
Availability = parts.Length > 3 ? parts[3].Trim() : "",
ManagerStatus = managerStatus,
IpAddress = ip,
EngineVersion = parts.Length > 6 ? parts[6].Trim() : ""
};
})
.ToList();
}
private void EnsureHost()
{
if (_currentHost == null)
throw new InvalidOperationException("No SSH host configured. Call SetHost() before using Docker commands.");
}
public async Task<bool> RemoveStackVolumesAsync(string stackName)
{
EnsureHost();
// ── 1. Remove the stack first so containers release the volumes ─────
_logger.LogInformation("Removing stack {StackName} before volume cleanup", stackName);
var (rmExit, _, rmErr) = await _ssh.RunCommandAsync(_currentHost!,
$"docker stack rm {stackName} 2>&1 || true");
if (rmExit != 0)
_logger.LogWarning("Stack rm returned non-zero for {StackName}: {Err}", stackName, rmErr);
// Give Swarm a moment to tear down containers on all nodes
await Task.Delay(5000);
// ── 2. Clean volumes on the local (manager) node ────────────────────
var localCmd = $"docker volume ls --filter \"name={stackName}_\" -q | xargs -r docker volume rm 2>&1 || true";
var (_, localOut, _) = await _ssh.RunCommandAsync(_currentHost!, localCmd);
if (!string.IsNullOrEmpty(localOut?.Trim()))
_logger.LogInformation("Volume cleanup (manager): {Output}", localOut!.Trim());
// ── 3. Clean volumes on ALL swarm nodes via a temporary global service ──
// This deploys a short-lived container on every node that mounts the Docker
// socket and removes matching volumes. This handles worker nodes that the
// orchestrator has no direct SSH access to.
var cleanupSvcName = $"vol-cleanup-{stackName}".Replace("_", "-");
// Remove leftover cleanup service from a previous run (if any)
await _ssh.RunCommandAsync(_currentHost!,
$"docker service rm {cleanupSvcName} 2>/dev/null || true");
var createCmd = string.Join(" ",
"docker service create",
"--detach",
"--mode global",
"--restart-condition none",
$"--name {cleanupSvcName}",
"--mount type=bind,source=/var/run/docker.sock,target=/var/run/docker.sock",
"docker:cli",
"sh", "-c",
$"'docker volume ls -q --filter name={stackName}_ | xargs -r docker volume rm 2>&1; echo done'");
_logger.LogInformation("Deploying global volume-cleanup service on all swarm nodes for {StackName}", stackName);
var (svcExit, svcOut, svcErr) = await _ssh.RunCommandAsync(_currentHost!, createCmd);
if (svcExit != 0)
{
_logger.LogWarning("Global volume cleanup service creation failed: {Err}", svcErr);
}
else
{
// Wait for the cleanup tasks to finish on all nodes
_logger.LogInformation("Waiting for volume cleanup tasks to complete on all nodes...");
await Task.Delay(10000);
}
// Remove the cleanup service
await _ssh.RunCommandAsync(_currentHost!,
$"docker service rm {cleanupSvcName} 2>/dev/null || true");
_logger.LogInformation("Volume cleanup complete for stack {StackName}", stackName);
return true;
}
}