feat: Implement provisioning pipelines for subscription management
- Add ReactivatePipeline to handle subscription reactivation, including scaling Docker services, health verification, status updates, audit logging, and broadcasting status changes.
- Introduce RotateCredentialsPipeline for OAuth2 credential rotation, managing the deletion of old apps, creation of new ones, credential storage, access verification, and audit logging.
- Create StepRunner to manage job step execution, including lifecycle management and progress broadcasting via SignalR.
- Implement SuspendPipeline for subscription suspension, scaling down services, updating statuses, logging audits, and broadcasting changes.
- Add UpdateScreenLimitPipeline to update Xibo CMS screen limits and record snapshots.
- Introduce XiboFeatureManifests for hardcoded feature ACLs per role.
- Add docker-compose.dev.yml for local development with PostgreSQL setup.
This commit is contained in:
@@ -0,0 +1,37 @@
|
||||
using Quartz;
|
||||
using OTSSignsOrchestrator.Server.Health.Checks;
|
||||
|
||||
namespace OTSSignsOrchestrator.Server.Health;
|
||||
|
||||
/// <summary>
/// Quartz job wrapper around <see cref="AuthentikGlobalHealthCheck"/>. The check is global
/// rather than per-instance, so it is triggered on its own schedule, separate from the
/// per-instance health checks (documented as every 2 minutes; the trigger itself is
/// configured outside this class).
/// </summary>
[DisallowConcurrentExecution]
public sealed class AuthentikGlobalHealthJob(
    AuthentikGlobalHealthCheck check,
    ILogger<AuthentikGlobalHealthJob> logger) : IJob
{
    /// <summary>
    /// Runs the global Authentik check once and logs its status and message.
    /// Any exception is logged and swallowed so it does not propagate to the scheduler.
    /// </summary>
    /// <param name="context">Quartz execution context; its cancellation token is forwarded to the check.</param>
    public async Task Execute(IJobExecutionContext context)
    {
        try
        {
            var outcome = await check.RunGlobalAsync(context.CancellationToken);

            logger.LogInformation(
                "Authentik global health: {Status} — {Message}",
                outcome.Status,
                outcome.Message);
        }
        catch (Exception ex)
        {
            logger.LogError(ex, "Authentik global health job failed");
        }
    }
}
|
||||
@@ -0,0 +1,183 @@
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using OTSSignsOrchestrator.Server.Clients;
|
||||
using OTSSignsOrchestrator.Server.Data;
|
||||
using OTSSignsOrchestrator.Server.Data.Entities;
|
||||
|
||||
namespace OTSSignsOrchestrator.Server.Health.Checks;
|
||||
|
||||
/// <summary>
/// Verifies that both <c>ots-admin-{abbrev}</c> and <c>ots-svc-{abbrev}</c> exist
/// with <c>userTypeId == 1</c> (SuperAdmin). MUST use <see cref="XiboApiClientExtensions.GetAllPagesAsync{T}"/>
/// because Xibo paginates at 10 items by default.
///
/// <c>saml-usertypeid</c> is JIT-only and does NOT maintain SuperAdmin on existing users —
/// this check IS the ongoing enforcement mechanism.
/// </summary>
public sealed class AdminIntegrityHealthCheck : IHealthCheck
{
    /// <summary>Actor recorded on every audit row written by this check.</summary>
    private const string AuditActor = "HealthCheckEngine:AdminIntegrity";

    private readonly XiboClientFactory _clientFactory;
    private readonly IServiceProvider _services;
    private readonly ILogger<AdminIntegrityHealthCheck> _logger;

    public string CheckName => "AdminIntegrity";
    public bool AutoRemediate => true;

    public AdminIntegrityHealthCheck(
        XiboClientFactory clientFactory,
        IServiceProvider services,
        ILogger<AdminIntegrityHealthCheck> logger)
    {
        _clientFactory = clientFactory;
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// Lists all Xibo users and reports <see cref="HealthStatus.Critical"/> when either
    /// expected account is missing or carries a <c>userTypeId</c> other than 1.
    /// An account whose record has no <c>userTypeId</c> key at all is treated as intact.
    /// </summary>
    /// <param name="instance">Instance whose customer abbreviation and OAuth app are used.</param>
    /// <param name="ct">Cancellation token (not consumed by the calls made here).</param>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        var (client, abbrev) = await ResolveAsync(instance);
        if (client is null)
            return new HealthCheckResult(HealthStatus.Critical, "No OAuth app — cannot verify admin accounts");

        var users = await client.GetAllPagesAsync(
            (start, length) => client.GetUsersAsync(start, length));

        var problems = new List<string>();

        foreach (var expected in ExpectedAccounts(abbrev))
        {
            var user = users.FirstOrDefault(u =>
                u.TryGetValue("userName", out var n) &&
                string.Equals(n?.ToString(), expected, StringComparison.OrdinalIgnoreCase));

            if (user is null)
            {
                problems.Add($"{expected} is MISSING");
                continue;
            }

            if (user.TryGetValue("userTypeId", out var typeObj) &&
                typeObj?.ToString() != "1")
            {
                problems.Add($"{expected} has userTypeId={typeObj} (expected 1)");
            }
        }

        if (problems.Count == 0)
            return new HealthCheckResult(HealthStatus.Healthy, "Admin accounts intact");

        return new HealthCheckResult(
            HealthStatus.Critical,
            $"Admin integrity issues: {string.Join("; ", problems)}",
            string.Join("\n", problems));
    }

    /// <summary>
    /// Recreates missing accounts as SuperAdmin and resets a wrong <c>userTypeId</c> to 1.
    /// Each successful change is audited and saved immediately, so a failure on a later
    /// account cannot discard the audit trail of an earlier remote change.
    /// </summary>
    /// <param name="instance">Instance to remediate.</param>
    /// <param name="ct">Cancellation token passed to the audit-log save.</param>
    /// <returns><c>true</c> when every required fix succeeded; <c>false</c> otherwise.</returns>
    public async Task<bool> RemediateAsync(Instance instance, CancellationToken ct)
    {
        var (client, abbrev) = await ResolveAsync(instance);
        if (client is null) return false;

        await using var scope = _services.CreateAsyncScope();
        var db = scope.ServiceProvider.GetRequiredService<OrchestratorDbContext>();

        var users = await client.GetAllPagesAsync(
            (start, length) => client.GetUsersAsync(start, length));

        var allFixed = true;

        foreach (var expected in ExpectedAccounts(abbrev))
        {
            var user = users.FirstOrDefault(u =>
                u.TryGetValue("userName", out var n) &&
                string.Equals(n?.ToString(), expected, StringComparison.OrdinalIgnoreCase));

            if (user is null)
            {
                // Recreate missing account.
                // NOTE(review): the generated password is only sent to Xibo; nothing in this
                // method stores it. Confirm that is intended (e.g. the account is only used via SSO/OAuth).
                var email = $"{expected}@otssigns.internal";
                var password = GenerateRandomPassword(32);
                var createResp = await client.CreateUserAsync(new CreateUserRequest(
                    expected, email, password, UserTypeId: 1, HomePageId: 1));

                if (!createResp.IsSuccessStatusCode)
                {
                    _logger.LogError("Failed to recreate {User}: {Err}", expected, createResp.Error?.Content);
                    allFixed = false;
                    continue;
                }

                await RecordAuditAsync(db, instance, "RecreateUser", expected,
                    "User was missing — recreated as SuperAdmin", ct);
                continue;
            }

            // Nothing to fix when the type is already SuperAdmin (or not reported at all).
            if (!user.TryGetValue("userTypeId", out var typeObj) || typeObj?.ToString() == "1")
                continue;

            // A missing or malformed userId must not throw and abort the whole remediation;
            // treat it as "could not fix this account".
            if (!user.TryGetValue("userId", out var idObj) ||
                !int.TryParse(idObj?.ToString(), out var userId) ||
                userId == 0)
            {
                _logger.LogError("Cannot fix userTypeId for {User}: missing or invalid userId", expected);
                allFixed = false;
                continue;
            }

            var updateResp = await client.UpdateUserAsync(userId, new UpdateUserRequest(
                UserName: null, Email: null, Password: null, UserTypeId: 1,
                HomePageId: null, Retired: null));

            if (!updateResp.IsSuccessStatusCode)
            {
                _logger.LogError("Failed to fix userTypeId for {User}: {Err}",
                    expected, updateResp.Error?.Content);
                allFixed = false;
                continue;
            }

            await RecordAuditAsync(db, instance, "FixUserType", expected,
                $"Changed userTypeId from {typeObj} to 1 (SuperAdmin)", ct);
        }

        return allFixed;
    }

    /// <summary>Names of the two accounts that must exist for the given customer abbreviation.</summary>
    private static string[] ExpectedAccounts(string abbrev) =>
    [
        $"ots-admin-{abbrev}",
        $"ots-svc-{abbrev}",
    ];

    /// <summary>Adds one successful-remediation audit row and saves it straight away.</summary>
    private static async Task RecordAuditAsync(
        OrchestratorDbContext db,
        Instance instance,
        string action,
        string target,
        string detail,
        CancellationToken ct)
    {
        db.AuditLogs.Add(new AuditLog
        {
            Id = Guid.NewGuid(),
            InstanceId = instance.Id,
            Actor = AuditActor,
            Action = action,
            Target = target,
            Outcome = "Success",
            Detail = detail,
            OccurredAt = DateTime.UtcNow,
        });

        await db.SaveChangesAsync(ct);
    }

    /// <summary>
    /// Builds a Xibo API client for the instance from its first registered OAuth app and the
    /// stored client secret. Returns a <c>null</c> client when either is unavailable.
    /// </summary>
    private async Task<(IXiboApiClient? Client, string Abbrev)> ResolveAsync(Instance instance)
    {
        var abbrev = instance.Customer.Abbreviation;
        var oauthApp = instance.OauthAppRegistries.FirstOrDefault();
        if (oauthApp is null) return (null, abbrev);

        // NOTE(review): resolved from the injected provider rather than a scope —
        // confirm SettingsService's registered lifetime allows this.
        var settings = _services.GetRequiredService<Core.Services.SettingsService>();
        var secret = await settings.GetAsync(Core.Services.SettingsService.InstanceOAuthSecretId(abbrev));
        if (string.IsNullOrEmpty(secret)) return (null, abbrev);

        var client = await _clientFactory.CreateAsync(instance.XiboUrl, oauthApp.ClientId, secret);
        return (client, abbrev);
    }

    /// <summary>Generates a cryptographically random password of the requested length.</summary>
    private static string GenerateRandomPassword(int length)
    {
        const string chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*";
        return System.Security.Cryptography.RandomNumberGenerator.GetString(chars, length);
    }
}
|
||||
@@ -0,0 +1,107 @@
|
||||
using System.Diagnostics;
|
||||
using Microsoft.AspNetCore.SignalR;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using OTSSignsOrchestrator.Server.Clients;
|
||||
using OTSSignsOrchestrator.Server.Data;
|
||||
using OTSSignsOrchestrator.Server.Data.Entities;
|
||||
using OTSSignsOrchestrator.Server.Hubs;
|
||||
|
||||
namespace OTSSignsOrchestrator.Server.Health.Checks;
|
||||
|
||||
/// <summary>
/// Probes the central Authentik instance at <c>GET /api/v3/-/health/ready/</c>.
/// Measures latency and writes an <see cref="AuthentikMetrics"/> row.
/// If down: <c>Severity = Critical</c>, message "Central Authentik is DOWN — all customer web UI logins failing".
/// This is a fleet-wide P1 alert. Runs every 2 minutes on a separate schedule.
///
/// This check is NOT per-instance — it runs once globally. The engine skips it for
/// per-instance checks. Instead it is scheduled independently as a Quartz job.
/// </summary>
public sealed class AuthentikGlobalHealthCheck : IHealthCheck
{
    private const string DownMessage = "Central Authentik is DOWN — all customer web UI logins failing";

    private readonly IAuthentikClient _authentikClient;
    private readonly IServiceProvider _services;
    private readonly ILogger<AuthentikGlobalHealthCheck> _logger;

    public string CheckName => "AuthentikGlobal";
    public bool AutoRemediate => false;

    public AuthentikGlobalHealthCheck(
        IAuthentikClient authentikClient,
        IServiceProvider services,
        ILogger<AuthentikGlobalHealthCheck> logger)
    {
        _authentikClient = authentikClient;
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// <see cref="IHealthCheck"/> entry point. The instance is ignored; this simply
    /// delegates to <see cref="RunGlobalAsync"/>.
    /// </summary>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        // This check doesn't use the instance parameter — it checks global Authentik health.
        return await RunGlobalAsync(ct);
    }

    /// <summary>
    /// Core logic — callable from the Quartz job without an instance context.
    /// Calls the Authentik health endpoint, records a metrics row, and broadcasts an alert
    /// to all fleet hub clients when the result is Critical.
    /// </summary>
    /// <param name="ct">Cancellation token passed to the metrics save.</param>
    /// <returns>Healthy with the measured latency, or Critical when the probe fails or returns a non-success status.</returns>
    public async Task<HealthCheckResult> RunGlobalAsync(CancellationToken ct)
    {
        var sw = Stopwatch.StartNew();
        AuthentikMetricsStatus metricsStatus;
        string? errorMessage = null;
        HealthCheckResult result;

        try
        {
            var response = await _authentikClient.CheckHealthAsync();
            sw.Stop();

            if (response.IsSuccessStatusCode)
            {
                metricsStatus = AuthentikMetricsStatus.Healthy;
                result = new HealthCheckResult(HealthStatus.Healthy,
                    $"Authentik healthy (latency: {sw.ElapsedMilliseconds}ms)");
            }
            else
            {
                metricsStatus = AuthentikMetricsStatus.Critical;
                errorMessage = $"HTTP {response.StatusCode}";
                result = new HealthCheckResult(HealthStatus.Critical,
                    DownMessage,
                    $"Health endpoint returned {response.StatusCode}");
            }
        }
        catch (Exception ex)
        {
            sw.Stop();
            metricsStatus = AuthentikMetricsStatus.Critical;
            errorMessage = ex.Message;
            result = new HealthCheckResult(HealthStatus.Critical, DownMessage, ex.Message);
        }

        await using var scope = _services.CreateAsyncScope();

        // Metrics are secondary: a database problem must not prevent the alert below
        // from being raised or the result from being returned.
        await TryWriteMetricsAsync(
            scope.ServiceProvider, metricsStatus, (int)sw.ElapsedMilliseconds, errorMessage, ct);

        // Broadcast alert if critical
        if (result.Status == HealthStatus.Critical)
        {
            var hub = scope.ServiceProvider.GetRequiredService<IHubContext<FleetHub, IFleetClient>>();
            await hub.Clients.All.SendAlertRaised("Critical", result.Message);
        }

        return result;
    }

    /// <summary>
    /// Persists one <see cref="AuthentikMetrics"/> row. Failures other than cancellation
    /// are logged and swallowed.
    /// </summary>
    private async Task TryWriteMetricsAsync(
        IServiceProvider scoped,
        AuthentikMetricsStatus status,
        int latencyMs,
        string? errorMessage,
        CancellationToken ct)
    {
        try
        {
            var db = scoped.GetRequiredService<OrchestratorDbContext>();
            db.AuthentikMetrics.Add(new AuthentikMetrics
            {
                Id = Guid.NewGuid(),
                CheckedAt = DateTime.UtcNow,
                Status = status,
                LatencyMs = latencyMs,
                ErrorMessage = errorMessage,
            });
            await db.SaveChangesAsync(ct);
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            _logger.LogError(ex, "Failed to write Authentik metrics row");
        }
    }
}
|
||||
@@ -0,0 +1,60 @@
|
||||
using OTSSignsOrchestrator.Server.Clients;
|
||||
using OTSSignsOrchestrator.Server.Data.Entities;
|
||||
|
||||
namespace OTSSignsOrchestrator.Server.Health.Checks;
|
||||
|
||||
/// <summary>
/// Confirms that the SAML provider recorded for an instance can still be fetched from
/// Authentik, using the id stored in <see cref="Instance.AuthentikProviderId"/>.
/// </summary>
public sealed class AuthentikSamlProviderHealthCheck : IHealthCheck
{
    private readonly IAuthentikClient _authentikClient;
    private readonly ILogger<AuthentikSamlProviderHealthCheck> _logger;

    public string CheckName => "AuthentikSamlProvider";
    public bool AutoRemediate => false;

    public AuthentikSamlProviderHealthCheck(
        IAuthentikClient authentikClient,
        ILogger<AuthentikSamlProviderHealthCheck> logger)
        => (_authentikClient, _logger) = (authentikClient, logger);

    /// <summary>
    /// Degraded when no provider id is stored; Critical when the id is not an integer,
    /// the lookup does not succeed, returns no content, or throws; Healthy otherwise.
    /// </summary>
    /// <param name="instance">Instance whose stored provider id is checked.</param>
    /// <param name="ct">Cancellation token (not consumed by the calls made here).</param>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        var storedId = instance.AuthentikProviderId;

        if (string.IsNullOrEmpty(storedId))
            return new HealthCheckResult(HealthStatus.Degraded,
                "No Authentik provider ID stored — SAML not provisioned");

        if (!int.TryParse(storedId, out var providerId))
            return new HealthCheckResult(HealthStatus.Critical,
                $"Invalid Authentik provider ID: {storedId}");

        try
        {
            var lookup = await _authentikClient.GetSamlProviderAsync(providerId);
            var providerPresent = lookup.IsSuccessStatusCode && lookup.Content is not null;

            return providerPresent
                ? new HealthCheckResult(HealthStatus.Healthy,
                    $"SAML provider {providerId} is active in Authentik")
                : new HealthCheckResult(HealthStatus.Critical,
                    $"SAML provider {providerId} not found or inaccessible",
                    lookup.Error?.Content);
        }
        catch (Exception ex)
        {
            return new HealthCheckResult(HealthStatus.Critical,
                $"Failed to check SAML provider: {ex.Message}");
        }
    }
}
|
||||
@@ -0,0 +1,69 @@
|
||||
using OTSSignsOrchestrator.Server.Data.Entities;
|
||||
|
||||
namespace OTSSignsOrchestrator.Server.Health.Checks;
|
||||
|
||||
/// <summary>
/// For Pro plan BYOI customers: checks certificate expiry from <see cref="ByoiConfig"/>.
/// Reports Degraded at the 60-day and 30-day thresholds and Critical at 7 days or less
/// (including already-expired certificates).
/// AutoRemediate=false — customer must rotate their IdP certificate via the portal.
/// </summary>
public sealed class ByoiCertExpiryHealthCheck : IHealthCheck
{
    /// <summary>Alert thresholds in days (descending).</summary>
    internal static readonly int[] AlertThresholdDays = [60, 30, 7];

    /// <summary>Days at or below which severity escalates to Critical.</summary>
    internal const int CriticalThresholdDays = 7;

    public string CheckName => "ByoiCertExpiry";
    public bool AutoRemediate => false;

    /// <summary>
    /// Evaluates the first enabled BYOI config of the instance. Instances without an
    /// enabled config, or whose customer is not on the Pro plan, are reported Healthy
    /// as "not applicable".
    /// </summary>
    /// <param name="instance">Instance whose BYOI config and customer plan are inspected.</param>
    /// <param name="ct">Cancellation token (unused; the check is synchronous).</param>
    public Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        // Only applies to instances with an enabled BYOI config
        var byoiConfig = instance.ByoiConfigs.FirstOrDefault(b => b.Enabled);
        if (byoiConfig is null)
        {
            return Task.FromResult(new HealthCheckResult(HealthStatus.Healthy,
                "No BYOI config — check not applicable"));
        }

        // Only Pro customers have BYOI
        if (instance.Customer.Plan != CustomerPlan.Pro)
        {
            return Task.FromResult(new HealthCheckResult(HealthStatus.Healthy,
                "Non-Pro plan — BYOI check not applicable"));
        }

        var daysRemaining = (byoiConfig.CertExpiry - DateTime.UtcNow).TotalDays;
        return Task.FromResult(Classify(daysRemaining));
    }

    /// <summary>Maps the number of days until expiry to a health result.</summary>
    private static HealthCheckResult Classify(double daysRemaining)
    {
        if (daysRemaining <= 0)
        {
            return new HealthCheckResult(HealthStatus.Critical,
                $"BYOI certificate has EXPIRED (expired {Math.Abs((int)daysRemaining)} days ago)",
                "Customer must rotate their IdP certificate via the portal immediately");
        }

        if (daysRemaining <= CriticalThresholdDays)
        {
            return new HealthCheckResult(HealthStatus.Critical,
                $"BYOI certificate expires in {(int)daysRemaining} days",
                "Urgent: customer must rotate their IdP certificate");
        }

        // Walk the warning thresholds from tightest to loosest so the message names the
        // smallest threshold that has been crossed. Iterating the descending array directly
        // would always match the largest threshold and never report the 30-day one.
        foreach (var threshold in AlertThresholdDays
                     .Where(t => t > CriticalThresholdDays)
                     .OrderBy(t => t))
        {
            if (daysRemaining <= threshold)
            {
                return new HealthCheckResult(HealthStatus.Degraded,
                    $"BYOI certificate expires in {(int)daysRemaining} days (threshold: {threshold}d)",
                    "Customer should plan certificate rotation");
            }
        }

        return new HealthCheckResult(HealthStatus.Healthy,
            $"BYOI certificate valid for {(int)daysRemaining} more days");
    }
}
|
||||
@@ -0,0 +1,74 @@
|
||||
using OTSSignsOrchestrator.Server.Clients;
|
||||
using OTSSignsOrchestrator.Server.Data.Entities;
|
||||
|
||||
namespace OTSSignsOrchestrator.Server.Health.Checks;
|
||||
|
||||
/// <summary>
/// Compares the number of authorised displays in the instance's Xibo CMS against the
/// customer's licensed <see cref="Customer.ScreenCount"/>. All pages are fetched through
/// <see cref="XiboApiClientExtensions.GetAllPagesAsync{T}"/> with the <c>authorised=1</c> filter.
/// </summary>
public sealed class DisplayAuthorisedHealthCheck : IHealthCheck
{
    private readonly XiboClientFactory _clientFactory;
    private readonly IServiceProvider _services;
    private readonly ILogger<DisplayAuthorisedHealthCheck> _logger;

    public string CheckName => "DisplayAuthorised";
    public bool AutoRemediate => false;

    public DisplayAuthorisedHealthCheck(
        XiboClientFactory clientFactory,
        IServiceProvider services,
        ILogger<DisplayAuthorisedHealthCheck> logger)
        => (_clientFactory, _services, _logger) = (clientFactory, services, logger);

    /// <summary>
    /// Healthy when the authorised display count is within the licence, Degraded when it
    /// exceeds it, Critical when no API client can be built or the display query throws.
    /// </summary>
    /// <param name="instance">Instance whose displays and licence are compared.</param>
    /// <param name="ct">Cancellation token (not consumed by the calls made here).</param>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        var resolved = await ResolveAsync(instance);
        if (resolved.Client is not { } xibo)
            return new HealthCheckResult(HealthStatus.Critical, "No OAuth app — cannot check displays");

        try
        {
            var authorisedDisplays = await xibo.GetAllPagesAsync(
                (offset, pageSize) => xibo.GetDisplaysAsync(offset, pageSize, authorised: 1));

            var authorisedCount = authorisedDisplays.Count;
            var licensed = instance.Customer.ScreenCount;
            var withinLicence = authorisedCount <= licensed;

            return withinLicence
                ? new HealthCheckResult(HealthStatus.Healthy,
                    $"Authorised displays: {authorisedCount}/{licensed}")
                : new HealthCheckResult(HealthStatus.Degraded,
                    $"Authorised displays ({authorisedCount}) exceeds license ({licensed})",
                    $"Over-provisioned by {authorisedCount - licensed} display(s)");
        }
        catch (Exception ex)
        {
            return new HealthCheckResult(HealthStatus.Critical,
                $"Failed to check displays: {ex.Message}");
        }
    }

    /// <summary>
    /// Creates a Xibo API client from the instance's first OAuth app registration and its
    /// stored secret; the client is <c>null</c> when either one is missing.
    /// </summary>
    private async Task<(IXiboApiClient? Client, string Abbrev)> ResolveAsync(Instance instance)
    {
        var customerAbbrev = instance.Customer.Abbreviation;

        var registration = instance.OauthAppRegistries.FirstOrDefault();
        if (registration is null)
            return (null, customerAbbrev);

        var settingsService = _services.GetRequiredService<Core.Services.SettingsService>();
        var secretKey = Core.Services.SettingsService.InstanceOAuthSecretId(customerAbbrev);
        var clientSecret = await settingsService.GetAsync(secretKey);
        if (string.IsNullOrEmpty(clientSecret))
            return (null, customerAbbrev);

        var apiClient = await _clientFactory.CreateAsync(instance.XiboUrl, registration.ClientId, clientSecret);
        return (apiClient, customerAbbrev);
    }
}
|
||||
@@ -0,0 +1,124 @@
|
||||
using OTSSignsOrchestrator.Server.Clients;
|
||||
using OTSSignsOrchestrator.Server.Data;
|
||||
using OTSSignsOrchestrator.Server.Data.Entities;
|
||||
|
||||
namespace OTSSignsOrchestrator.Server.Health.Checks;
|
||||
|
||||
/// <summary>
/// Verifies all 4 expected Xibo groups exist for the instance:
/// <c>{abbrev}-viewer</c>, <c>{abbrev}-editor</c>, <c>{abbrev}-admin</c>, <c>ots-it-{abbrev}</c>.
/// Uses <see cref="XiboApiClientExtensions.GetAllPagesAsync{T}"/> to avoid pagination truncation.
/// </summary>
public sealed class GroupStructureHealthCheck : IHealthCheck
{
    private readonly XiboClientFactory _clientFactory;
    private readonly IServiceProvider _services;
    private readonly ILogger<GroupStructureHealthCheck> _logger;

    public string CheckName => "GroupStructure";
    public bool AutoRemediate => true;

    public GroupStructureHealthCheck(
        XiboClientFactory clientFactory,
        IServiceProvider services,
        ILogger<GroupStructureHealthCheck> logger)
    {
        _clientFactory = clientFactory;
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// Reports Healthy when every expected group name is present (case-insensitive),
    /// otherwise Critical with the list of missing groups.
    /// </summary>
    /// <param name="instance">Instance whose Xibo groups are inspected.</param>
    /// <param name="ct">Cancellation token (not consumed by the calls made here).</param>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        var (client, abbrev) = await ResolveAsync(instance);
        if (client is null)
            return new HealthCheckResult(HealthStatus.Critical, "No OAuth app — cannot verify groups");

        var expected = ExpectedGroups(abbrev);
        var existing = await GetExistingGroupNamesAsync(client);

        var missing = expected.Where(e => !existing.Contains(e)).ToList();

        if (missing.Count == 0)
            return new HealthCheckResult(HealthStatus.Healthy, "All 4 expected groups present");

        return new HealthCheckResult(
            HealthStatus.Critical,
            $"Missing groups: {string.Join(", ", missing)}",
            $"Expected: {string.Join(", ", expected)}");
    }

    /// <summary>
    /// Creates every missing expected group. Each successful creation is audited and saved
    /// immediately, so an exception on a later group cannot discard earlier audit rows.
    /// </summary>
    /// <param name="instance">Instance to remediate.</param>
    /// <param name="ct">Cancellation token passed to the audit-log save.</param>
    /// <returns><c>true</c> when every missing group was created; <c>false</c> otherwise.</returns>
    public async Task<bool> RemediateAsync(Instance instance, CancellationToken ct)
    {
        var (client, abbrev) = await ResolveAsync(instance);
        if (client is null) return false;

        await using var scope = _services.CreateAsyncScope();
        var db = scope.ServiceProvider.GetRequiredService<OrchestratorDbContext>();

        var expected = ExpectedGroups(abbrev);
        var existing = await GetExistingGroupNamesAsync(client);

        var allFixed = true;
        foreach (var name in expected.Where(e => !existing.Contains(e)))
        {
            var resp = await client.CreateGroupAsync(
                new CreateGroupRequest(name, $"Auto-created by health check for {abbrev}"));

            if (!resp.IsSuccessStatusCode)
            {
                _logger.LogError("Failed to create group {Group}: {Err}", name, resp.Error?.Content);
                allFixed = false;
                continue;
            }

            db.AuditLogs.Add(new AuditLog
            {
                Id = Guid.NewGuid(),
                InstanceId = instance.Id,
                Actor = "HealthCheckEngine:GroupStructure",
                Action = "CreateGroup",
                Target = name,
                Outcome = "Success",
                Detail = $"Recreated missing group {name}",
                OccurredAt = DateTime.UtcNow,
            });

            // Save per group: the remote change has already happened, so its audit row
            // should not depend on the remaining iterations succeeding.
            await db.SaveChangesAsync(ct);
        }

        return allFixed;
    }

    /// <summary>The four group names that must exist for the given customer abbreviation.</summary>
    private static string[] ExpectedGroups(string abbrev) =>
    [
        $"{abbrev}-viewer",
        $"{abbrev}-editor",
        $"{abbrev}-admin",
        $"ots-it-{abbrev}",
    ];

    /// <summary>
    /// Fetches every group page and returns the values of the <c>group</c> field as a
    /// case-insensitive set. Records without that field are ignored.
    /// </summary>
    private static async Task<HashSet<string?>> GetExistingGroupNamesAsync(IXiboApiClient client)
    {
        var groups = await client.GetAllPagesAsync(
            (start, length) => client.GetGroupsAsync(start, length));

        return groups
            .Select(g => g.TryGetValue("group", out var n) ? n?.ToString() : null)
            .Where(n => n is not null)
            .ToHashSet(StringComparer.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Builds a Xibo API client for the instance from its first registered OAuth app and the
    /// stored client secret. Returns a <c>null</c> client when either is unavailable.
    /// </summary>
    private async Task<(IXiboApiClient? Client, string Abbrev)> ResolveAsync(Instance instance)
    {
        var abbrev = instance.Customer.Abbreviation;
        var oauthApp = instance.OauthAppRegistries.FirstOrDefault();
        if (oauthApp is null) return (null, abbrev);

        var settings = _services.GetRequiredService<Core.Services.SettingsService>();
        var secret = await settings.GetAsync(Core.Services.SettingsService.InstanceOAuthSecretId(abbrev));
        if (string.IsNullOrEmpty(secret)) return (null, abbrev);

        var client = await _clientFactory.CreateAsync(instance.XiboUrl, oauthApp.ClientId, secret);
        return (client, abbrev);
    }
}
|
||||
@@ -0,0 +1,63 @@
|
||||
using OTSSignsOrchestrator.Server.Clients;
|
||||
using OTSSignsOrchestrator.Server.Data.Entities;
|
||||
|
||||
namespace OTSSignsOrchestrator.Server.Health.Checks;
|
||||
|
||||
/// <summary>
/// Looks for evidence of the <c>invite-{abbrev}</c> invitation flow in Authentik.
/// The lookup is a name search over Authentik groups (<c>ListGroupsAsync</c>), so a miss
/// is reported as Degraded rather than Critical: the invitation may exist as a separate
/// stage object that this search cannot see.
/// </summary>
public sealed class InvitationFlowHealthCheck : IHealthCheck
{
    private readonly IAuthentikClient _authentikClient;
    private readonly ILogger<InvitationFlowHealthCheck> _logger;

    public string CheckName => "InvitationFlow";
    public bool AutoRemediate => false;

    public InvitationFlowHealthCheck(
        IAuthentikClient authentikClient,
        ILogger<InvitationFlowHealthCheck> logger)
    {
        _authentikClient = authentikClient;
        _logger = logger;
    }

    /// <summary>
    /// Healthy when a group named <c>invite-{abbrev}</c> is found (case-insensitive);
    /// Degraded when the search succeeds but finds no such group; Critical when the
    /// Authentik call fails, either with a non-success HTTP status or an exception.
    /// </summary>
    /// <param name="instance">Instance whose customer abbreviation forms the expected name.</param>
    /// <param name="ct">Cancellation token (not consumed by the calls made here).</param>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        var abbrev = instance.Customer.Abbreviation;
        var expectedName = $"invite-{abbrev}";

        try
        {
            // Search Authentik groups for evidence of the invitation flow.
            // The invitation is created as a stage invitation; we verify via the
            // Authentik API by searching for it by name.
            var groupResponse = await _authentikClient.ListGroupsAsync(expectedName);

            // An HTTP error means the check could not be performed at all; reporting it as
            // "not found" would hide an Authentik/API problem behind a softer status.
            if (!groupResponse.IsSuccessStatusCode)
            {
                return new HealthCheckResult(HealthStatus.Critical,
                    $"Failed to check invitation flow: HTTP {groupResponse.StatusCode}",
                    groupResponse.Error?.Content);
            }

            var found = groupResponse.Content?.Results is { Count: > 0 } results &&
                results.Any(g =>
                    g.TryGetValue("name", out var n) &&
                    string.Equals(n?.ToString(), expectedName, StringComparison.OrdinalIgnoreCase));

            if (found)
            {
                return new HealthCheckResult(HealthStatus.Healthy,
                    $"Invitation flow '{expectedName}' exists in Authentik");
            }

            // If groups don't show it, it's still possible the invitation was created
            // as a separate stage object. Log as degraded since we can't fully confirm.
            return new HealthCheckResult(HealthStatus.Degraded,
                $"Invitation flow '{expectedName}' not found in Authentik",
                "The invitation may exist but could not be verified via group search");
        }
        catch (Exception ex)
        {
            return new HealthCheckResult(HealthStatus.Critical,
                $"Failed to check invitation flow: {ex.Message}");
        }
    }
}
|
||||
@@ -0,0 +1,106 @@
|
||||
using Renci.SshNet;
|
||||
using OTSSignsOrchestrator.Server.Data.Entities;
|
||||
|
||||
namespace OTSSignsOrchestrator.Server.Health.Checks;
|
||||
|
||||
/// <summary>
|
||||
/// Verifies connectivity to the instance's MySQL database by running a simple query
|
||||
/// via SSH against the Docker Swarm host.
|
||||
/// </summary>
|
||||
public sealed class MySqlConnectHealthCheck : IHealthCheck
|
||||
{
|
||||
private readonly IServiceProvider _services;
|
||||
private readonly ILogger<MySqlConnectHealthCheck> _logger;
|
||||
|
||||
public string CheckName => "MySqlConnect";
|
||||
public bool AutoRemediate => false;
|
||||
|
||||
public MySqlConnectHealthCheck(
|
||||
IServiceProvider services,
|
||||
ILogger<MySqlConnectHealthCheck> logger)
|
||||
{
|
||||
_services = services;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
|
||||
{
|
||||
var dbName = instance.MysqlDatabase;
|
||||
if (string.IsNullOrEmpty(dbName))
|
||||
return new HealthCheckResult(HealthStatus.Critical, "No MySQL database configured");
|
||||
|
||||
try
|
||||
{
|
||||
var settings = _services.GetRequiredService<Core.Services.SettingsService>();
|
||||
var sshInfo = await GetSwarmSshHostAsync(settings);
|
||||
var mysqlHost = await settings.GetAsync(Core.Services.SettingsService.MySqlHost, "localhost");
|
||||
var mysqlPort = await settings.GetAsync(Core.Services.SettingsService.MySqlPort, "3306");
|
||||
var mysqlUser = await settings.GetAsync(Core.Services.SettingsService.MySqlAdminUser, "root");
|
||||
var mysqlPass = await settings.GetAsync(Core.Services.SettingsService.MySqlAdminPassword, "");
|
||||
|
||||
using var sshClient = CreateSshClient(sshInfo);
|
||||
sshClient.Connect();
|
||||
try
|
||||
{
|
||||
// Simple connectivity test — SELECT 1 against the instance database
|
||||
var cmd = $"mysql -h {mysqlHost} -P {mysqlPort} -u {mysqlUser} " +
|
||||
$"-p'{mysqlPass}' -e 'SELECT 1' {dbName} 2>&1";
|
||||
var output = RunSshCommand(sshClient, cmd);
|
||||
|
||||
return new HealthCheckResult(HealthStatus.Healthy,
|
||||
$"MySQL connection to {dbName} successful");
|
||||
}
|
||||
finally
|
||||
{
|
||||
sshClient.Disconnect();
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
return new HealthCheckResult(HealthStatus.Critical,
|
||||
$"MySQL connection failed for {dbName}: {ex.Message}");
|
||||
}
|
||||
}
|
||||
|
||||
private static async Task<SshConnectionInfo> GetSwarmSshHostAsync(Core.Services.SettingsService settings)
|
||||
{
|
||||
var host = await settings.GetAsync("Ssh.SwarmHost")
|
||||
?? throw new InvalidOperationException("SSH Swarm host not configured.");
|
||||
var portStr = await settings.GetAsync("Ssh.SwarmPort", "22");
|
||||
var user = await settings.GetAsync("Ssh.SwarmUser", "root");
|
||||
var keyPath = await settings.GetAsync("Ssh.SwarmKeyPath");
|
||||
var password = await settings.GetAsync("Ssh.SwarmPassword");
|
||||
if (!int.TryParse(portStr, out var port)) port = 22;
|
||||
return new SshConnectionInfo(host, port, user, keyPath, password);
|
||||
}
|
||||
|
||||
/// <summary>
/// Builds an (unconnected) <see cref="SshClient"/> for the given connection details.
/// Authentication methods are added in this order: private key (when a key path is set),
/// then password (when one is set). When neither is configured, <c>~/.ssh/id_rsa</c> of the
/// current user profile is used if that file exists.
/// </summary>
/// <param name="info">Host, port, user name and optional key path / password.</param>
/// <returns>A client configured with the collected authentication methods.</returns>
/// <exception cref="InvalidOperationException">
/// No key path or password is configured and the default key file does not exist.
/// </exception>
private static SshClient CreateSshClient(SshConnectionInfo info)
{
    var methods = new List<AuthenticationMethod>();

    if (info.KeyPath is { Length: > 0 } configuredKey)
    {
        methods.Add(new PrivateKeyAuthenticationMethod(info.Username, new PrivateKeyFile(configuredKey)));
    }

    if (info.Password is { Length: > 0 } configuredPassword)
    {
        methods.Add(new PasswordAuthenticationMethod(info.Username, configuredPassword));
    }

    if (methods.Count == 0)
    {
        // Nothing configured explicitly: fall back to the user's default RSA key.
        var profileDir = Environment.GetFolderPath(Environment.SpecialFolder.UserProfile);
        var fallbackKey = Path.Combine(profileDir, ".ssh", "id_rsa");

        if (!File.Exists(fallbackKey))
        {
            throw new InvalidOperationException($"No SSH auth method for {info.Host}:{info.Port}.");
        }

        methods.Add(new PrivateKeyAuthenticationMethod(info.Username, new PrivateKeyFile(fallbackKey)));
    }

    return new SshClient(
        new Renci.SshNet.ConnectionInfo(info.Host, info.Port, info.Username, methods.ToArray()));
}
|
||||
|
||||
/// <summary>
/// Runs <paramref name="command"/> on the connected SSH client and returns its output.
/// </summary>
/// <param name="client">Client the command is executed on.</param>
/// <param name="command">Shell command line to execute.</param>
/// <returns>The command's <c>Result</c> text.</returns>
/// <exception cref="InvalidOperationException">The command's exit status is not 0.</exception>
private static string RunSshCommand(SshClient client, string command)
{
    using var execution = client.RunCommand(command);

    return execution.ExitStatus == 0
        ? execution.Result
        : throw new InvalidOperationException(
            $"SSH command failed (exit {execution.ExitStatus}): {execution.Error}");
}
|
||||
|
||||
/// <summary>SSH connection details used to build a client.</summary>
/// <param name="Host">Host name or address to connect to.</param>
/// <param name="Port">TCP port of the SSH server.</param>
/// <param name="Username">Login user name.</param>
/// <param name="KeyPath">Optional path to a private key file.</param>
/// <param name="Password">Optional login password.</param>
internal sealed record SshConnectionInfo(
    string Host,
    int Port,
    string Username,
    string? KeyPath,
    string? Password);
|
||||
}
|
||||
@@ -0,0 +1,121 @@
|
||||
using Renci.SshNet;
|
||||
using OTSSignsOrchestrator.Server.Data.Entities;
|
||||
|
||||
namespace OTSSignsOrchestrator.Server.Health.Checks;
|
||||
|
||||
/// <summary>
/// Verifies the NFS path of an instance is accessible: connects to the Swarm host over SSH,
/// temporarily mounts the configured NFS export and runs <c>ls</c> on the instance's path.
/// </summary>
public sealed class NfsAccessHealthCheck : IHealthCheck
{
    private readonly IServiceProvider _services;
    private readonly ILogger<NfsAccessHealthCheck> _logger;

    public string CheckName => "NfsAccess";
    public bool AutoRemediate => false;

    public NfsAccessHealthCheck(
        IServiceProvider services,
        ILogger<NfsAccessHealthCheck> logger)
    {
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// Mounts the NFS export on a throw-away mount point on the Swarm host and lists the
    /// instance's path beneath it.
    /// </summary>
    /// <param name="instance">Instance whose <c>NfsPath</c> is checked.</param>
    /// <param name="ct">Cancellation token (not observed by the SSH calls in this method).</param>
    /// <returns>
    /// Healthy when the listing succeeds; Critical when the path or NFS settings are missing,
    /// or when any SSH step fails (the exception message is included in the result).
    /// </returns>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        var nfsPath = instance.NfsPath;
        if (string.IsNullOrEmpty(nfsPath))
            return new HealthCheckResult(HealthStatus.Critical, "No NFS path configured");

        try
        {
            var settings = _services.GetRequiredService<Core.Services.SettingsService>();
            var sshInfo = await GetSwarmSshHostAsync(settings);
            var nfsServer = await settings.GetAsync(Core.Services.SettingsService.NfsServer);
            var nfsExport = await settings.GetAsync(Core.Services.SettingsService.NfsExport);

            if (string.IsNullOrEmpty(nfsServer) || string.IsNullOrEmpty(nfsExport))
                return new HealthCheckResult(HealthStatus.Critical, "NFS server/export not configured");

            using var sshClient = CreateSshClient(sshInfo);
            sshClient.Connect();
            try
            {
                // The mount point is built from a fresh GUID ("N" format: hex digits only),
                // so it needs no quoting. Everything that comes from settings or the
                // instance row is single-quoted before it reaches the remote shell.
                var mountPoint = $"/tmp/healthcheck-nfs-{Guid.NewGuid():N}";
                var exportSpec = ShellQuote($"{nfsServer}:{nfsExport}");
                var targetPath = ShellQuote($"{mountPoint}/{nfsPath}");

                RunSshCommand(sshClient, $"sudo mkdir -p {mountPoint}");
                try
                {
                    RunSshCommand(sshClient, $"sudo mount -t nfs4 {exportSpec} {mountPoint}");
                    RunSshCommand(sshClient, $"ls {targetPath} 2>&1");

                    return new HealthCheckResult(HealthStatus.Healthy,
                        $"NFS path accessible: {nfsPath}");
                }
                finally
                {
                    // Best-effort cleanup: failures here must not mask the check result.
                    RunSshCommandAllowFailure(sshClient, $"sudo umount {mountPoint}");
                    RunSshCommandAllowFailure(sshClient, $"sudo rmdir {mountPoint}");
                }
            }
            finally
            {
                sshClient.Disconnect();
            }
        }
        catch (Exception ex)
        {
            return new HealthCheckResult(HealthStatus.Critical,
                $"NFS access check failed for {nfsPath}: {ex.Message}");
        }
    }

    /// <summary>
    /// Wraps <paramref name="value"/> in single quotes for a POSIX shell, escaping embedded
    /// single quotes, so the shell treats it as one literal word.
    /// </summary>
    private static string ShellQuote(string value) =>
        "'" + value.Replace("'", "'\\''") + "'";

    /// <summary>
    /// Reads the "Ssh.Swarm*" settings and bundles them into a <see cref="SshConnectionInfo"/>.
    /// The port falls back to 22 when the stored value is not an integer.
    /// </summary>
    /// <exception cref="InvalidOperationException">"Ssh.SwarmHost" resolves to <c>null</c>.</exception>
    private static async Task<SshConnectionInfo> GetSwarmSshHostAsync(Core.Services.SettingsService settings)
    {
        var host = await settings.GetAsync("Ssh.SwarmHost")
            ?? throw new InvalidOperationException("SSH Swarm host not configured.");
        var portStr = await settings.GetAsync("Ssh.SwarmPort", "22");
        var user = await settings.GetAsync("Ssh.SwarmUser", "root");
        var keyPath = await settings.GetAsync("Ssh.SwarmKeyPath");
        var password = await settings.GetAsync("Ssh.SwarmPassword");
        if (!int.TryParse(portStr, out var port)) port = 22;
        return new SshConnectionInfo(host, port, user, keyPath, password);
    }

    /// <summary>
    /// Builds an unconnected <see cref="SshClient"/>. Uses the configured key and/or password;
    /// when neither is set, falls back to <c>~/.ssh/id_rsa</c> if that file exists.
    /// </summary>
    /// <exception cref="InvalidOperationException">No authentication method is available.</exception>
    private static SshClient CreateSshClient(SshConnectionInfo info)
    {
        var authMethods = new List<AuthenticationMethod>();
        if (!string.IsNullOrEmpty(info.KeyPath))
            authMethods.Add(new PrivateKeyAuthenticationMethod(info.Username, new PrivateKeyFile(info.KeyPath)));
        if (!string.IsNullOrEmpty(info.Password))
            authMethods.Add(new PasswordAuthenticationMethod(info.Username, info.Password));
        if (authMethods.Count == 0)
        {
            var defaultKeyPath = Path.Combine(
                Environment.GetFolderPath(Environment.SpecialFolder.UserProfile), ".ssh", "id_rsa");
            if (File.Exists(defaultKeyPath))
                authMethods.Add(new PrivateKeyAuthenticationMethod(info.Username, new PrivateKeyFile(defaultKeyPath)));
            else
                throw new InvalidOperationException($"No SSH auth method for {info.Host}:{info.Port}.");
        }
        var connInfo = new Renci.SshNet.ConnectionInfo(info.Host, info.Port, info.Username, authMethods.ToArray());
        return new SshClient(connInfo);
    }

    /// <summary>Runs a command and returns its output; throws when the exit status is not 0.</summary>
    /// <exception cref="InvalidOperationException">The command exited with a non-zero status.</exception>
    private static string RunSshCommand(SshClient client, string command)
    {
        using var cmd = client.RunCommand(command);
        if (cmd.ExitStatus != 0)
            throw new InvalidOperationException($"SSH command failed (exit {cmd.ExitStatus}): {cmd.Error}");
        return cmd.Result;
    }

    /// <summary>Runs a command and ignores its exit status; used for cleanup steps.</summary>
    private static void RunSshCommandAllowFailure(SshClient client, string command)
    {
        using var cmd = client.RunCommand(command);
        // Intentionally ignore exit code — cleanup operations
    }

    /// <summary>SSH connection details used to build a client.</summary>
    internal sealed record SshConnectionInfo(string Host, int Port, string Username, string? KeyPath, string? Password);
}
|
||||
@@ -0,0 +1,50 @@
|
||||
using OTSSignsOrchestrator.Server.Data.Entities;
|
||||
|
||||
namespace OTSSignsOrchestrator.Server.Health.Checks;
|
||||
|
||||
/// <summary>
/// Checks the age of the instance's newest OAuth2 application registration, measured from
/// <see cref="OauthAppRegistry.CreatedAt"/>. Reports <see cref="HealthStatus.Degraded"/> at
/// <see cref="WarningThresholdDays"/> days and <see cref="HealthStatus.Critical"/> at
/// <see cref="CriticalThresholdDays"/> days. AutoRemediate=false — the result suggests
/// creating a "rotate-credentials" job instead.
/// </summary>
public sealed class OauthAppAgeHealthCheck : IHealthCheck
{
    /// <summary>Days at which severity escalates to Degraded (warning level).</summary>
    internal const int WarningThresholdDays = 180;

    /// <summary>Days at which severity escalates to Critical.</summary>
    internal const int CriticalThresholdDays = 365;

    public string CheckName => "OauthAppAge";
    public bool AutoRemediate => false;

    /// <summary>
    /// Evaluates the age of the most recently created OAuth app registration of
    /// <paramref name="instance"/> against the two thresholds.
    /// </summary>
    /// <param name="instance">Instance whose <c>OauthAppRegistries</c> are inspected.</param>
    /// <param name="ct">Unused; the check does no I/O.</param>
    /// <returns>
    /// Degraded when no registration exists or the warning threshold is reached, Critical
    /// when the critical threshold is reached, Healthy otherwise.
    /// </returns>
    public Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        var oauthApp = instance.OauthAppRegistries
            .OrderByDescending(o => o.CreatedAt)
            .FirstOrDefault();

        if (oauthApp is null)
            return Task.FromResult(new HealthCheckResult(HealthStatus.Degraded,
                "No OAuth app registered"));

        var ageDays = (DateTime.UtcNow - oauthApp.CreatedAt).TotalDays;

        if (ageDays >= CriticalThresholdDays)
        {
            return Task.FromResult(new HealthCheckResult(HealthStatus.Critical,
                $"OAuth2 credentials are {(int)ageDays} days old (critical threshold: {CriticalThresholdDays}d)",
                "Create a 'rotate-credentials' job to rotate the OAuth2 application"));
        }

        if (ageDays >= WarningThresholdDays)
        {
            // The hint is derived from the constant so it cannot drift from the real threshold.
            return Task.FromResult(new HealthCheckResult(HealthStatus.Degraded,
                $"OAuth2 credentials are {(int)ageDays} days old (warning threshold: {WarningThresholdDays}d)",
                $"Schedule credential rotation before they reach {CriticalThresholdDays} days"));
        }

        return Task.FromResult(new HealthCheckResult(HealthStatus.Healthy,
            $"OAuth2 credentials are {(int)ageDays} days old"));
    }
}
|
||||
@@ -0,0 +1,59 @@
|
||||
using OTSSignsOrchestrator.Server.Clients;
|
||||
using OTSSignsOrchestrator.Server.Data.Entities;
|
||||
|
||||
namespace OTSSignsOrchestrator.Server.Health.Checks;
|
||||
|
||||
/// <summary>
/// Verifies the OAuth2 app in <see cref="OauthAppRegistry"/> can still authenticate
/// by creating a Xibo client for the instance with the stored client id and secret.
/// AutoRemediate=false — credential rotation requires a separate job.
/// </summary>
public sealed class OauthAppHealthCheck : IHealthCheck
{
    private readonly XiboClientFactory _clientFactory;
    private readonly IServiceProvider _services;
    private readonly ILogger<OauthAppHealthCheck> _logger;

    public string CheckName => "OauthApp";
    public bool AutoRemediate => false;

    public OauthAppHealthCheck(
        XiboClientFactory clientFactory,
        IServiceProvider services,
        ILogger<OauthAppHealthCheck> logger)
    {
        _clientFactory = clientFactory;
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// Attempts to create a Xibo client with the instance's newest OAuth app registration.
    /// </summary>
    /// <param name="instance">Instance whose OAuth app and Xibo URL are used.</param>
    /// <param name="ct">Cancellation token (not forwarded by this method).</param>
    /// <returns>
    /// Healthy when the client is created without an exception; Critical when no app is
    /// registered, the secret is missing, or client creation throws.
    /// </returns>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        // Use the most recently created registration, the same selection
        // OauthAppAgeHealthCheck applies, so both checks look at the same app.
        var oauthApp = instance.OauthAppRegistries
            .OrderByDescending(o => o.CreatedAt)
            .FirstOrDefault();
        if (oauthApp is null)
            return new HealthCheckResult(HealthStatus.Critical, "No OAuth app registered");

        var abbrev = instance.Customer.Abbreviation;
        var settings = _services.GetRequiredService<Core.Services.SettingsService>();
        var secret = await settings.GetAsync(Core.Services.SettingsService.InstanceOAuthSecretId(abbrev));

        if (string.IsNullOrEmpty(secret))
            return new HealthCheckResult(HealthStatus.Critical,
                "OAuth client secret not found in Bitwarden — cannot authenticate");

        try
        {
            // Attempt to create a client (which fetches a token via client_credentials).
            // Only success/failure matters, so the client itself is discarded.
            _ = await _clientFactory.CreateAsync(instance.XiboUrl, oauthApp.ClientId, secret);

            return new HealthCheckResult(HealthStatus.Healthy, "OAuth2 client_credentials flow successful");
        }
        catch (Exception ex)
        {
            return new HealthCheckResult(HealthStatus.Critical,
                $"OAuth2 authentication failed: {ex.Message}",
                "Credential rotation job may be required");
        }
    }
}
|
||||
127
OTSSignsOrchestrator.Server/Health/Checks/StackHealthCheck.cs
Normal file
127
OTSSignsOrchestrator.Server/Health/Checks/StackHealthCheck.cs
Normal file
@@ -0,0 +1,127 @@
|
||||
using Renci.SshNet;
|
||||
using OTSSignsOrchestrator.Server.Data.Entities;
|
||||
|
||||
namespace OTSSignsOrchestrator.Server.Health.Checks;
|
||||
|
||||
/// <summary>
/// Verifies the Docker stack is healthy by running <c>docker stack ps {stackName}</c>
/// via SSH and checking that every task whose desired state is Running is currently Running.
/// </summary>
public sealed class StackHealthCheck : IHealthCheck
{
    private readonly IServiceProvider _services;
    private readonly ILogger<StackHealthCheck> _logger;

    public string CheckName => "StackHealth";
    public bool AutoRemediate => false;

    public StackHealthCheck(
        IServiceProvider services,
        ILogger<StackHealthCheck> logger)
    {
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// Lists the stack's tasks on the Swarm host and reports those that should be running
    /// but are not.
    /// </summary>
    /// <param name="instance">Instance whose <c>DockerStackName</c> is inspected.</param>
    /// <param name="ct">Cancellation token (not observed by the SSH calls in this method).</param>
    /// <returns>
    /// Healthy when no offending task is found; Critical when the stack name is missing,
    /// when at least one task is not running (details list one "name: state" line per task),
    /// or when the SSH step fails.
    /// </returns>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        var stackName = instance.DockerStackName;
        if (string.IsNullOrEmpty(stackName))
            return new HealthCheckResult(HealthStatus.Critical, "No Docker stack name configured");

        try
        {
            var settings = _services.GetRequiredService<Core.Services.SettingsService>();
            var sshInfo = await GetSwarmSshHostAsync(settings);

            using var sshClient = CreateSshClient(sshInfo);
            sshClient.Connect();

            try
            {
                // Get task status for all services in the stack. The stack name comes from
                // the instance row, so it is single-quoted before reaching the remote shell.
                var output = RunSshCommand(sshClient,
                    $"docker stack ps {ShellQuote(stackName)} --format '{{{{.Name}}}}|{{{{.CurrentState}}}}|{{{{.DesiredState}}}}'");

                var lines = output.Split('\n', StringSplitOptions.RemoveEmptyEntries);
                var notRunning = new List<string>();

                foreach (var line in lines)
                {
                    var parts = line.Split('|');
                    if (parts.Length < 3) continue;

                    var name = parts[0].Trim();
                    var currentState = parts[1].Trim();
                    var desiredState = parts[2].Trim();

                    // Only check tasks whose desired state is Running. The current state is
                    // matched by prefix, so any text after "Running" is accepted.
                    if (desiredState.Equals("Running", StringComparison.OrdinalIgnoreCase) &&
                        !currentState.StartsWith("Running", StringComparison.OrdinalIgnoreCase))
                    {
                        notRunning.Add($"{name}: {currentState}");
                    }
                }

                if (notRunning.Count == 0)
                    return new HealthCheckResult(HealthStatus.Healthy,
                        $"All services in {stackName} are Running");

                return new HealthCheckResult(HealthStatus.Critical,
                    $"{notRunning.Count} service(s) not running in {stackName}",
                    string.Join("\n", notRunning));
            }
            finally
            {
                sshClient.Disconnect();
            }
        }
        catch (Exception ex)
        {
            return new HealthCheckResult(HealthStatus.Critical,
                $"SSH check failed for {stackName}: {ex.Message}");
        }
    }

    /// <summary>
    /// Wraps <paramref name="value"/> in single quotes for a POSIX shell, escaping embedded
    /// single quotes, so the shell treats it as one literal word.
    /// </summary>
    private static string ShellQuote(string value) =>
        "'" + value.Replace("'", "'\\''") + "'";

    /// <summary>
    /// Reads the "Ssh.Swarm*" settings and bundles them into a <see cref="SshConnectionInfo"/>.
    /// The port falls back to 22 when the stored value is not an integer.
    /// </summary>
    /// <exception cref="InvalidOperationException">"Ssh.SwarmHost" resolves to <c>null</c>.</exception>
    private static async Task<SshConnectionInfo> GetSwarmSshHostAsync(Core.Services.SettingsService settings)
    {
        var host = await settings.GetAsync("Ssh.SwarmHost")
            ?? throw new InvalidOperationException("SSH Swarm host not configured.");
        var portStr = await settings.GetAsync("Ssh.SwarmPort", "22");
        var user = await settings.GetAsync("Ssh.SwarmUser", "root");
        var keyPath = await settings.GetAsync("Ssh.SwarmKeyPath");
        var password = await settings.GetAsync("Ssh.SwarmPassword");
        if (!int.TryParse(portStr, out var port)) port = 22;
        return new SshConnectionInfo(host, port, user, keyPath, password);
    }

    /// <summary>
    /// Builds an unconnected <see cref="SshClient"/>. Uses the configured key and/or password;
    /// when neither is set, falls back to <c>~/.ssh/id_rsa</c> if that file exists.
    /// </summary>
    /// <exception cref="InvalidOperationException">No authentication method is available.</exception>
    private static SshClient CreateSshClient(SshConnectionInfo info)
    {
        var authMethods = new List<AuthenticationMethod>();
        if (!string.IsNullOrEmpty(info.KeyPath))
            authMethods.Add(new PrivateKeyAuthenticationMethod(info.Username, new PrivateKeyFile(info.KeyPath)));
        if (!string.IsNullOrEmpty(info.Password))
            authMethods.Add(new PasswordAuthenticationMethod(info.Username, info.Password));
        if (authMethods.Count == 0)
        {
            var defaultKeyPath = Path.Combine(
                Environment.GetFolderPath(Environment.SpecialFolder.UserProfile), ".ssh", "id_rsa");
            if (File.Exists(defaultKeyPath))
                authMethods.Add(new PrivateKeyAuthenticationMethod(info.Username, new PrivateKeyFile(defaultKeyPath)));
            else
                throw new InvalidOperationException($"No SSH auth method for {info.Host}:{info.Port}.");
        }
        var connInfo = new Renci.SshNet.ConnectionInfo(info.Host, info.Port, info.Username, authMethods.ToArray());
        return new SshClient(connInfo);
    }

    /// <summary>Runs a command and returns its output; throws when the exit status is not 0.</summary>
    /// <exception cref="InvalidOperationException">The command exited with a non-zero status.</exception>
    private static string RunSshCommand(SshClient client, string command)
    {
        using var cmd = client.RunCommand(command);
        if (cmd.ExitStatus != 0)
            throw new InvalidOperationException($"SSH command failed (exit {cmd.ExitStatus}): {cmd.Error}");
        return cmd.Result;
    }

    /// <summary>SSH connection details used to build a client.</summary>
    internal sealed record SshConnectionInfo(string Host, int Port, string Username, string? KeyPath, string? Password);
}
|
||||
145
OTSSignsOrchestrator.Server/Health/Checks/ThemeHealthCheck.cs
Normal file
145
OTSSignsOrchestrator.Server/Health/Checks/ThemeHealthCheck.cs
Normal file
@@ -0,0 +1,145 @@
|
||||
using OTSSignsOrchestrator.Server.Clients;
|
||||
using OTSSignsOrchestrator.Server.Data;
|
||||
using OTSSignsOrchestrator.Server.Data.Entities;
|
||||
|
||||
namespace OTSSignsOrchestrator.Server.Health.Checks;
|
||||
|
||||
/// <summary>
/// Verifies the Xibo CMS theme is set to <c>otssigns</c> by reading the CMS settings and
/// comparing the <c>THEME_NAME</c> value. Auto-remediates by writing <c>THEME_NAME</c> back
/// through the settings update call and recording an audit log entry.
/// </summary>
public sealed class ThemeHealthCheck : IHealthCheck
{
    private readonly XiboClientFactory _clientFactory;
    private readonly IServiceProvider _services;
    private readonly ILogger<ThemeHealthCheck> _logger;

    private const string ExpectedTheme = "otssigns";

    public string CheckName => "Theme";
    public bool AutoRemediate => true;

    public ThemeHealthCheck(
        XiboClientFactory clientFactory,
        IServiceProvider services,
        ILogger<ThemeHealthCheck> logger)
    {
        _clientFactory = clientFactory;
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// Reads the CMS settings and compares <c>THEME_NAME</c> (case-insensitively) with the
    /// expected theme.
    /// </summary>
    /// <param name="instance">Instance whose CMS is queried.</param>
    /// <param name="ct">Cancellation token (not forwarded by this method).</param>
    /// <returns>
    /// Healthy when the theme matches; Critical when no client can be resolved, the request
    /// fails, the theme differs, or any step throws.
    /// </returns>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        try
        {
            // Client creation is inside the try block so a failure while creating the
            // client is reported as a Critical result instead of escaping the check.
            var (client, _) = await ResolveAsync(instance);
            if (client is null)
                return new HealthCheckResult(HealthStatus.Critical, "No OAuth app — cannot check theme");

            var settingsResp = await client.GetSettingsAsync();
            if (!settingsResp.IsSuccessStatusCode)
                return new HealthCheckResult(HealthStatus.Critical,
                    $"GET /settings returned {settingsResp.StatusCode}");

            var settings = settingsResp.Content;
            if (settings is null)
                return new HealthCheckResult(HealthStatus.Critical, "Settings response was null");

            // Xibo returns settings as a list of { setting, value } objects or a dictionary
            var themeName = ExtractSetting(settings, "THEME_NAME");
            if (string.Equals(themeName, ExpectedTheme, StringComparison.OrdinalIgnoreCase))
                return new HealthCheckResult(HealthStatus.Healthy, $"Theme is {ExpectedTheme}");

            return new HealthCheckResult(HealthStatus.Critical,
                $"Theme is '{themeName}', expected '{ExpectedTheme}'");
        }
        catch (Exception ex)
        {
            return new HealthCheckResult(HealthStatus.Critical,
                $"Theme check failed: {ex.Message}");
        }
    }

    /// <summary>
    /// Sets <c>THEME_NAME</c> back to the expected theme and, on success, writes an audit
    /// log row.
    /// </summary>
    /// <param name="instance">Instance whose CMS is updated.</param>
    /// <param name="ct">Cancellation token passed to the audit log save.</param>
    /// <returns>
    /// <c>true</c> when the update succeeded and the audit row was saved; <c>false</c> when
    /// no client can be resolved, the update is rejected, or any step throws.
    /// </returns>
    public async Task<bool> RemediateAsync(Instance instance, CancellationToken ct)
    {
        try
        {
            // Resolved inside the try block so client-creation failures are logged and
            // reported as a failed remediation rather than thrown to the caller.
            var (client, _) = await ResolveAsync(instance);
            if (client is null) return false;

            var resp = await client.UpdateSettingsAsync(
                new UpdateSettingsRequest(new Dictionary<string, string>
                {
                    ["THEME_NAME"] = ExpectedTheme,
                }));

            if (resp.IsSuccessStatusCode)
            {
                await using var scope = _services.CreateAsyncScope();
                var db = scope.ServiceProvider.GetRequiredService<OrchestratorDbContext>();
                db.AuditLogs.Add(new AuditLog
                {
                    Id = Guid.NewGuid(),
                    InstanceId = instance.Id,
                    Actor = "HealthCheckEngine:Theme",
                    Action = "FixTheme",
                    Target = instance.Customer.Abbreviation,
                    Outcome = "Success",
                    Detail = $"Reset THEME_NAME to {ExpectedTheme}",
                    OccurredAt = DateTime.UtcNow,
                });
                await db.SaveChangesAsync(ct);
                return true;
            }

            _logger.LogError("Failed to fix theme: {Err}", resp.Error?.Content);
            return false;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Theme remediation failed");
            return false;
        }
    }

    /// <summary>
    /// Looks up <paramref name="key"/> in a settings payload that is a JSON object
    /// (property lookup) or a JSON array of <c>{ setting, value }</c> items.
    /// </summary>
    /// <returns>The value as a string, or <c>null</c> when the payload has another shape or the key is absent.</returns>
    private static string? ExtractSetting(object settingsObj, string key)
    {
        // Settings may come back as a dictionary or a list of objects
        if (settingsObj is System.Text.Json.JsonElement je)
        {
            if (je.ValueKind == System.Text.Json.JsonValueKind.Object &&
                je.TryGetProperty(key, out var val))
                return val.GetString();

            if (je.ValueKind == System.Text.Json.JsonValueKind.Array)
            {
                foreach (var item in je.EnumerateArray())
                {
                    if (item.TryGetProperty("setting", out var settingProp) &&
                        string.Equals(settingProp.GetString(), key, StringComparison.OrdinalIgnoreCase) &&
                        item.TryGetProperty("value", out var valueProp))
                    {
                        return valueProp.GetString();
                    }
                }
            }
        }

        return null;
    }

    /// <summary>
    /// Creates a Xibo client for the instance from its newest OAuth app registration and the
    /// stored client secret.
    /// </summary>
    /// <returns>
    /// The client and the customer abbreviation; the client is <c>null</c> when no app is
    /// registered or the secret is missing.
    /// </returns>
    private async Task<(IXiboApiClient? Client, string Abbrev)> ResolveAsync(Instance instance)
    {
        var abbrev = instance.Customer.Abbreviation;

        // Take the most recently created registration rather than an arbitrary one.
        var oauthApp = instance.OauthAppRegistries
            .OrderByDescending(o => o.CreatedAt)
            .FirstOrDefault();
        if (oauthApp is null) return (null, abbrev);

        var settings = _services.GetRequiredService<Core.Services.SettingsService>();
        var secret = await settings.GetAsync(Core.Services.SettingsService.InstanceOAuthSecretId(abbrev));
        if (string.IsNullOrEmpty(secret)) return (null, abbrev);

        var client = await _clientFactory.CreateAsync(instance.XiboUrl, oauthApp.ClientId, secret);
        return (client, abbrev);
    }
}
|
||||
@@ -0,0 +1,61 @@
|
||||
using OTSSignsOrchestrator.Server.Clients;
|
||||
using OTSSignsOrchestrator.Server.Data.Entities;
|
||||
|
||||
namespace OTSSignsOrchestrator.Server.Health.Checks;
|
||||
|
||||
/// <summary>
/// Verifies the Xibo CMS API is reachable by calling GET /about and expecting a success response.
/// </summary>
public sealed class XiboApiHealthCheck : IHealthCheck
{
    private readonly XiboClientFactory _clientFactory;
    private readonly IServiceProvider _services;
    private readonly ILogger<XiboApiHealthCheck> _logger;

    public string CheckName => "XiboApi";
    public bool AutoRemediate => false;

    public XiboApiHealthCheck(
        XiboClientFactory clientFactory,
        IServiceProvider services,
        ILogger<XiboApiHealthCheck> logger)
    {
        _clientFactory = clientFactory;
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// Creates a client for the instance and calls the about endpoint.
    /// </summary>
    /// <param name="instance">Instance whose CMS is probed.</param>
    /// <param name="ct">Cancellation token (not forwarded by this method).</param>
    /// <returns>
    /// Healthy on a success status code; Critical when no client can be resolved, the
    /// response is not successful, or any step throws.
    /// </returns>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        try
        {
            // Client creation is inside the try block: if it throws, the check reports
            // the API as unreachable instead of letting the exception escape.
            var client = await ResolveClientAsync(instance);
            if (client is null)
                return new HealthCheckResult(HealthStatus.Critical, "No OAuth app registered — cannot reach API");

            var response = await client.GetAboutAsync();
            return response.IsSuccessStatusCode
                ? new HealthCheckResult(HealthStatus.Healthy, "Xibo API reachable")
                : new HealthCheckResult(HealthStatus.Critical,
                    $"Xibo API returned {response.StatusCode}",
                    response.Error?.Content);
        }
        catch (Exception ex)
        {
            return new HealthCheckResult(HealthStatus.Critical, $"Xibo API unreachable: {ex.Message}");
        }
    }

    /// <summary>
    /// Creates a Xibo client from the instance's newest OAuth app registration and the
    /// stored client secret.
    /// </summary>
    /// <returns>The client, or <c>null</c> when no app is registered or the secret is missing.</returns>
    private async Task<IXiboApiClient?> ResolveClientAsync(Instance instance)
    {
        // Take the most recently created registration rather than an arbitrary one.
        var oauthApp = instance.OauthAppRegistries
            .OrderByDescending(o => o.CreatedAt)
            .FirstOrDefault();
        if (oauthApp is null) return null;

        var settings = _services.GetRequiredService<OTSSignsOrchestrator.Core.Services.SettingsService>();
        var abbrev = instance.Customer.Abbreviation;
        var secret = await settings.GetAsync(Core.Services.SettingsService.InstanceOAuthSecretId(abbrev));
        if (string.IsNullOrEmpty(secret)) return null;

        return await _clientFactory.CreateAsync(instance.XiboUrl, oauthApp.ClientId, secret);
    }
}
|
||||
@@ -0,0 +1,87 @@
|
||||
using OTSSignsOrchestrator.Server.Clients;
|
||||
using OTSSignsOrchestrator.Server.Data.Entities;
|
||||
using Microsoft.Extensions.Configuration;
|
||||
|
||||
namespace OTSSignsOrchestrator.Server.Health.Checks;
|
||||
|
||||
/// <summary>
/// Compares the installed Xibo CMS version (from GET /about) against the latest known
/// release configured in <c>HealthChecks:LatestXiboVersion</c>. Reports Degraded if behind.
/// </summary>
public sealed class XiboVersionHealthCheck : IHealthCheck
{
    private readonly XiboClientFactory _clientFactory;
    private readonly IServiceProvider _services;
    private readonly IConfiguration _configuration;
    private readonly ILogger<XiboVersionHealthCheck> _logger;

    public string CheckName => "XiboVersion";
    public bool AutoRemediate => false;

    public XiboVersionHealthCheck(
        XiboClientFactory clientFactory,
        IServiceProvider services,
        IConfiguration configuration,
        ILogger<XiboVersionHealthCheck> logger)
    {
        _clientFactory = clientFactory;
        _services = services;
        _configuration = configuration;
        _logger = logger;
    }

    /// <summary>
    /// Reads the <c>version</c> property of the about response and compares it with the
    /// configured latest version.
    /// </summary>
    /// <param name="instance">Instance whose CMS is queried.</param>
    /// <param name="ct">Cancellation token (not forwarded by this method).</param>
    /// <returns>
    /// Healthy when the latest version is not configured, or the installed version equals it
    /// or is numerically newer; Degraded when the installed version cannot be determined or
    /// is behind / not comparable; Critical when no client can be resolved, the request
    /// fails, or any step throws.
    /// </returns>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        var latestVersion = _configuration["HealthChecks:LatestXiboVersion"];
        if (string.IsNullOrEmpty(latestVersion))
            return new HealthCheckResult(HealthStatus.Healthy, "LatestXiboVersion not configured — skipping");

        try
        {
            // Client creation is inside the try block so a failure while creating the
            // client is reported as a Critical result instead of escaping the check.
            var (client, _) = await ResolveAsync(instance);
            if (client is null)
                return new HealthCheckResult(HealthStatus.Critical, "No OAuth app — cannot check version");

            var response = await client.GetAboutAsync();
            if (!response.IsSuccessStatusCode || response.Content is null)
                return new HealthCheckResult(HealthStatus.Critical, "GET /about failed");

            string? installedVersion = null;
            if (response.Content is System.Text.Json.JsonElement je &&
                je.TryGetProperty("version", out var verProp))
            {
                installedVersion = verProp.GetString();
            }

            if (string.IsNullOrEmpty(installedVersion))
                return new HealthCheckResult(HealthStatus.Degraded, "Could not determine installed version");

            if (string.Equals(installedVersion, latestVersion, StringComparison.OrdinalIgnoreCase))
                return new HealthCheckResult(HealthStatus.Healthy,
                    $"Xibo version {installedVersion} is current");

            // A stale "latest" setting must not flag an already-upgraded instance:
            // only a version that is actually behind (or not comparable) is Degraded.
            if (IsNewer(installedVersion, latestVersion))
                return new HealthCheckResult(HealthStatus.Healthy,
                    $"Xibo version {installedVersion} is newer than configured latest {latestVersion}");

            return new HealthCheckResult(HealthStatus.Degraded,
                $"Xibo version {installedVersion}, latest is {latestVersion}",
                "Consider scheduling an upgrade");
        }
        catch (Exception ex)
        {
            return new HealthCheckResult(HealthStatus.Critical,
                $"Version check failed: {ex.Message}");
        }
    }

    /// <summary>
    /// Returns <c>true</c> only when both strings parse as <see cref="Version"/> values and
    /// <paramref name="installed"/> is strictly greater than <paramref name="latest"/>.
    /// Strings that do not parse (e.g. with pre-release suffixes) are treated as not newer.
    /// </summary>
    private static bool IsNewer(string installed, string latest) =>
        Version.TryParse(installed, out var installedVer) &&
        Version.TryParse(latest, out var latestVer) &&
        installedVer > latestVer;

    /// <summary>
    /// Creates a Xibo client for the instance from its newest OAuth app registration and the
    /// stored client secret.
    /// </summary>
    /// <returns>
    /// The client and the customer abbreviation; the client is <c>null</c> when no app is
    /// registered or the secret is missing.
    /// </returns>
    private async Task<(IXiboApiClient? Client, string Abbrev)> ResolveAsync(Instance instance)
    {
        var abbrev = instance.Customer.Abbreviation;

        // Take the most recently created registration rather than an arbitrary one.
        var oauthApp = instance.OauthAppRegistries
            .OrderByDescending(o => o.CreatedAt)
            .FirstOrDefault();
        if (oauthApp is null) return (null, abbrev);

        var settings = _services.GetRequiredService<Core.Services.SettingsService>();
        var secret = await settings.GetAsync(Core.Services.SettingsService.InstanceOAuthSecretId(abbrev));
        if (string.IsNullOrEmpty(secret)) return (null, abbrev);

        var client = await _clientFactory.CreateAsync(instance.XiboUrl, oauthApp.ClientId, secret);
        return (client, abbrev);
    }
}
|
||||
289
OTSSignsOrchestrator.Server/Health/HealthCheckEngine.cs
Normal file
289
OTSSignsOrchestrator.Server/Health/HealthCheckEngine.cs
Normal file
@@ -0,0 +1,289 @@
|
||||
using Microsoft.AspNetCore.SignalR;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using Quartz;
|
||||
using OTSSignsOrchestrator.Server.Data;
|
||||
using OTSSignsOrchestrator.Server.Data.Entities;
|
||||
using OTSSignsOrchestrator.Server.Hubs;
|
||||
|
||||
namespace OTSSignsOrchestrator.Server.Health;
|
||||
|
||||
/// <summary>
/// Background service that, once per <see cref="DefaultCheckInterval"/>, schedules one
/// <see cref="InstanceHealthCheckJob"/> per instance of every active customer.
///
/// Uses Quartz to stagger the per-instance jobs across the check interval (avoids thundering herd).
/// The checks themselves are executed by <see cref="InstanceHealthCheckJob"/>, not by this class.
/// </summary>
public sealed class HealthCheckEngine : BackgroundService
{
    private readonly IServiceProvider _services;
    private readonly ISchedulerFactory _schedulerFactory;
    private readonly ILogger<HealthCheckEngine> _logger;

    /// <summary>Default interval between full health-check sweeps.</summary>
    internal static readonly TimeSpan DefaultCheckInterval = TimeSpan.FromMinutes(5);

    public HealthCheckEngine(
        IServiceProvider services,
        ISchedulerFactory schedulerFactory,
        ILogger<HealthCheckEngine> logger)
    {
        _services = services;
        _schedulerFactory = schedulerFactory;
        _logger = logger;
    }

    /// <summary>
    /// Waits five seconds, then schedules a sweep every <see cref="DefaultCheckInterval"/>
    /// until <paramref name="stoppingToken"/> is cancelled. Scheduling errors other than
    /// cancellation are logged and do not stop the loop.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        // Wait briefly for the rest of the app to start
        await Task.Delay(TimeSpan.FromSeconds(5), stoppingToken);

        var scheduler = await _schedulerFactory.GetScheduler(stoppingToken);

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                await ScheduleInstanceChecks(scheduler, stoppingToken);
            }
            catch (Exception ex) when (ex is not OperationCanceledException)
            {
                _logger.LogError(ex, "Error scheduling health check sweep");
            }

            await Task.Delay(DefaultCheckInterval, stoppingToken);
        }
    }

    /// <summary>
    /// Load all active instances and schedule staggered Quartz jobs so that
    /// check start times are spread across the interval.
    /// </summary>
    private async Task ScheduleInstanceChecks(IScheduler scheduler, CancellationToken ct)
    {
        await using var scope = _services.CreateAsyncScope();
        var db = scope.ServiceProvider.GetRequiredService<OrchestratorDbContext>();

        // Only the id is needed to build the job, so project to it instead of
        // materialising whole Instance + Customer entities on every sweep.
        var instanceIds = await db.Instances
            .AsNoTracking()
            .Where(i => i.Customer.Status == CustomerStatus.Active)
            .Select(i => i.Id)
            .ToListAsync(ct);

        if (instanceIds.Count == 0)
            return;

        // Spread jobs across 80 % of the check interval to leave a buffer
        var spreadMs = (int)(DefaultCheckInterval.TotalMilliseconds * 0.8);
        var stepMs = instanceIds.Count > 1 ? spreadMs / (instanceIds.Count - 1) : 0;

        for (var i = 0; i < instanceIds.Count; i++)
        {
            var instanceId = instanceIds[i];
            var delay = TimeSpan.FromMilliseconds(stepMs * i);

            var jobKey = new JobKey($"health-{instanceId}", "health-checks");

            // Remove previous trigger if it still exists (idempotent reschedule)
            if (await scheduler.CheckExists(jobKey, ct))
                await scheduler.DeleteJob(jobKey, ct);

            var job = JobBuilder.Create<InstanceHealthCheckJob>()
                .WithIdentity(jobKey)
                .UsingJobData("instanceId", instanceId.ToString())
                .Build();

            var trigger = TriggerBuilder.Create()
                .WithIdentity($"health-{instanceId}-trigger", "health-checks")
                .StartAt(DateTimeOffset.UtcNow.Add(delay))
                .Build();

            await scheduler.ScheduleJob(job, trigger, ct);
        }

        _logger.LogInformation(
            "Scheduled health checks for {Count} active instance(s)", instanceIds.Count);
    }
}
|
||||
|
||||
/// <summary>
/// Quartz job that executes all <see cref="IHealthCheck"/> implementations for a single instance.
/// </summary>
/// <remarks>
/// The instance id is read from the <c>instanceId</c> entry of the merged job data map.
/// For every check a <see cref="HealthEvent"/> is persisted; checks that opt into
/// auto-remediation are remediated when they report <see cref="HealthStatus.Critical"/>,
/// and each remediation attempt is recorded as an <see cref="AuditLog"/> row.
/// </remarks>
[DisallowConcurrentExecution]
public sealed class InstanceHealthCheckJob : IJob
{
    /// <summary>Global concurrency limiter — max 4 parallel health check runs.</summary>
    private static readonly SemaphoreSlim s_concurrency = new(4);

    private readonly IServiceProvider _services;
    private readonly ILogger<InstanceHealthCheckJob> _logger;

    /// <summary>Creates the job.</summary>
    /// <param name="services">Root provider used to create a scope per execution.</param>
    /// <param name="logger">Logger for check and remediation outcomes.</param>
    public InstanceHealthCheckJob(
        IServiceProvider services,
        ILogger<InstanceHealthCheckJob> logger)
    {
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// Parses the instance id from the job data and runs all checks for that instance
    /// while holding a slot of the global concurrency limiter.
    /// </summary>
    /// <param name="context">Quartz execution context carrying the job data and cancellation token.</param>
    public async Task Execute(IJobExecutionContext context)
    {
        var instanceIdStr = context.MergedJobDataMap.GetString("instanceId");
        if (!Guid.TryParse(instanceIdStr, out var instanceId))
        {
            _logger.LogWarning("InstanceHealthCheckJob: invalid instanceId {Id}", instanceIdStr);
            return;
        }

        // Acquire outside the try block so a cancelled wait never releases a slot it does not hold.
        await s_concurrency.WaitAsync(context.CancellationToken);
        try
        {
            await RunChecksForInstanceAsync(instanceId, context.CancellationToken);
        }
        finally
        {
            s_concurrency.Release();
        }
    }

    /// <summary>
    /// Runs every registered check (except the global Authentik check) against the instance,
    /// persists the resulting events and audit entries, updates the instance's aggregate
    /// health status and broadcasts a notification when that status changed.
    /// </summary>
    /// <param name="instanceId">Id of the instance to check.</param>
    /// <param name="ct">Token that cancels the run; cancellation is propagated, not recorded as a failure.</param>
    private async Task RunChecksForInstanceAsync(Guid instanceId, CancellationToken ct)
    {
        await using var scope = _services.CreateAsyncScope();
        var db = scope.ServiceProvider.GetRequiredService<OrchestratorDbContext>();
        var hub = scope.ServiceProvider.GetRequiredService<IHubContext<FleetHub, IFleetClient>>();
        var checks = scope.ServiceProvider.GetServices<IHealthCheck>();

        var instance = await db.Instances
            .Include(i => i.Customer)
            .Include(i => i.OauthAppRegistries)
            .Include(i => i.ByoiConfigs)
            .FirstOrDefaultAsync(i => i.Id == instanceId, ct);

        if (instance is null)
        {
            _logger.LogWarning("InstanceHealthCheckJob: instance {Id} not found", instanceId);
            return;
        }

        var abbrev = instance.Customer.Abbreviation;
        var worstStatus = HealthStatus.Healthy;

        // Builds the append-only audit entry describing one remediation attempt.
        AuditLog RemediationAudit(string checkName, string outcome, string? detail) => new()
        {
            Id = Guid.NewGuid(),
            InstanceId = instanceId,
            Actor = $"HealthCheckEngine:{checkName}",
            Action = "AutoRemediate",
            Target = abbrev,
            Outcome = outcome,
            Detail = detail,
            OccurredAt = DateTime.UtcNow,
        };

        foreach (var check in checks)
        {
            // Skip the AuthentikGlobalHealthCheck — it runs on its own schedule
            if (check.CheckName == "AuthentikGlobal")
            {
                continue;
            }

            HealthCheckResult result;
            try
            {
                result = await check.RunAsync(instance, ct);
            }
            catch (Exception ex) when (!IsCancellation(ex, ct))
            {
                // A throwing check is treated as a critical finding. Cancellation of the job
                // is excluded by the filter so it is not misreported as an unhealthy instance.
                _logger.LogError(ex, "Health check {Check} failed for {Abbrev}", check.CheckName, abbrev);
                result = new HealthCheckResult(HealthStatus.Critical, $"Check threw exception: {ex.Message}");
            }

            // Persist HealthEvent
            var healthEvent = new HealthEvent
            {
                Id = Guid.NewGuid(),
                InstanceId = instanceId,
                CheckName = check.CheckName,
                Status = ToEventStatus(result.Status),
                Message = result.Message,
                Remediated = false,
                OccurredAt = DateTime.UtcNow,
            };

            // Auto-remediation
            if (check.AutoRemediate && result.Status == HealthStatus.Critical)
            {
                try
                {
                    var wasFixed = await check.RemediateAsync(instance, ct);
                    healthEvent.Remediated = wasFixed;

                    db.AuditLogs.Add(RemediationAudit(
                        check.CheckName, wasFixed ? "Success" : "Failed", result.Detail));

                    if (wasFixed)
                    {
                        _logger.LogInformation(
                            "Auto-remediated {Check} for {Abbrev}", check.CheckName, abbrev);

                        // Downgrade severity since we fixed it
                        healthEvent.Status = HealthEventStatus.Healthy;
                    }
                }
                catch (Exception ex) when (!IsCancellation(ex, ct))
                {
                    _logger.LogError(ex, "Remediation for {Check} failed on {Abbrev}", check.CheckName, abbrev);
                    db.AuditLogs.Add(RemediationAudit(check.CheckName, "Error", ex.Message));
                }
            }

            db.HealthEvents.Add(healthEvent);

            // Track worst severity (only from non-remediated results)
            if (!healthEvent.Remediated)
            {
                var status = FromEventStatus(healthEvent.Status);
                if (status > worstStatus)
                {
                    worstStatus = status;
                }
            }
        }

        // Update instance health status
        var previousStatus = instance.HealthStatus;
        instance.HealthStatus = worstStatus;
        instance.LastHealthCheck = DateTime.UtcNow;

        await db.SaveChangesAsync(ct);

        // Broadcast status change
        if (previousStatus != worstStatus)
        {
            await hub.Clients.All.SendInstanceStatusChanged(
                instance.CustomerId.ToString(), worstStatus.ToString());
        }
    }

    /// <summary>
    /// True when <paramref name="ex"/> is an <see cref="OperationCanceledException"/> and
    /// <paramref name="ct"/> has been cancelled, i.e. the exception is the job being
    /// cancelled rather than a failure of the check itself.
    /// </summary>
    private static bool IsCancellation(Exception ex, CancellationToken ct) =>
        ex is OperationCanceledException && ct.IsCancellationRequested;

    /// <summary>Maps a check result status to the persisted event status; unknown values map to Critical.</summary>
    private static HealthEventStatus ToEventStatus(HealthStatus status) => status switch
    {
        HealthStatus.Healthy => HealthEventStatus.Healthy,
        HealthStatus.Degraded => HealthEventStatus.Degraded,
        HealthStatus.Critical => HealthEventStatus.Critical,
        _ => HealthEventStatus.Critical,
    };

    /// <summary>Maps a persisted event status back to a health status; unknown values map to Critical.</summary>
    private static HealthStatus FromEventStatus(HealthEventStatus status) => status switch
    {
        HealthEventStatus.Healthy => HealthStatus.Healthy,
        HealthEventStatus.Degraded => HealthStatus.Degraded,
        HealthEventStatus.Critical => HealthStatus.Critical,
        _ => HealthStatus.Critical,
    };
}
|
||||
32
OTSSignsOrchestrator.Server/Health/IHealthCheck.cs
Normal file
32
OTSSignsOrchestrator.Server/Health/IHealthCheck.cs
Normal file
@@ -0,0 +1,32 @@
|
||||
using OTSSignsOrchestrator.Server.Data.Entities;
|
||||
|
||||
namespace OTSSignsOrchestrator.Server.Health;
|
||||
|
||||
/// <summary>
/// Outcome produced by one run of an <see cref="IHealthCheck"/>.
/// </summary>
/// <param name="Status">Severity reported by the check.</param>
/// <param name="Message">Short description of the outcome.</param>
/// <param name="Detail">Optional additional detail; <c>null</c> when the check supplies none.</param>
public record HealthCheckResult(
    HealthStatus Status,
    string Message,
    string? Detail = null);
|
||||
|
||||
/// <summary>
/// A single health check that can be evaluated against one <see cref="Instance"/>
/// and may optionally know how to repair what it detects.
/// </summary>
public interface IHealthCheck
{
    /// <summary>Name of the check, stored in <see cref="HealthEvent.CheckName"/>.</summary>
    string CheckName { get; }

    /// <summary>
    /// Opt-in flag for automatic repair: when <c>true</c>, the engine invokes
    /// <see cref="RemediateAsync"/> after the check reports <see cref="HealthStatus.Critical"/>.
    /// </summary>
    bool AutoRemediate { get; }

    /// <summary>Runs the check against <paramref name="instance"/>.</summary>
    /// <param name="instance">Instance to inspect.</param>
    /// <param name="ct">Token that cancels the check.</param>
    /// <returns>The result of the check.</returns>
    Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct);

    /// <summary>
    /// Tries to repair the problem automatically. Checks without a repair strategy
    /// inherit this default, which performs no work and reports <c>false</c>.
    /// </summary>
    /// <param name="instance">Instance to repair.</param>
    /// <param name="ct">Token that cancels the remediation.</param>
    /// <returns><c>true</c> if the issue was fixed; otherwise <c>false</c>.</returns>
    Task<bool> RemediateAsync(Instance instance, CancellationToken ct)
    {
        return Task.FromResult(false);
    }
}
|
||||
Reference in New Issue
Block a user