feat: Implement provisioning pipelines for subscription management

- Add ReactivatePipeline to handle subscription reactivation, including scaling Docker services, health verification, status updates, audit logging, and broadcasting status changes.
- Introduce RotateCredentialsPipeline for OAuth2 credential rotation, managing the deletion of old apps, creation of new ones, credential storage, access verification, and audit logging.
- Create StepRunner to manage job step execution, including lifecycle management and progress broadcasting via SignalR.
- Implement SuspendPipeline for subscription suspension, scaling down services, updating statuses, logging audits, and broadcasting changes.
- Add UpdateScreenLimitPipeline to update Xibo CMS screen limits and record snapshots.
- Introduce XiboFeatureManifests for hardcoded feature ACLs per role.
- Add docker-compose.dev.yml for local development with PostgreSQL setup.
This commit is contained in:
Matt Batchelder
2026-03-18 10:27:26 -04:00
parent c2e03de8bb
commit c6d46098dd
77 changed files with 9412 additions and 29 deletions

View File

@@ -0,0 +1,37 @@
using Quartz;
using OTSSignsOrchestrator.Server.Health.Checks;
namespace OTSSignsOrchestrator.Server.Health;
/// <summary>
/// Quartz entry point for the fleet-wide Authentik probe. Each execution delegates to
/// <see cref="AuthentikGlobalHealthCheck.RunGlobalAsync"/> and logs the outcome; it is
/// meant to be triggered every 2 minutes, independently of the per-instance checks.
/// Overlapping executions are prevented by <see cref="DisallowConcurrentExecutionAttribute"/>.
/// </summary>
[DisallowConcurrentExecution]
public sealed class AuthentikGlobalHealthJob : IJob
{
    private readonly AuthentikGlobalHealthCheck _check;
    private readonly ILogger<AuthentikGlobalHealthJob> _logger;

    /// <summary>Creates the job with the global check it drives and a logger.</summary>
    /// <param name="check">The global Authentik health check to execute.</param>
    /// <param name="logger">Logger used for the outcome and for failures.</param>
    public AuthentikGlobalHealthJob(
        AuthentikGlobalHealthCheck check,
        ILogger<AuthentikGlobalHealthJob> logger)
        => (_check, _logger) = (check, logger);

    /// <summary>
    /// Runs the global check once. Any exception is logged and swallowed so that it
    /// never propagates out of the job.
    /// </summary>
    /// <param name="context">Quartz execution context; supplies the cancellation token.</param>
    public async Task Execute(IJobExecutionContext context)
    {
        try
        {
            var outcome = await _check.RunGlobalAsync(context.CancellationToken);

            _logger.LogInformation(
                "Authentik global health: {Status} — {Message}",
                outcome.Status,
                outcome.Message);
        }
        catch (Exception failure)
        {
            _logger.LogError(failure, "Authentik global health job failed");
        }
    }
}

View File

@@ -0,0 +1,183 @@
using Microsoft.EntityFrameworkCore;
using OTSSignsOrchestrator.Server.Clients;
using OTSSignsOrchestrator.Server.Data;
using OTSSignsOrchestrator.Server.Data.Entities;
namespace OTSSignsOrchestrator.Server.Health.Checks;
/// <summary>
/// Verifies that both <c>ots-admin-{abbrev}</c> and <c>ots-svc-{abbrev}</c> exist
/// with <c>userTypeId == 1</c> (SuperAdmin). MUST use <see cref="XiboApiClientExtensions.GetAllPagesAsync{T}"/>
/// because Xibo paginates at 10 items by default.
///
/// <c>saml-usertypeid</c> is JIT-only and does NOT maintain SuperAdmin on existing users —
/// this check IS the ongoing enforcement mechanism.
/// </summary>
public sealed class AdminIntegrityHealthCheck : IHealthCheck
{
    private readonly XiboClientFactory _clientFactory;
    private readonly IServiceProvider _services;
    private readonly ILogger<AdminIntegrityHealthCheck> _logger;

    public string CheckName => "AdminIntegrity";
    public bool AutoRemediate => true;

    public AdminIntegrityHealthCheck(
        XiboClientFactory clientFactory,
        IServiceProvider services,
        ILogger<AdminIntegrityHealthCheck> logger)
    {
        _clientFactory = clientFactory;
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// Lists every Xibo user and reports Critical when either expected account is missing
    /// or carries a <c>userTypeId</c> other than 1.
    /// </summary>
    /// <param name="instance">Instance whose admin accounts are verified.</param>
    /// <param name="ct">Cancellation token (not forwarded to the Xibo client calls).</param>
    /// <returns>Healthy when both accounts are intact; otherwise Critical with one line per problem.</returns>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        var (client, abbrev) = await ResolveAsync(instance);
        if (client is null)
            return new HealthCheckResult(HealthStatus.Critical, "No OAuth app — cannot verify admin accounts");

        var users = await client.GetAllPagesAsync(
            (start, length) => client.GetUsersAsync(start, length));

        var adminName = $"ots-admin-{abbrev}";
        var svcName = $"ots-svc-{abbrev}";
        var problems = new List<string>();

        foreach (var expected in new[] { adminName, svcName })
        {
            var user = users.FirstOrDefault(u =>
                u.TryGetValue("userName", out var n) &&
                string.Equals(n?.ToString(), expected, StringComparison.OrdinalIgnoreCase));

            if (user is null)
            {
                problems.Add($"{expected} is MISSING");
                continue;
            }

            // A record without a userTypeId field is not flagged; only a present, wrong value is.
            if (user.TryGetValue("userTypeId", out var typeObj) &&
                typeObj?.ToString() != "1")
            {
                problems.Add($"{expected} has userTypeId={typeObj} (expected 1)");
            }
        }

        if (problems.Count == 0)
            return new HealthCheckResult(HealthStatus.Healthy, "Admin accounts intact");

        return new HealthCheckResult(
            HealthStatus.Critical,
            $"Admin integrity issues: {string.Join("; ", problems)}",
            string.Join("\n", problems));
    }

    /// <summary>
    /// Recreates missing accounts as SuperAdmin and resets a wrong <c>userTypeId</c> to 1,
    /// queuing an audit row for every successful change and saving them at the end.
    /// </summary>
    /// <param name="instance">Instance whose admin accounts are repaired.</param>
    /// <param name="ct">Cancellation token used when saving the audit rows.</param>
    /// <returns><c>true</c> when every required fix succeeded; <c>false</c> otherwise.</returns>
    public async Task<bool> RemediateAsync(Instance instance, CancellationToken ct)
    {
        var (client, abbrev) = await ResolveAsync(instance);
        if (client is null) return false;

        await using var scope = _services.CreateAsyncScope();
        var db = scope.ServiceProvider.GetRequiredService<OrchestratorDbContext>();

        var users = await client.GetAllPagesAsync(
            (start, length) => client.GetUsersAsync(start, length));

        var adminName = $"ots-admin-{abbrev}";
        var svcName = $"ots-svc-{abbrev}";
        var allFixed = true;

        foreach (var expected in new[] { adminName, svcName })
        {
            var user = users.FirstOrDefault(u =>
                u.TryGetValue("userName", out var n) &&
                string.Equals(n?.ToString(), expected, StringComparison.OrdinalIgnoreCase));

            if (user is null)
            {
                // Recreate missing account.
                // NOTE(review): the generated password is not stored anywhere in this block —
                // presumably these accounts never log in with it; confirm.
                var email = $"{expected}@otssigns.internal";
                var password = GenerateRandomPassword(32);
                var createResp = await client.CreateUserAsync(new CreateUserRequest(
                    expected, email, password, UserTypeId: 1, HomePageId: 1));

                if (!createResp.IsSuccessStatusCode)
                {
                    _logger.LogError("Failed to recreate {User}: {Err}", expected, createResp.Error?.Content);
                    allFixed = false;
                    continue;
                }

                // Audit
                db.AuditLogs.Add(new AuditLog
                {
                    Id = Guid.NewGuid(),
                    InstanceId = instance.Id,
                    Actor = "HealthCheckEngine:AdminIntegrity",
                    Action = "RecreateUser",
                    Target = expected,
                    Outcome = "Success",
                    Detail = "User was missing — recreated as SuperAdmin",
                    OccurredAt = DateTime.UtcNow,
                });
                continue;
            }

            // Fix userTypeId if wrong
            if (!user.TryGetValue("userTypeId", out var typeObj) || typeObj?.ToString() == "1")
                continue;

            // Parse defensively: a missing or non-numeric userId must fail this one account,
            // not abort the whole remediation (and lose already-queued audit rows) by throwing.
            if (!user.TryGetValue("userId", out var idObj) ||
                !int.TryParse(idObj?.ToString(), out var userId) ||
                userId <= 0)
            {
                _logger.LogError("Cannot fix userTypeId for {User}: missing or invalid userId", expected);
                allFixed = false;
                continue;
            }

            var updateResp = await client.UpdateUserAsync(userId, new UpdateUserRequest(
                UserName: null, Email: null, Password: null, UserTypeId: 1,
                HomePageId: null, Retired: null));

            if (!updateResp.IsSuccessStatusCode)
            {
                _logger.LogError("Failed to fix userTypeId for {User}: {Err}",
                    expected, updateResp.Error?.Content);
                allFixed = false;
                continue;
            }

            db.AuditLogs.Add(new AuditLog
            {
                Id = Guid.NewGuid(),
                InstanceId = instance.Id,
                Actor = "HealthCheckEngine:AdminIntegrity",
                Action = "FixUserType",
                Target = expected,
                Outcome = "Success",
                Detail = $"Changed userTypeId from {typeObj} to 1 (SuperAdmin)",
                OccurredAt = DateTime.UtcNow,
            });
        }

        await db.SaveChangesAsync(ct);
        return allFixed;
    }

    /// <summary>
    /// Builds a Xibo API client for the instance from its OAuth registry entry and the
    /// stored client secret. Returns a <c>null</c> client when either is unavailable.
    /// </summary>
    private async Task<(IXiboApiClient? Client, string Abbrev)> ResolveAsync(Instance instance)
    {
        var abbrev = instance.Customer.Abbreviation;

        // Use the most recently created registration: only one secret is stored per
        // abbreviation, so an arbitrary (possibly superseded) row could pair the wrong
        // client id with it.
        var oauthApp = instance.OauthAppRegistries
            .OrderByDescending(o => o.CreatedAt)
            .FirstOrDefault();
        if (oauthApp is null) return (null, abbrev);

        var settings = _services.GetRequiredService<Core.Services.SettingsService>();
        var secret = await settings.GetAsync(Core.Services.SettingsService.InstanceOAuthSecretId(abbrev));
        if (string.IsNullOrEmpty(secret)) return (null, abbrev);

        var client = await _clientFactory.CreateAsync(instance.XiboUrl, oauthApp.ClientId, secret);
        return (client, abbrev);
    }

    /// <summary>Generates a cryptographically random password of the given length.</summary>
    private static string GenerateRandomPassword(int length)
    {
        const string chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*";
        return System.Security.Cryptography.RandomNumberGenerator.GetString(chars, length);
    }
}

View File

@@ -0,0 +1,107 @@
using System.Diagnostics;
using Microsoft.AspNetCore.SignalR;
using Microsoft.EntityFrameworkCore;
using OTSSignsOrchestrator.Server.Clients;
using OTSSignsOrchestrator.Server.Data;
using OTSSignsOrchestrator.Server.Data.Entities;
using OTSSignsOrchestrator.Server.Hubs;
namespace OTSSignsOrchestrator.Server.Health.Checks;
/// <summary>
/// Probes the central Authentik instance at <c>GET /api/v3/-/health/ready/</c>.
/// Measures latency and writes an <see cref="AuthentikMetrics"/> row.
/// If down: <c>Severity = Critical</c>, message "Central Authentik is DOWN — all customer web UI logins failing".
/// This is a fleet-wide P1 alert. Runs every 2 minutes on a separate schedule.
///
/// This check is NOT per-instance — it runs once globally. The engine skips it for
/// per-instance checks. Instead it is scheduled independently as a Quartz job.
/// </summary>
public sealed class AuthentikGlobalHealthCheck : IHealthCheck
{
    private readonly IAuthentikClient _authentikClient;
    private readonly IServiceProvider _services;
    private readonly ILogger<AuthentikGlobalHealthCheck> _logger;

    public string CheckName => "AuthentikGlobal";
    public bool AutoRemediate => false;

    public AuthentikGlobalHealthCheck(
        IAuthentikClient authentikClient,
        IServiceProvider services,
        ILogger<AuthentikGlobalHealthCheck> logger)
    {
        _authentikClient = authentikClient;
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// <see cref="IHealthCheck"/> entry point. The instance is ignored; the call is
    /// forwarded to <see cref="RunGlobalAsync"/>.
    /// </summary>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        // This check doesn't use the instance parameter — it checks global Authentik health.
        return await RunGlobalAsync(ct);
    }

    /// <summary>
    /// Core logic — callable from the Quartz job without an instance context.
    /// Probes Authentik, records a metrics row, and broadcasts a fleet alert when Critical.
    /// </summary>
    /// <param name="ct">Cancellation token used when saving the metrics row.</param>
    /// <returns>
    /// Healthy with the measured latency, or Critical when the probe returns a non-success
    /// status or throws. Failures while persisting metrics or broadcasting are logged and
    /// do not change the returned result; cancellation still propagates.
    /// </returns>
    public async Task<HealthCheckResult> RunGlobalAsync(CancellationToken ct)
    {
        var sw = Stopwatch.StartNew();
        AuthentikMetricsStatus metricsStatus;
        string? errorMessage = null;
        HealthCheckResult result;

        try
        {
            var response = await _authentikClient.CheckHealthAsync();
            sw.Stop();

            if (response.IsSuccessStatusCode)
            {
                metricsStatus = AuthentikMetricsStatus.Healthy;
                result = new HealthCheckResult(HealthStatus.Healthy,
                    $"Authentik healthy (latency: {sw.ElapsedMilliseconds}ms)");
            }
            else
            {
                metricsStatus = AuthentikMetricsStatus.Critical;
                errorMessage = $"HTTP {response.StatusCode}";
                result = new HealthCheckResult(HealthStatus.Critical,
                    "Central Authentik is DOWN — all customer web UI logins failing",
                    $"Health endpoint returned {response.StatusCode}");
            }
        }
        catch (Exception ex)
        {
            sw.Stop();
            metricsStatus = AuthentikMetricsStatus.Critical;
            errorMessage = ex.Message;
            result = new HealthCheckResult(HealthStatus.Critical,
                "Central Authentik is DOWN — all customer web UI logins failing",
                ex.Message);
        }

        await using var scope = _services.CreateAsyncScope();

        // Write metrics row. A database failure must not prevent the alert below from
        // going out, so it is logged rather than rethrown.
        try
        {
            var db = scope.ServiceProvider.GetRequiredService<OrchestratorDbContext>();
            db.AuthentikMetrics.Add(new AuthentikMetrics
            {
                Id = Guid.NewGuid(),
                CheckedAt = DateTime.UtcNow,
                Status = metricsStatus,
                LatencyMs = (int)sw.ElapsedMilliseconds,
                ErrorMessage = errorMessage,
            });
            await db.SaveChangesAsync(ct);
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            _logger.LogError(ex, "Failed to persist Authentik metrics row");
        }

        // Broadcast alert if critical. A hub failure is logged so the caller still
        // receives the health result.
        if (result.Status == HealthStatus.Critical)
        {
            try
            {
                var hub = scope.ServiceProvider.GetRequiredService<IHubContext<FleetHub, IFleetClient>>();
                await hub.Clients.All.SendAlertRaised("Critical", result.Message);
            }
            catch (Exception ex) when (ex is not OperationCanceledException)
            {
                _logger.LogError(ex, "Failed to broadcast Authentik critical alert");
            }
        }

        return result;
    }
}

View File

@@ -0,0 +1,60 @@
using OTSSignsOrchestrator.Server.Clients;
using OTSSignsOrchestrator.Server.Data.Entities;
namespace OTSSignsOrchestrator.Server.Health.Checks;
/// <summary>
/// Confirms that the SAML provider recorded on the instance
/// (<see cref="Instance.AuthentikProviderId"/>) can still be fetched from Authentik.
/// No automatic remediation is offered.
/// </summary>
public sealed class AuthentikSamlProviderHealthCheck : IHealthCheck
{
    private readonly IAuthentikClient _authentikClient;
    private readonly ILogger<AuthentikSamlProviderHealthCheck> _logger;

    public string CheckName => "AuthentikSamlProvider";
    public bool AutoRemediate => false;

    /// <summary>Creates the check with the Authentik client it queries and a logger.</summary>
    public AuthentikSamlProviderHealthCheck(
        IAuthentikClient authentikClient,
        ILogger<AuthentikSamlProviderHealthCheck> logger)
        => (_authentikClient, _logger) = (authentikClient, logger);

    /// <summary>
    /// Looks up the stored provider id in Authentik.
    /// </summary>
    /// <param name="instance">Instance whose stored provider id is verified.</param>
    /// <param name="ct">Cancellation token (not used by this check).</param>
    /// <returns>
    /// Degraded when no id is stored; Critical when the id is not an integer, the lookup
    /// fails or returns no content, or the client throws; Healthy otherwise.
    /// </returns>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        var storedId = instance.AuthentikProviderId;

        if (storedId is not { Length: > 0 })
        {
            return new HealthCheckResult(
                HealthStatus.Degraded,
                "No Authentik provider ID stored — SAML not provisioned");
        }

        var isNumeric = int.TryParse(storedId, out var providerId);
        if (!isNumeric)
        {
            return new HealthCheckResult(
                HealthStatus.Critical,
                $"Invalid Authentik provider ID: {instance.AuthentikProviderId}");
        }

        try
        {
            var reply = await _authentikClient.GetSamlProviderAsync(providerId);
            var providerFound = reply.IsSuccessStatusCode && reply.Content is not null;

            if (!providerFound)
            {
                return new HealthCheckResult(
                    HealthStatus.Critical,
                    $"SAML provider {providerId} not found or inaccessible",
                    reply.Error?.Content);
            }

            return new HealthCheckResult(
                HealthStatus.Healthy,
                $"SAML provider {providerId} is active in Authentik");
        }
        catch (Exception failure)
        {
            return new HealthCheckResult(
                HealthStatus.Critical,
                $"Failed to check SAML provider: {failure.Message}");
        }
    }
}

View File

@@ -0,0 +1,69 @@
using OTSSignsOrchestrator.Server.Data.Entities;
namespace OTSSignsOrchestrator.Server.Health.Checks;
/// <summary>
/// For Pro plan BYOI customers: checks certificate expiry from <see cref="ByoiConfig"/>.
/// Reports Degraded once the 60-day or 30-day threshold is crossed and Critical at
/// 7 days or less (including already-expired certificates).
/// AutoRemediate=false — customer must rotate their IdP certificate via the portal.
/// </summary>
public sealed class ByoiCertExpiryHealthCheck : IHealthCheck
{
    /// <summary>Alert thresholds in days (descending).</summary>
    internal static readonly int[] AlertThresholdDays = [60, 30, 7];

    /// <summary>Days at or below which severity escalates to Critical.</summary>
    internal const int CriticalThresholdDays = 7;

    public string CheckName => "ByoiCertExpiry";
    public bool AutoRemediate => false;

    /// <summary>
    /// Evaluates the remaining lifetime of the instance's enabled BYOI certificate.
    /// </summary>
    /// <param name="instance">Instance whose enabled BYOI config (if any) is inspected.</param>
    /// <param name="ct">Cancellation token (unused; the check is synchronous).</param>
    /// <returns>
    /// Healthy when the check does not apply or no threshold is crossed; Degraded naming
    /// the tightest warning threshold crossed; Critical at or below
    /// <see cref="CriticalThresholdDays"/> or when expired.
    /// </returns>
    public Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        // Only applies to instances with an enabled BYOI config
        var byoiConfig = instance.ByoiConfigs.FirstOrDefault(b => b.Enabled);
        if (byoiConfig is null)
        {
            return Task.FromResult(new HealthCheckResult(HealthStatus.Healthy,
                "No BYOI config — check not applicable"));
        }

        // Only Pro customers have BYOI
        if (instance.Customer.Plan != CustomerPlan.Pro)
        {
            return Task.FromResult(new HealthCheckResult(HealthStatus.Healthy,
                "Non-Pro plan — BYOI check not applicable"));
        }

        var daysRemaining = (byoiConfig.CertExpiry - DateTime.UtcNow).TotalDays;

        if (daysRemaining <= 0)
        {
            return Task.FromResult(new HealthCheckResult(HealthStatus.Critical,
                $"BYOI certificate has EXPIRED (expired {Math.Abs((int)daysRemaining)} days ago)",
                "Customer must rotate their IdP certificate via the portal immediately"));
        }

        if (daysRemaining <= CriticalThresholdDays)
        {
            return Task.FromResult(new HealthCheckResult(HealthStatus.Critical,
                $"BYOI certificate expires in {(int)daysRemaining} days",
                "Urgent: customer must rotate their IdP certificate"));
        }

        // Pick the SMALLEST warning threshold that has been crossed. The array is
        // descending, so returning on the first match would always report 60 and the
        // 30-day threshold would never surface.
        int? crossed = null;
        foreach (var candidate in AlertThresholdDays)
        {
            if (candidate <= CriticalThresholdDays || daysRemaining > candidate) continue;
            if (crossed is null || candidate < crossed) crossed = candidate;
        }

        if (crossed is int reported)
        {
            return Task.FromResult(new HealthCheckResult(HealthStatus.Degraded,
                $"BYOI certificate expires in {(int)daysRemaining} days (threshold: {reported}d)",
                "Customer should plan certificate rotation"));
        }

        return Task.FromResult(new HealthCheckResult(HealthStatus.Healthy,
            $"BYOI certificate valid for {(int)daysRemaining} more days"));
    }
}

View File

@@ -0,0 +1,74 @@
using OTSSignsOrchestrator.Server.Clients;
using OTSSignsOrchestrator.Server.Data.Entities;
namespace OTSSignsOrchestrator.Server.Health.Checks;
/// <summary>
/// Verifies the count of authorised displays does not exceed the customer's licensed
/// <see cref="Customer.ScreenCount"/>. Uses <see cref="XiboApiClientExtensions.GetAllPagesAsync{T}"/>
/// with <c>authorised=1</c> filter to get all authorised displays.
/// </summary>
public sealed class DisplayAuthorisedHealthCheck : IHealthCheck
{
    private readonly XiboClientFactory _clientFactory;
    private readonly IServiceProvider _services;
    private readonly ILogger<DisplayAuthorisedHealthCheck> _logger;

    public string CheckName => "DisplayAuthorised";
    public bool AutoRemediate => false;

    public DisplayAuthorisedHealthCheck(
        XiboClientFactory clientFactory,
        IServiceProvider services,
        ILogger<DisplayAuthorisedHealthCheck> logger)
    {
        _clientFactory = clientFactory;
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// Counts the instance's authorised displays and compares the total with the licence.
    /// </summary>
    /// <param name="instance">Instance whose displays are counted.</param>
    /// <param name="ct">Cancellation token (not forwarded to the Xibo client calls).</param>
    /// <returns>
    /// Healthy when the count is within the licence, Degraded when it exceeds it, and
    /// Critical when no API client can be built or the display query throws.
    /// </returns>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        var (client, _) = await ResolveAsync(instance);
        if (client is null)
            return new HealthCheckResult(HealthStatus.Critical, "No OAuth app — cannot check displays");

        try
        {
            var displays = await client.GetAllPagesAsync(
                (start, length) => client.GetDisplaysAsync(start, length, authorised: 1));

            var authorisedCount = displays.Count;
            var licensed = instance.Customer.ScreenCount;

            if (authorisedCount <= licensed)
            {
                return new HealthCheckResult(HealthStatus.Healthy,
                    $"Authorised displays: {authorisedCount}/{licensed}");
            }

            return new HealthCheckResult(HealthStatus.Degraded,
                $"Authorised displays ({authorisedCount}) exceeds license ({licensed})",
                $"Over-provisioned by {authorisedCount - licensed} display(s)");
        }
        catch (Exception ex)
        {
            return new HealthCheckResult(HealthStatus.Critical,
                $"Failed to check displays: {ex.Message}");
        }
    }

    /// <summary>
    /// Builds a Xibo API client for the instance from its OAuth registry entry and the
    /// stored client secret. Returns a <c>null</c> client when either is unavailable.
    /// </summary>
    private async Task<(IXiboApiClient? Client, string Abbrev)> ResolveAsync(Instance instance)
    {
        var abbrev = instance.Customer.Abbreviation;

        // Use the most recently created registration: only one secret is stored per
        // abbreviation, so an arbitrary (possibly superseded) row could pair the wrong
        // client id with it.
        var oauthApp = instance.OauthAppRegistries
            .OrderByDescending(o => o.CreatedAt)
            .FirstOrDefault();
        if (oauthApp is null) return (null, abbrev);

        var settings = _services.GetRequiredService<Core.Services.SettingsService>();
        var secret = await settings.GetAsync(Core.Services.SettingsService.InstanceOAuthSecretId(abbrev));
        if (string.IsNullOrEmpty(secret)) return (null, abbrev);

        var client = await _clientFactory.CreateAsync(instance.XiboUrl, oauthApp.ClientId, secret);
        return (client, abbrev);
    }
}

View File

@@ -0,0 +1,124 @@
using OTSSignsOrchestrator.Server.Clients;
using OTSSignsOrchestrator.Server.Data;
using OTSSignsOrchestrator.Server.Data.Entities;
namespace OTSSignsOrchestrator.Server.Health.Checks;
/// <summary>
/// Verifies all 4 expected Xibo groups exist for the instance:
/// <c>{abbrev}-viewer</c>, <c>{abbrev}-editor</c>, <c>{abbrev}-admin</c>, <c>ots-it-{abbrev}</c>.
/// Uses <see cref="XiboApiClientExtensions.GetAllPagesAsync{T}"/> to avoid pagination truncation.
/// </summary>
public sealed class GroupStructureHealthCheck : IHealthCheck
{
    private readonly XiboClientFactory _clientFactory;
    private readonly IServiceProvider _services;
    private readonly ILogger<GroupStructureHealthCheck> _logger;

    public string CheckName => "GroupStructure";
    public bool AutoRemediate => true;

    public GroupStructureHealthCheck(
        XiboClientFactory clientFactory,
        IServiceProvider services,
        ILogger<GroupStructureHealthCheck> logger)
    {
        _clientFactory = clientFactory;
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// Compares the groups present in Xibo (case-insensitively) with the expected set.
    /// </summary>
    /// <param name="instance">Instance whose groups are verified.</param>
    /// <param name="ct">Cancellation token (not forwarded to the Xibo client calls).</param>
    /// <returns>Healthy when nothing is missing; otherwise Critical listing the missing groups.</returns>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        var (client, abbrev) = await ResolveAsync(instance);
        if (client is null)
            return new HealthCheckResult(HealthStatus.Critical, "No OAuth app — cannot verify groups");

        var expected = ExpectedGroups(abbrev);
        var missing = await FindMissingGroupsAsync(client, expected);

        if (missing.Count == 0)
            return new HealthCheckResult(HealthStatus.Healthy, "All 4 expected groups present");

        return new HealthCheckResult(
            HealthStatus.Critical,
            $"Missing groups: {string.Join(", ", missing)}",
            $"Expected: {string.Join(", ", expected)}");
    }

    /// <summary>
    /// Creates every missing expected group, queuing an audit row per successful creation
    /// and saving them at the end.
    /// </summary>
    /// <param name="instance">Instance whose groups are repaired.</param>
    /// <param name="ct">Cancellation token used when saving the audit rows.</param>
    /// <returns><c>true</c> when every missing group was created; <c>false</c> otherwise.</returns>
    public async Task<bool> RemediateAsync(Instance instance, CancellationToken ct)
    {
        var (client, abbrev) = await ResolveAsync(instance);
        if (client is null) return false;

        await using var scope = _services.CreateAsyncScope();
        var db = scope.ServiceProvider.GetRequiredService<OrchestratorDbContext>();

        var missing = await FindMissingGroupsAsync(client, ExpectedGroups(abbrev));
        var allFixed = true;

        foreach (var name in missing)
        {
            var resp = await client.CreateGroupAsync(
                new CreateGroupRequest(name, $"Auto-created by health check for {abbrev}"));

            if (!resp.IsSuccessStatusCode)
            {
                _logger.LogError("Failed to create group {Group}: {Err}", name, resp.Error?.Content);
                allFixed = false;
                continue;
            }

            db.AuditLogs.Add(new AuditLog
            {
                Id = Guid.NewGuid(),
                InstanceId = instance.Id,
                Actor = "HealthCheckEngine:GroupStructure",
                Action = "CreateGroup",
                Target = name,
                Outcome = "Success",
                Detail = $"Recreated missing group {name}",
                OccurredAt = DateTime.UtcNow,
            });
        }

        await db.SaveChangesAsync(ct);
        return allFixed;
    }

    /// <summary>Returns the group names every instance is expected to have.</summary>
    private static string[] ExpectedGroups(string abbrev) =>
    [
        $"{abbrev}-viewer",
        $"{abbrev}-editor",
        $"{abbrev}-admin",
        $"ots-it-{abbrev}",
    ];

    /// <summary>
    /// Fetches all Xibo groups and returns the expected names that are absent,
    /// comparing on the <c>group</c> field case-insensitively.
    /// </summary>
    private static async Task<List<string>> FindMissingGroupsAsync(IXiboApiClient client, string[] expected)
    {
        var groups = await client.GetAllPagesAsync(
            (start, length) => client.GetGroupsAsync(start, length));

        var existing = groups
            .Select(g => g.TryGetValue("group", out var n) ? n?.ToString() : null)
            .Where(n => n is not null)
            .ToHashSet(StringComparer.OrdinalIgnoreCase);

        return expected.Where(e => !existing.Contains(e)).ToList();
    }

    /// <summary>
    /// Builds a Xibo API client for the instance from its OAuth registry entry and the
    /// stored client secret. Returns a <c>null</c> client when either is unavailable.
    /// </summary>
    private async Task<(IXiboApiClient? Client, string Abbrev)> ResolveAsync(Instance instance)
    {
        var abbrev = instance.Customer.Abbreviation;

        // Use the most recently created registration: only one secret is stored per
        // abbreviation, so an arbitrary (possibly superseded) row could pair the wrong
        // client id with it.
        var oauthApp = instance.OauthAppRegistries
            .OrderByDescending(o => o.CreatedAt)
            .FirstOrDefault();
        if (oauthApp is null) return (null, abbrev);

        var settings = _services.GetRequiredService<Core.Services.SettingsService>();
        var secret = await settings.GetAsync(Core.Services.SettingsService.InstanceOAuthSecretId(abbrev));
        if (string.IsNullOrEmpty(secret)) return (null, abbrev);

        var client = await _clientFactory.CreateAsync(instance.XiboUrl, oauthApp.ClientId, secret);
        return (client, abbrev);
    }
}

View File

@@ -0,0 +1,63 @@
using OTSSignsOrchestrator.Server.Clients;
using OTSSignsOrchestrator.Server.Data.Entities;
namespace OTSSignsOrchestrator.Server.Health.Checks;
/// <summary>
/// Looks for evidence of the <c>invite-{abbrev}</c> invitation flow in Authentik.
/// The lookup is a group search by that name; when no exactly-named entry comes back the
/// result is only Degraded, because the invitation may exist as a separate stage object
/// that this search cannot see.
/// </summary>
public sealed class InvitationFlowHealthCheck : IHealthCheck
{
    private readonly IAuthentikClient _authentikClient;
    private readonly ILogger<InvitationFlowHealthCheck> _logger;

    public string CheckName => "InvitationFlow";
    public bool AutoRemediate => false;

    /// <summary>Creates the check with the Authentik client it queries and a logger.</summary>
    public InvitationFlowHealthCheck(
        IAuthentikClient authentikClient,
        ILogger<InvitationFlowHealthCheck> logger)
        => (_authentikClient, _logger) = (authentikClient, logger);

    /// <summary>
    /// Searches Authentik for an entry named <c>invite-{abbrev}</c> (case-insensitive match).
    /// </summary>
    /// <param name="instance">Instance whose customer abbreviation forms the expected name.</param>
    /// <param name="ct">Cancellation token (not used by this check).</param>
    /// <returns>
    /// Healthy when an exactly-named entry is returned, Degraded when none is, and
    /// Critical when the Authentik call throws.
    /// </returns>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        var flowName = "invite-" + instance.Customer.Abbreviation;

        try
        {
            // Query by name, then require an exact name match among the results.
            var reply = await _authentikClient.ListGroupsAsync(flowName);

            var confirmed =
                reply.IsSuccessStatusCode &&
                reply.Content?.Results is { Count: > 0 } candidates &&
                candidates.Any(entry =>
                    entry.TryGetValue("name", out var rawName) &&
                    string.Equals(rawName?.ToString(), flowName, StringComparison.OrdinalIgnoreCase));

            if (!confirmed)
            {
                // Absence from the search is inconclusive, hence Degraded rather than Critical.
                return new HealthCheckResult(
                    HealthStatus.Degraded,
                    $"Invitation flow '{flowName}' not found in Authentik",
                    "The invitation may exist but could not be verified via group search");
            }

            return new HealthCheckResult(
                HealthStatus.Healthy,
                $"Invitation flow '{flowName}' exists in Authentik");
        }
        catch (Exception failure)
        {
            return new HealthCheckResult(
                HealthStatus.Critical,
                $"Failed to check invitation flow: {failure.Message}");
        }
    }
}

View File

@@ -0,0 +1,106 @@
using Renci.SshNet;
using OTSSignsOrchestrator.Server.Data.Entities;
namespace OTSSignsOrchestrator.Server.Health.Checks;
/// <summary>
/// Verifies connectivity to the instance's MySQL database by running a simple query
/// via SSH against the Docker Swarm host.
/// </summary>
public sealed class MySqlConnectHealthCheck : IHealthCheck
{
    private readonly IServiceProvider _services;
    private readonly ILogger<MySqlConnectHealthCheck> _logger;

    public string CheckName => "MySqlConnect";
    public bool AutoRemediate => false;

    public MySqlConnectHealthCheck(
        IServiceProvider services,
        ILogger<MySqlConnectHealthCheck> logger)
    {
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// Runs <c>SELECT 1</c> against the instance database through the <c>mysql</c> CLI on
    /// the Swarm host.
    /// </summary>
    /// <param name="instance">Instance whose <c>MysqlDatabase</c> is tested.</param>
    /// <param name="ct">Cancellation token (not used; the SSH calls are synchronous).</param>
    /// <returns>
    /// Healthy when the command exits with status 0; Critical when no database is
    /// configured or any step (settings, SSH, query) fails.
    /// </returns>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        var dbName = instance.MysqlDatabase;
        if (string.IsNullOrEmpty(dbName))
            return new HealthCheckResult(HealthStatus.Critical, "No MySQL database configured");

        try
        {
            var settings = _services.GetRequiredService<Core.Services.SettingsService>();
            var sshInfo = await GetSwarmSshHostAsync(settings);
            var mysqlHost = await settings.GetAsync(Core.Services.SettingsService.MySqlHost, "localhost");
            var mysqlPort = await settings.GetAsync(Core.Services.SettingsService.MySqlPort, "3306");
            var mysqlUser = await settings.GetAsync(Core.Services.SettingsService.MySqlAdminUser, "root");
            var mysqlPass = await settings.GetAsync(Core.Services.SettingsService.MySqlAdminPassword, "");

            using var sshClient = CreateSshClient(sshInfo);
            sshClient.Connect();
            try
            {
                // Simple connectivity test — SELECT 1 against the instance database.
                // Every value is shell-quoted so that spaces, quotes or metacharacters in
                // settings (notably the password) cannot break or alter the command.
                var cmd = $"mysql -h {ShellQuote(mysqlHost)} -P {ShellQuote(mysqlPort)} -u {ShellQuote(mysqlUser)} " +
                          $"-p{ShellQuote(mysqlPass)} -e 'SELECT 1' {ShellQuote(dbName)} 2>&1";
                RunSshCommand(sshClient, cmd);

                return new HealthCheckResult(HealthStatus.Healthy,
                    $"MySQL connection to {dbName} successful");
            }
            finally
            {
                sshClient.Disconnect();
            }
        }
        catch (Exception ex)
        {
            return new HealthCheckResult(HealthStatus.Critical,
                $"MySQL connection failed for {dbName}: {ex.Message}");
        }
    }

    /// <summary>Reads the Swarm SSH connection settings; throws when no host is configured.</summary>
    private static async Task<SshConnectionInfo> GetSwarmSshHostAsync(Core.Services.SettingsService settings)
    {
        var host = await settings.GetAsync("Ssh.SwarmHost")
            ?? throw new InvalidOperationException("SSH Swarm host not configured.");
        var portStr = await settings.GetAsync("Ssh.SwarmPort", "22");
        var user = await settings.GetAsync("Ssh.SwarmUser", "root");
        var keyPath = await settings.GetAsync("Ssh.SwarmKeyPath");
        var password = await settings.GetAsync("Ssh.SwarmPassword");

        // Fall back to the default SSH port when the setting is not a number.
        if (!int.TryParse(portStr, out var port)) port = 22;
        return new SshConnectionInfo(host, port, user, keyPath, password);
    }

    /// <summary>
    /// Builds an SSH client using the configured key and/or password, falling back to
    /// <c>~/.ssh/id_rsa</c> when neither is configured.
    /// </summary>
    private static SshClient CreateSshClient(SshConnectionInfo info)
    {
        var authMethods = new List<AuthenticationMethod>();

        if (!string.IsNullOrEmpty(info.KeyPath))
            authMethods.Add(new PrivateKeyAuthenticationMethod(info.Username, new PrivateKeyFile(info.KeyPath)));
        if (!string.IsNullOrEmpty(info.Password))
            authMethods.Add(new PasswordAuthenticationMethod(info.Username, info.Password));

        if (authMethods.Count == 0)
        {
            var defaultKeyPath = Path.Combine(
                Environment.GetFolderPath(Environment.SpecialFolder.UserProfile), ".ssh", "id_rsa");
            if (File.Exists(defaultKeyPath))
                authMethods.Add(new PrivateKeyAuthenticationMethod(info.Username, new PrivateKeyFile(defaultKeyPath)));
            else
                throw new InvalidOperationException($"No SSH auth method for {info.Host}:{info.Port}.");
        }

        var connInfo = new Renci.SshNet.ConnectionInfo(info.Host, info.Port, info.Username, authMethods.ToArray());
        return new SshClient(connInfo);
    }

    /// <summary>
    /// Runs a command and returns its stdout; throws <see cref="InvalidOperationException"/>
    /// on a non-zero exit status.
    /// </summary>
    private static string RunSshCommand(SshClient client, string command)
    {
        using var cmd = client.RunCommand(command);
        if (cmd.ExitStatus != 0)
        {
            // Commands that redirect stderr with 2>&1 leave Error empty, so fall back to
            // stdout to keep the failure reason in the message.
            var detail = string.IsNullOrWhiteSpace(cmd.Error) ? cmd.Result : cmd.Error;
            throw new InvalidOperationException($"SSH command failed (exit {cmd.ExitStatus}): {detail?.Trim()}");
        }
        return cmd.Result;
    }

    /// <summary>
    /// Wraps a value in single quotes for a POSIX shell, escaping embedded single quotes
    /// as <c>'\''</c>. A <c>null</c> value becomes an empty quoted string.
    /// </summary>
    private static string ShellQuote(string? value) =>
        "'" + (value ?? string.Empty).Replace("'", "'\\''") + "'";

    internal sealed record SshConnectionInfo(string Host, int Port, string Username, string? KeyPath, string? Password);
}

View File

@@ -0,0 +1,121 @@
using Renci.SshNet;
using OTSSignsOrchestrator.Server.Data.Entities;
namespace OTSSignsOrchestrator.Server.Health.Checks;
/// <summary>
/// Verifies NFS paths for the instance are accessible by running <c>ls</c> via SSH.
/// </summary>
public sealed class NfsAccessHealthCheck : IHealthCheck
{
    private readonly IServiceProvider _services;
    private readonly ILogger<NfsAccessHealthCheck> _logger;

    public string CheckName => "NfsAccess";
    public bool AutoRemediate => false;

    public NfsAccessHealthCheck(
        IServiceProvider services,
        ILogger<NfsAccessHealthCheck> logger)
    {
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// Mounts the configured NFS export on a temporary directory on the Swarm host, lists
    /// the instance's path beneath it, then unmounts and removes the directory.
    /// </summary>
    /// <param name="instance">Instance whose <c>NfsPath</c> is tested.</param>
    /// <param name="ct">Cancellation token (not used; the SSH calls are synchronous).</param>
    /// <returns>
    /// Healthy when the path can be listed; Critical when the path or NFS settings are
    /// missing or any SSH step fails.
    /// </returns>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        var nfsPath = instance.NfsPath;
        if (string.IsNullOrEmpty(nfsPath))
            return new HealthCheckResult(HealthStatus.Critical, "No NFS path configured");

        try
        {
            var settings = _services.GetRequiredService<Core.Services.SettingsService>();
            var sshInfo = await GetSwarmSshHostAsync(settings);
            var nfsServer = await settings.GetAsync(Core.Services.SettingsService.NfsServer);
            var nfsExport = await settings.GetAsync(Core.Services.SettingsService.NfsExport);

            if (string.IsNullOrEmpty(nfsServer) || string.IsNullOrEmpty(nfsExport))
                return new HealthCheckResult(HealthStatus.Critical, "NFS server/export not configured");

            using var sshClient = CreateSshClient(sshInfo);
            sshClient.Connect();
            try
            {
                // Mount temporarily and check the path is listable.
                // The mount point is GUID-based and therefore shell-safe; values that come
                // from settings or the instance record are shell-quoted.
                var mountPoint = $"/tmp/healthcheck-nfs-{Guid.NewGuid():N}";
                var exportSpec = ShellQuote(nfsServer + ":" + nfsExport);
                var targetPath = ShellQuote(mountPoint + "/" + nfsPath);

                RunSshCommand(sshClient, $"sudo mkdir -p {mountPoint}");
                try
                {
                    RunSshCommand(sshClient, $"sudo mount -t nfs4 {exportSpec} {mountPoint}");
                    RunSshCommand(sshClient, $"ls {targetPath} 2>&1");

                    return new HealthCheckResult(HealthStatus.Healthy,
                        $"NFS path accessible: {nfsPath}");
                }
                finally
                {
                    // Best-effort cleanup; failures here must not mask the check result.
                    RunSshCommandAllowFailure(sshClient, $"sudo umount {mountPoint}");
                    RunSshCommandAllowFailure(sshClient, $"sudo rmdir {mountPoint}");
                }
            }
            finally
            {
                sshClient.Disconnect();
            }
        }
        catch (Exception ex)
        {
            return new HealthCheckResult(HealthStatus.Critical,
                $"NFS access check failed for {nfsPath}: {ex.Message}");
        }
    }

    /// <summary>Reads the Swarm SSH connection settings; throws when no host is configured.</summary>
    private static async Task<SshConnectionInfo> GetSwarmSshHostAsync(Core.Services.SettingsService settings)
    {
        var host = await settings.GetAsync("Ssh.SwarmHost")
            ?? throw new InvalidOperationException("SSH Swarm host not configured.");
        var portStr = await settings.GetAsync("Ssh.SwarmPort", "22");
        var user = await settings.GetAsync("Ssh.SwarmUser", "root");
        var keyPath = await settings.GetAsync("Ssh.SwarmKeyPath");
        var password = await settings.GetAsync("Ssh.SwarmPassword");

        // Fall back to the default SSH port when the setting is not a number.
        if (!int.TryParse(portStr, out var port)) port = 22;
        return new SshConnectionInfo(host, port, user, keyPath, password);
    }

    /// <summary>
    /// Builds an SSH client using the configured key and/or password, falling back to
    /// <c>~/.ssh/id_rsa</c> when neither is configured.
    /// </summary>
    private static SshClient CreateSshClient(SshConnectionInfo info)
    {
        var authMethods = new List<AuthenticationMethod>();

        if (!string.IsNullOrEmpty(info.KeyPath))
            authMethods.Add(new PrivateKeyAuthenticationMethod(info.Username, new PrivateKeyFile(info.KeyPath)));
        if (!string.IsNullOrEmpty(info.Password))
            authMethods.Add(new PasswordAuthenticationMethod(info.Username, info.Password));

        if (authMethods.Count == 0)
        {
            var defaultKeyPath = Path.Combine(
                Environment.GetFolderPath(Environment.SpecialFolder.UserProfile), ".ssh", "id_rsa");
            if (File.Exists(defaultKeyPath))
                authMethods.Add(new PrivateKeyAuthenticationMethod(info.Username, new PrivateKeyFile(defaultKeyPath)));
            else
                throw new InvalidOperationException($"No SSH auth method for {info.Host}:{info.Port}.");
        }

        var connInfo = new Renci.SshNet.ConnectionInfo(info.Host, info.Port, info.Username, authMethods.ToArray());
        return new SshClient(connInfo);
    }

    /// <summary>
    /// Runs a command and returns its stdout; throws <see cref="InvalidOperationException"/>
    /// on a non-zero exit status.
    /// </summary>
    private static string RunSshCommand(SshClient client, string command)
    {
        using var cmd = client.RunCommand(command);
        if (cmd.ExitStatus != 0)
        {
            // Commands that redirect stderr with 2>&1 leave Error empty, so fall back to
            // stdout to keep the failure reason in the message.
            var detail = string.IsNullOrWhiteSpace(cmd.Error) ? cmd.Result : cmd.Error;
            throw new InvalidOperationException($"SSH command failed (exit {cmd.ExitStatus}): {detail?.Trim()}");
        }
        return cmd.Result;
    }

    /// <summary>Runs a cleanup command, deliberately ignoring its exit status.</summary>
    private static void RunSshCommandAllowFailure(SshClient client, string command)
    {
        using var cmd = client.RunCommand(command);
        // Intentionally ignore exit code — cleanup operations
    }

    /// <summary>
    /// Wraps a value in single quotes for a POSIX shell, escaping embedded single quotes
    /// as <c>'\''</c>. A <c>null</c> value becomes an empty quoted string.
    /// </summary>
    private static string ShellQuote(string? value) =>
        "'" + (value ?? string.Empty).Replace("'", "'\\''") + "'";

    internal sealed record SshConnectionInfo(string Host, int Port, string Username, string? KeyPath, string? Password);
}

View File

@@ -0,0 +1,50 @@
using OTSSignsOrchestrator.Server.Data.Entities;
namespace OTSSignsOrchestrator.Server.Health.Checks;
/// <summary>
/// Reports how old the instance's newest OAuth2 application registration is, based on
/// <see cref="OauthAppRegistry.CreatedAt"/>. Degraded from 180 days, Critical from 365 days.
/// There is no auto-remediation; the result detail points the operator at a
/// 'rotate-credentials' job instead.
/// </summary>
public sealed class OauthAppAgeHealthCheck : IHealthCheck
{
    /// <summary>Age in days from which the result is Degraded.</summary>
    internal const int WarningThresholdDays = 180;

    /// <summary>Age in days from which the result is Critical.</summary>
    internal const int CriticalThresholdDays = 365;

    public string CheckName => "OauthAppAge";
    public bool AutoRemediate => false;

    /// <summary>
    /// Evaluates the age of the most recently created OAuth registration.
    /// </summary>
    /// <param name="instance">Instance whose OAuth registrations are inspected.</param>
    /// <param name="ct">Cancellation token (unused; the check is synchronous).</param>
    /// <returns>An already-completed task holding the evaluation.</returns>
    public Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
        => Task.FromResult(Evaluate(instance));

    /// <summary>Maps the newest registration's age onto a health result.</summary>
    private static HealthCheckResult Evaluate(Instance instance)
    {
        var newest = instance.OauthAppRegistries
            .OrderByDescending(o => o.CreatedAt)
            .FirstOrDefault();

        if (newest is null)
        {
            return new HealthCheckResult(HealthStatus.Degraded, "No OAuth app registered");
        }

        var ageDays = (DateTime.UtcNow - newest.CreatedAt).TotalDays;
        var wholeDays = (int)ageDays;

        // Healthy band first, then the two alert bands from least to most severe.
        if (ageDays < WarningThresholdDays)
        {
            return new HealthCheckResult(
                HealthStatus.Healthy,
                $"OAuth2 credentials are {wholeDays} days old");
        }

        if (ageDays < CriticalThresholdDays)
        {
            return new HealthCheckResult(
                HealthStatus.Degraded,
                $"OAuth2 credentials are {wholeDays} days old (warning threshold: {WarningThresholdDays}d)",
                "Schedule credential rotation before they reach 365 days");
        }

        return new HealthCheckResult(
            HealthStatus.Critical,
            $"OAuth2 credentials are {wholeDays} days old (critical threshold: {CriticalThresholdDays}d)",
            "Create a 'rotate-credentials' job to rotate the OAuth2 application");
    }
}

View File

@@ -0,0 +1,59 @@
using OTSSignsOrchestrator.Server.Clients;
using OTSSignsOrchestrator.Server.Data.Entities;
namespace OTSSignsOrchestrator.Server.Health.Checks;
/// <summary>
/// Verifies the OAuth2 app in <see cref="OauthAppRegistry"/> can still authenticate
/// by asking <see cref="XiboClientFactory"/> to create a client for the Xibo CMS instance
/// with the stored client id and secret.
/// AutoRemediate=false — credential rotation requires a separate job.
/// </summary>
public sealed class OauthAppHealthCheck : IHealthCheck
{
    private readonly XiboClientFactory _clientFactory;
    private readonly IServiceProvider _services;
    private readonly ILogger<OauthAppHealthCheck> _logger;

    public string CheckName => "OauthApp";
    public bool AutoRemediate => false;

    public OauthAppHealthCheck(
        XiboClientFactory clientFactory,
        IServiceProvider services,
        ILogger<OauthAppHealthCheck> logger)
    {
        _clientFactory = clientFactory;
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// Returns Healthy when a client can be created with the instance's stored credentials,
    /// Critical when no app is registered, the secret is missing, or client creation throws.
    /// </summary>
    /// <param name="instance">Instance whose OAuth registrations and customer are loaded.</param>
    /// <param name="ct">Not used by this check.</param>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        // Use the newest registration (same selection as OauthAppAgeHealthCheck) so that,
        // if several rows exist, the client id checked is the most recently created one.
        var oauthApp = instance.OauthAppRegistries
            .OrderByDescending(o => o.CreatedAt)
            .FirstOrDefault();
        if (oauthApp is null)
            return new HealthCheckResult(HealthStatus.Critical, "No OAuth app registered");

        var abbrev = instance.Customer.Abbreviation;
        var settings = _services.GetRequiredService<Core.Services.SettingsService>();
        var secret = await settings.GetAsync(Core.Services.SettingsService.InstanceOAuthSecretId(abbrev));
        if (string.IsNullOrEmpty(secret))
            return new HealthCheckResult(HealthStatus.Critical,
                "OAuth client secret not found in Bitwarden — cannot authenticate");

        try
        {
            // Creating the client is the test itself (presumably it obtains a token via
            // client_credentials — confirm against XiboClientFactory); the client is not needed.
            _ = await _clientFactory.CreateAsync(instance.XiboUrl, oauthApp.ClientId, secret);
            return new HealthCheckResult(HealthStatus.Healthy, "OAuth2 client_credentials flow successful");
        }
        catch (Exception ex)
        {
            // Keep the full exception in the log; the result only carries the message text.
            _logger.LogWarning(ex, "OAuth2 authentication check failed for {Abbrev}", abbrev);
            return new HealthCheckResult(HealthStatus.Critical,
                $"OAuth2 authentication failed: {ex.Message}",
                "Credential rotation job may be required");
        }
    }
}

View File

@@ -0,0 +1,127 @@
using Renci.SshNet;
using OTSSignsOrchestrator.Server.Data.Entities;
namespace OTSSignsOrchestrator.Server.Health.Checks;
/// <summary>
/// Verifies the Docker stack is healthy by running <c>docker stack ps {stackName}</c>
/// via SSH and checking that every task whose desired state is Running is currently Running.
/// </summary>
public sealed class StackHealthCheck : IHealthCheck
{
    private readonly IServiceProvider _services;
    private readonly ILogger<StackHealthCheck> _logger;

    public string CheckName => "StackHealth";
    public bool AutoRemediate => false;

    public StackHealthCheck(
        IServiceProvider services,
        ILogger<StackHealthCheck> logger)
    {
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// Connects to the configured Swarm host, lists the stack's tasks and reports Critical
    /// when any task that should be running is not. Any failure (missing configuration,
    /// SSH error, non-zero exit code) is logged and reported as Critical.
    /// </summary>
    /// <param name="instance">Instance whose <c>DockerStackName</c> is inspected.</param>
    /// <param name="ct">Not used; the SSH calls below are synchronous.</param>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        var stackName = instance.DockerStackName;
        if (string.IsNullOrEmpty(stackName))
            return new HealthCheckResult(HealthStatus.Critical, "No Docker stack name configured");

        try
        {
            var settings = _services.GetRequiredService<Core.Services.SettingsService>();
            var sshInfo = await GetSwarmSshHostAsync(settings);

            using var sshClient = CreateSshClient(sshInfo);
            sshClient.Connect();
            try
            {
                // Get task status for all services in the stack. The stack name is
                // shell-quoted so it always reaches docker as a single literal argument.
                var output = RunSshCommand(sshClient,
                    $"docker stack ps {ShellQuote(stackName)} --format '{{{{.Name}}}}|{{{{.CurrentState}}}}|{{{{.DesiredState}}}}'");

                var notRunning = FindTasksNotRunning(output);

                if (notRunning.Count == 0)
                    return new HealthCheckResult(HealthStatus.Healthy,
                        $"All services in {stackName} are Running");

                return new HealthCheckResult(HealthStatus.Critical,
                    $"{notRunning.Count} service(s) not running in {stackName}",
                    string.Join("\n", notRunning));
            }
            finally
            {
                sshClient.Disconnect();
            }
        }
        catch (Exception ex)
        {
            // Keep the full exception in the log; the result only carries the message text.
            _logger.LogWarning(ex, "Stack health check failed for {StackName}", stackName);
            return new HealthCheckResult(HealthStatus.Critical,
                $"SSH check failed for {stackName}: {ex.Message}");
        }
    }

    /// <summary>
    /// Parses <c>Name|CurrentState|DesiredState</c> lines and returns a
    /// <c>"name: currentState"</c> entry for each task that should be running but is not.
    /// Lines with fewer than three fields are ignored.
    /// </summary>
    private static List<string> FindTasksNotRunning(string output)
    {
        var notRunning = new List<string>();

        foreach (var line in output.Split('\n', StringSplitOptions.RemoveEmptyEntries))
        {
            var parts = line.Split('|');
            if (parts.Length < 3) continue;

            var name = parts[0].Trim();
            var currentState = parts[1].Trim();
            var desiredState = parts[2].Trim();

            // Only check tasks whose desired state is Running. CurrentState carries a
            // suffix after the state word, hence the prefix match.
            if (desiredState.Equals("Running", StringComparison.OrdinalIgnoreCase) &&
                !currentState.StartsWith("Running", StringComparison.OrdinalIgnoreCase))
            {
                notRunning.Add($"{name}: {currentState}");
            }
        }

        return notRunning;
    }

    /// <summary>
    /// Wraps <paramref name="value"/> in single quotes for a POSIX shell, escaping any
    /// embedded single quote as <c>'\''</c>.
    /// </summary>
    private static string ShellQuote(string value)
        => "'" + value.Replace("'", "'\\''") + "'";

    /// <summary>
    /// Reads the Swarm SSH settings (<c>Ssh.SwarmHost</c>, <c>Ssh.SwarmPort</c>,
    /// <c>Ssh.SwarmUser</c>, <c>Ssh.SwarmKeyPath</c>, <c>Ssh.SwarmPassword</c>).
    /// An unparsable port falls back to 22.
    /// </summary>
    /// <exception cref="InvalidOperationException"><c>Ssh.SwarmHost</c> is not configured.</exception>
    private static async Task<SshConnectionInfo> GetSwarmSshHostAsync(Core.Services.SettingsService settings)
    {
        var host = await settings.GetAsync("Ssh.SwarmHost")
            ?? throw new InvalidOperationException("SSH Swarm host not configured.");
        var portStr = await settings.GetAsync("Ssh.SwarmPort", "22");
        var user = await settings.GetAsync("Ssh.SwarmUser", "root");
        var keyPath = await settings.GetAsync("Ssh.SwarmKeyPath");
        var password = await settings.GetAsync("Ssh.SwarmPassword");

        if (!int.TryParse(portStr, out var port)) port = 22;

        return new SshConnectionInfo(host, port, user, keyPath, password);
    }

    /// <summary>
    /// Builds an unconnected SSH client using the configured key and/or password, falling
    /// back to the current user's <c>~/.ssh/id_rsa</c> when neither is configured.
    /// </summary>
    /// <exception cref="InvalidOperationException">No authentication method is available.</exception>
    private static SshClient CreateSshClient(SshConnectionInfo info)
    {
        var authMethods = new List<AuthenticationMethod>();

        if (!string.IsNullOrEmpty(info.KeyPath))
            authMethods.Add(new PrivateKeyAuthenticationMethod(info.Username, new PrivateKeyFile(info.KeyPath)));

        if (!string.IsNullOrEmpty(info.Password))
            authMethods.Add(new PasswordAuthenticationMethod(info.Username, info.Password));

        if (authMethods.Count == 0)
        {
            var defaultKeyPath = Path.Combine(
                Environment.GetFolderPath(Environment.SpecialFolder.UserProfile), ".ssh", "id_rsa");
            if (File.Exists(defaultKeyPath))
                authMethods.Add(new PrivateKeyAuthenticationMethod(info.Username, new PrivateKeyFile(defaultKeyPath)));
            else
                throw new InvalidOperationException($"No SSH auth method for {info.Host}:{info.Port}.");
        }

        var connInfo = new Renci.SshNet.ConnectionInfo(info.Host, info.Port, info.Username, authMethods.ToArray());
        return new SshClient(connInfo);
    }

    /// <summary>
    /// Runs a command over SSH and returns its output.
    /// </summary>
    /// <exception cref="InvalidOperationException">The command exited with a non-zero status.</exception>
    private static string RunSshCommand(SshClient client, string command)
    {
        using var cmd = client.RunCommand(command);
        if (cmd.ExitStatus != 0)
            throw new InvalidOperationException($"SSH command failed (exit {cmd.ExitStatus}): {cmd.Error}");
        return cmd.Result;
    }

    /// <summary>Connection details for the Swarm SSH host; key path and password are optional.</summary>
    internal sealed record SshConnectionInfo(string Host, int Port, string Username, string? KeyPath, string? Password);
}

View File

@@ -0,0 +1,145 @@
using OTSSignsOrchestrator.Server.Clients;
using OTSSignsOrchestrator.Server.Data;
using OTSSignsOrchestrator.Server.Data.Entities;
namespace OTSSignsOrchestrator.Server.Health.Checks;
/// <summary>
/// Verifies the Xibo CMS theme is set to <c>otssigns</c> by reading the CMS settings.
/// Auto-remediates by updating the <c>THEME_NAME</c> setting if the theme is incorrect.
/// </summary>
public sealed class ThemeHealthCheck : IHealthCheck
{
    private readonly XiboClientFactory _clientFactory;
    private readonly IServiceProvider _services;
    private readonly ILogger<ThemeHealthCheck> _logger;

    private const string ExpectedTheme = "otssigns";

    public string CheckName => "Theme";
    public bool AutoRemediate => true;

    public ThemeHealthCheck(
        XiboClientFactory clientFactory,
        IServiceProvider services,
        ILogger<ThemeHealthCheck> logger)
    {
        _clientFactory = clientFactory;
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// Returns Healthy when <c>THEME_NAME</c> equals the expected theme (case-insensitive);
    /// Critical when it differs, cannot be read, or no API client can be created.
    /// </summary>
    /// <param name="instance">Instance with customer and OAuth registrations loaded.</param>
    /// <param name="ct">Not used by this check.</param>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        var (client, abbrev) = await ResolveAsync(instance);
        if (client is null)
            return new HealthCheckResult(HealthStatus.Critical, "No OAuth app — cannot check theme");

        try
        {
            var settingsResp = await client.GetSettingsAsync();
            if (!settingsResp.IsSuccessStatusCode)
                return new HealthCheckResult(HealthStatus.Critical,
                    $"GET /settings returned {settingsResp.StatusCode}");

            var settings = settingsResp.Content;
            if (settings is null)
                return new HealthCheckResult(HealthStatus.Critical, "Settings response was null");

            // Xibo returns settings as a list of { setting, value } objects or a dictionary
            var themeName = ExtractSetting(settings, "THEME_NAME");
            if (string.Equals(themeName, ExpectedTheme, StringComparison.OrdinalIgnoreCase))
                return new HealthCheckResult(HealthStatus.Healthy, $"Theme is {ExpectedTheme}");

            return new HealthCheckResult(HealthStatus.Critical,
                $"Theme is '{themeName}', expected '{ExpectedTheme}'");
        }
        catch (Exception ex)
        {
            // Keep the full exception in the log; the result only carries the message text.
            _logger.LogWarning(ex, "Theme check failed for {Abbrev}", abbrev);
            return new HealthCheckResult(HealthStatus.Critical,
                $"Theme check failed: {ex.Message}");
        }
    }

    /// <summary>
    /// Sets <c>THEME_NAME</c> back to the expected theme and, on success, writes a
    /// <c>FixTheme</c> audit row in its own DI scope.
    /// </summary>
    /// <returns>True when the settings update succeeded; false otherwise.</returns>
    public async Task<bool> RemediateAsync(Instance instance, CancellationToken ct)
    {
        var (client, _) = await ResolveAsync(instance);
        if (client is null) return false;

        try
        {
            var resp = await client.UpdateSettingsAsync(
                new UpdateSettingsRequest(new Dictionary<string, string>
                {
                    ["THEME_NAME"] = ExpectedTheme,
                }));

            if (resp.IsSuccessStatusCode)
            {
                await using var scope = _services.CreateAsyncScope();
                var db = scope.ServiceProvider.GetRequiredService<OrchestratorDbContext>();
                db.AuditLogs.Add(new AuditLog
                {
                    Id = Guid.NewGuid(),
                    InstanceId = instance.Id,
                    Actor = "HealthCheckEngine:Theme",
                    Action = "FixTheme",
                    Target = instance.Customer.Abbreviation,
                    Outcome = "Success",
                    Detail = $"Reset THEME_NAME to {ExpectedTheme}",
                    OccurredAt = DateTime.UtcNow,
                });
                await db.SaveChangesAsync(ct);
                return true;
            }

            _logger.LogError("Failed to fix theme: {Err}", resp.Error?.Content);
            return false;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Theme remediation failed");
            return false;
        }
    }

    /// <summary>
    /// Looks up <paramref name="key"/> in a settings payload that is either a JSON object
    /// (property lookup, case-sensitive) or a JSON array of <c>{ setting, value }</c> objects
    /// (setting name matched case-insensitively). Returns null when the payload is not a
    /// <see cref="System.Text.Json.JsonElement"/> or the key is absent.
    /// </summary>
    private static string? ExtractSetting(object settingsObj, string key)
    {
        if (settingsObj is not System.Text.Json.JsonElement je)
            return null;

        if (je.ValueKind == System.Text.Json.JsonValueKind.Object)
            return je.TryGetProperty(key, out var val) ? ReadAsText(val) : null;

        if (je.ValueKind == System.Text.Json.JsonValueKind.Array)
        {
            foreach (var item in je.EnumerateArray())
            {
                // TryGetProperty throws on non-object elements, so skip anything else.
                if (item.ValueKind != System.Text.Json.JsonValueKind.Object)
                    continue;

                if (item.TryGetProperty("setting", out var settingProp) &&
                    settingProp.ValueKind == System.Text.Json.JsonValueKind.String &&
                    string.Equals(settingProp.GetString(), key, StringComparison.OrdinalIgnoreCase) &&
                    item.TryGetProperty("value", out var valueProp))
                {
                    return ReadAsText(valueProp);
                }
            }
        }

        return null;
    }

    /// <summary>
    /// Returns the string value of a JSON element, null for JSON null, and the raw JSON text
    /// for any other kind (GetString would throw for numbers, booleans, etc.).
    /// </summary>
    private static string? ReadAsText(System.Text.Json.JsonElement element) => element.ValueKind switch
    {
        System.Text.Json.JsonValueKind.String => element.GetString(),
        System.Text.Json.JsonValueKind.Null => null,
        _ => element.GetRawText(),
    };

    /// <summary>
    /// Creates an API client from the instance's newest OAuth registration and its stored
    /// secret. The client is null when no registration or no secret exists.
    /// </summary>
    private async Task<(IXiboApiClient? Client, string Abbrev)> ResolveAsync(Instance instance)
    {
        var abbrev = instance.Customer.Abbreviation;

        // Newest registration first, matching how OauthAppAgeHealthCheck selects the app.
        var oauthApp = instance.OauthAppRegistries
            .OrderByDescending(o => o.CreatedAt)
            .FirstOrDefault();
        if (oauthApp is null) return (null, abbrev);

        var settings = _services.GetRequiredService<Core.Services.SettingsService>();
        var secret = await settings.GetAsync(Core.Services.SettingsService.InstanceOAuthSecretId(abbrev));
        if (string.IsNullOrEmpty(secret)) return (null, abbrev);

        var client = await _clientFactory.CreateAsync(instance.XiboUrl, oauthApp.ClientId, secret);
        return (client, abbrev);
    }
}

View File

@@ -0,0 +1,61 @@
using OTSSignsOrchestrator.Server.Clients;
using OTSSignsOrchestrator.Server.Data.Entities;
namespace OTSSignsOrchestrator.Server.Health.Checks;
/// <summary>
/// Verifies the Xibo CMS API is reachable by calling GET /about and expecting a success response.
/// </summary>
public sealed class XiboApiHealthCheck : IHealthCheck
{
    private readonly XiboClientFactory _clientFactory;
    private readonly IServiceProvider _services;
    private readonly ILogger<XiboApiHealthCheck> _logger;

    public string CheckName => "XiboApi";
    public bool AutoRemediate => false;

    public XiboApiHealthCheck(
        XiboClientFactory clientFactory,
        IServiceProvider services,
        ILogger<XiboApiHealthCheck> logger)
    {
        _clientFactory = clientFactory;
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// Returns Healthy when GET /about succeeds; Critical when it returns a failure status,
    /// throws, or no API client can be created for the instance.
    /// </summary>
    /// <param name="instance">Instance with customer and OAuth registrations loaded.</param>
    /// <param name="ct">Not used by this check.</param>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        var client = await ResolveClientAsync(instance);
        if (client is null)
            return new HealthCheckResult(HealthStatus.Critical, "No OAuth app registered — cannot reach API");

        try
        {
            var response = await client.GetAboutAsync();
            return response.IsSuccessStatusCode
                ? new HealthCheckResult(HealthStatus.Healthy, "Xibo API reachable")
                : new HealthCheckResult(HealthStatus.Critical,
                    $"Xibo API returned {response.StatusCode}",
                    response.Error?.Content);
        }
        catch (Exception ex)
        {
            // Keep the full exception in the log; the result only carries the message text.
            _logger.LogWarning(ex, "Xibo API check failed for {XiboUrl}", instance.XiboUrl);
            return new HealthCheckResult(HealthStatus.Critical, $"Xibo API unreachable: {ex.Message}");
        }
    }

    /// <summary>
    /// Creates an API client from the instance's newest OAuth registration and its stored
    /// secret. Returns null when no registration or no secret exists.
    /// </summary>
    private async Task<IXiboApiClient?> ResolveClientAsync(Instance instance)
    {
        // Newest registration first, matching how OauthAppAgeHealthCheck selects the app.
        var oauthApp = instance.OauthAppRegistries
            .OrderByDescending(o => o.CreatedAt)
            .FirstOrDefault();
        if (oauthApp is null) return null;

        var settings = _services.GetRequiredService<OTSSignsOrchestrator.Core.Services.SettingsService>();
        var abbrev = instance.Customer.Abbreviation;
        var secret = await settings.GetAsync(Core.Services.SettingsService.InstanceOAuthSecretId(abbrev));
        if (string.IsNullOrEmpty(secret)) return null;

        return await _clientFactory.CreateAsync(instance.XiboUrl, oauthApp.ClientId, secret);
    }
}

View File

@@ -0,0 +1,87 @@
using OTSSignsOrchestrator.Server.Clients;
using OTSSignsOrchestrator.Server.Data.Entities;
using Microsoft.Extensions.Configuration;
namespace OTSSignsOrchestrator.Server.Health.Checks;
/// <summary>
/// Compares the installed Xibo CMS version (from GET /about) against the latest known
/// release configured in <c>HealthChecks:LatestXiboVersion</c>. Reports Degraded if behind.
/// Versions are compared numerically when both parse as <see cref="System.Version"/>;
/// otherwise the check falls back to a case-insensitive string comparison.
/// </summary>
public sealed class XiboVersionHealthCheck : IHealthCheck
{
    private readonly XiboClientFactory _clientFactory;
    private readonly IServiceProvider _services;
    private readonly IConfiguration _configuration;
    private readonly ILogger<XiboVersionHealthCheck> _logger;

    public string CheckName => "XiboVersion";
    public bool AutoRemediate => false;

    public XiboVersionHealthCheck(
        XiboClientFactory clientFactory,
        IServiceProvider services,
        IConfiguration configuration,
        ILogger<XiboVersionHealthCheck> logger)
    {
        _clientFactory = clientFactory;
        _services = services;
        _configuration = configuration;
        _logger = logger;
    }

    /// <summary>
    /// Returns Healthy when the installed version matches or is ahead of the configured one
    /// (or when no latest version is configured), Degraded when it is behind or cannot be
    /// determined, and Critical when the API call fails or no client can be created.
    /// </summary>
    /// <param name="instance">Instance with customer and OAuth registrations loaded.</param>
    /// <param name="ct">Not used by this check.</param>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        var latestVersion = _configuration["HealthChecks:LatestXiboVersion"];
        if (string.IsNullOrEmpty(latestVersion))
            return new HealthCheckResult(HealthStatus.Healthy, "LatestXiboVersion not configured — skipping");

        var (client, abbrev) = await ResolveAsync(instance);
        if (client is null)
            return new HealthCheckResult(HealthStatus.Critical, "No OAuth app — cannot check version");

        try
        {
            var response = await client.GetAboutAsync();
            if (!response.IsSuccessStatusCode || response.Content is null)
                return new HealthCheckResult(HealthStatus.Critical, "GET /about failed");

            // Guard the JSON kinds: TryGetProperty/GetString throw on unexpected kinds,
            // which would otherwise surface as a Critical "Version check failed".
            string? installedVersion = null;
            if (response.Content is System.Text.Json.JsonElement je &&
                je.ValueKind == System.Text.Json.JsonValueKind.Object &&
                je.TryGetProperty("version", out var verProp) &&
                verProp.ValueKind == System.Text.Json.JsonValueKind.String)
            {
                installedVersion = verProp.GetString();
            }

            if (string.IsNullOrEmpty(installedVersion))
                return new HealthCheckResult(HealthStatus.Degraded, "Could not determine installed version");

            if (string.Equals(installedVersion, latestVersion, StringComparison.OrdinalIgnoreCase))
                return new HealthCheckResult(HealthStatus.Healthy,
                    $"Xibo version {installedVersion} is current");

            // An instance that is ahead of the configured "latest" is not behind, so it
            // must not be reported as Degraded.
            if (IsAtLeast(installedVersion, latestVersion))
                return new HealthCheckResult(HealthStatus.Healthy,
                    $"Xibo version {installedVersion} is ahead of configured latest {latestVersion}");

            return new HealthCheckResult(HealthStatus.Degraded,
                $"Xibo version {installedVersion}, latest is {latestVersion}",
                "Consider scheduling an upgrade");
        }
        catch (Exception ex)
        {
            // Keep the full exception in the log; the result only carries the message text.
            _logger.LogWarning(ex, "Xibo version check failed for {Abbrev}", abbrev);
            return new HealthCheckResult(HealthStatus.Critical,
                $"Version check failed: {ex.Message}");
        }
    }

    /// <summary>
    /// True when both strings parse as dotted numeric versions (an optional leading
    /// <c>v</c> is ignored) and <paramref name="installed"/> is greater than or equal to
    /// <paramref name="latest"/>. False when either string does not parse.
    /// </summary>
    private static bool IsAtLeast(string installed, string latest)
    {
        return System.Version.TryParse(Normalize(installed), out var have)
            && System.Version.TryParse(Normalize(latest), out var want)
            && have >= want;

        static string Normalize(string raw) => raw.Trim().TrimStart('v', 'V');
    }

    /// <summary>
    /// Creates an API client from the instance's newest OAuth registration and its stored
    /// secret. The client is null when no registration or no secret exists.
    /// </summary>
    private async Task<(IXiboApiClient? Client, string Abbrev)> ResolveAsync(Instance instance)
    {
        var abbrev = instance.Customer.Abbreviation;

        // Newest registration first, matching how OauthAppAgeHealthCheck selects the app.
        var oauthApp = instance.OauthAppRegistries
            .OrderByDescending(o => o.CreatedAt)
            .FirstOrDefault();
        if (oauthApp is null) return (null, abbrev);

        var settings = _services.GetRequiredService<Core.Services.SettingsService>();
        var secret = await settings.GetAsync(Core.Services.SettingsService.InstanceOAuthSecretId(abbrev));
        if (string.IsNullOrEmpty(secret)) return (null, abbrev);

        var client = await _clientFactory.CreateAsync(instance.XiboUrl, oauthApp.ClientId, secret);
        return (client, abbrev);
    }
}

View File

@@ -0,0 +1,289 @@
using Microsoft.AspNetCore.SignalR;
using Microsoft.EntityFrameworkCore;
using Quartz;
using OTSSignsOrchestrator.Server.Data;
using OTSSignsOrchestrator.Server.Data.Entities;
using OTSSignsOrchestrator.Server.Hubs;
namespace OTSSignsOrchestrator.Server.Health;
/// <summary>
/// Background service that, once per <see cref="DefaultCheckInterval"/>, loads every
/// <see cref="Instance"/> whose customer is active and schedules one
/// <see cref="InstanceHealthCheckJob"/> per instance through Quartz.
///
/// Start times are staggered across 80 % of the interval to avoid a thundering herd.
/// Running the checks, persisting results and broadcasting status changes is done by
/// <see cref="InstanceHealthCheckJob"/>, not by this class.
/// </summary>
public sealed class HealthCheckEngine : BackgroundService
{
    /// <summary>Default interval between full health-check sweeps.</summary>
    internal static readonly TimeSpan DefaultCheckInterval = TimeSpan.FromMinutes(5);

    private readonly IServiceProvider _services;
    private readonly ISchedulerFactory _schedulerFactory;
    private readonly ILogger<HealthCheckEngine> _logger;

    public HealthCheckEngine(
        IServiceProvider services,
        ISchedulerFactory schedulerFactory,
        ILogger<HealthCheckEngine> logger)
    {
        _services = services;
        _schedulerFactory = schedulerFactory;
        _logger = logger;
    }

    /// <summary>
    /// Main loop: after a short start-up delay, schedules a sweep, then sleeps for
    /// <see cref="DefaultCheckInterval"/>. Scheduling errors other than cancellation are
    /// logged and do not end the loop.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        // Wait briefly for the rest of the app to start
        var startupGrace = TimeSpan.FromSeconds(5);
        await Task.Delay(startupGrace, stoppingToken);

        var scheduler = await _schedulerFactory.GetScheduler(stoppingToken);

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                await ScheduleInstanceChecks(scheduler, stoppingToken);
            }
            catch (Exception ex) when (ex is not OperationCanceledException)
            {
                _logger.LogError(ex, "Error scheduling health check sweep");
            }

            await Task.Delay(DefaultCheckInterval, stoppingToken);
        }
    }

    /// <summary>
    /// Loads all instances of active customers and schedules one staggered one-shot job
    /// for each, evenly spaced from "now" to 80 % of the check interval.
    /// </summary>
    private async Task ScheduleInstanceChecks(IScheduler scheduler, CancellationToken ct)
    {
        await using var scope = _services.CreateAsyncScope();
        var db = scope.ServiceProvider.GetRequiredService<OrchestratorDbContext>();

        var active = await db.Instances
            .AsNoTracking()
            .Include(i => i.Customer)
            .Where(i => i.Customer.Status == CustomerStatus.Active)
            .ToListAsync(ct);

        if (active.Count == 0)
        {
            return;
        }

        // Spread jobs across 80 % of the check interval to leave a buffer
        var windowMs = (int)(DefaultCheckInterval.TotalMilliseconds * 0.8);
        var gapMs = active.Count > 1 ? windowMs / (active.Count - 1) : 0;

        var position = 0;
        foreach (var instance in active)
        {
            var startAfter = TimeSpan.FromMilliseconds(gapMs * position);
            await ScheduleOneAsync(scheduler, instance.Id, startAfter, ct);
            position++;
        }

        _logger.LogInformation(
            "Scheduled health checks for {Count} active instance(s)", active.Count);
    }

    /// <summary>
    /// Replaces any existing job for the instance with a new one-shot job that starts
    /// <paramref name="startAfter"/> from now.
    /// </summary>
    private static async Task ScheduleOneAsync(
        IScheduler scheduler, Guid instanceId, TimeSpan startAfter, CancellationToken ct)
    {
        var key = new JobKey($"health-{instanceId}", "health-checks");

        // Remove previous trigger if it still exists (idempotent reschedule)
        var alreadyScheduled = await scheduler.CheckExists(key, ct);
        if (alreadyScheduled)
        {
            await scheduler.DeleteJob(key, ct);
        }

        var detail = JobBuilder.Create<InstanceHealthCheckJob>()
            .WithIdentity(key)
            .UsingJobData("instanceId", instanceId.ToString())
            .Build();

        var oneShot = TriggerBuilder.Create()
            .WithIdentity($"health-{instanceId}-trigger", "health-checks")
            .StartAt(DateTimeOffset.UtcNow.Add(startAfter))
            .Build();

        await scheduler.ScheduleJob(detail, oneShot, ct);
    }
}
/// <summary>
/// Quartz job that executes all <see cref="IHealthCheck"/> implementations for a single instance,
/// persists one <see cref="HealthEvent"/> per check, attempts auto-remediation for Critical
/// results of checks that opt in, stores the worst remaining severity on the instance and
/// broadcasts a status change via <see cref="FleetHub"/>.
/// </summary>
[DisallowConcurrentExecution]
public sealed class InstanceHealthCheckJob : IJob
{
    /// <summary>Global concurrency limiter — max 4 parallel health check runs.</summary>
    private static readonly SemaphoreSlim s_concurrency = new(4);

    private readonly IServiceProvider _services;
    private readonly ILogger<InstanceHealthCheckJob> _logger;

    public InstanceHealthCheckJob(
        IServiceProvider services,
        ILogger<InstanceHealthCheckJob> logger)
    {
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// Reads the <c>instanceId</c> job-data entry and runs the checks for that instance
    /// while holding one slot of the global concurrency limiter.
    /// </summary>
    public async Task Execute(IJobExecutionContext context)
    {
        var instanceIdStr = context.MergedJobDataMap.GetString("instanceId");
        if (!Guid.TryParse(instanceIdStr, out var instanceId))
        {
            _logger.LogWarning("InstanceHealthCheckJob: invalid instanceId {Id}", instanceIdStr);
            return;
        }

        await s_concurrency.WaitAsync(context.CancellationToken);
        try
        {
            await RunChecksForInstanceAsync(instanceId, context.CancellationToken);
        }
        finally
        {
            s_concurrency.Release();
        }
    }

    /// <summary>
    /// Runs every registered check (except <c>AuthentikGlobal</c>) against the instance and
    /// saves all resulting events, audit rows and the aggregated status in one SaveChanges call.
    /// </summary>
    private async Task RunChecksForInstanceAsync(Guid instanceId, CancellationToken ct)
    {
        await using var scope = _services.CreateAsyncScope();
        var db = scope.ServiceProvider.GetRequiredService<OrchestratorDbContext>();
        var hub = scope.ServiceProvider.GetRequiredService<IHubContext<FleetHub, IFleetClient>>();
        var checks = scope.ServiceProvider.GetServices<IHealthCheck>();

        var instance = await db.Instances
            .Include(i => i.Customer)
            .Include(i => i.OauthAppRegistries)
            .Include(i => i.ByoiConfigs)
            .FirstOrDefaultAsync(i => i.Id == instanceId, ct);

        if (instance is null)
        {
            _logger.LogWarning("InstanceHealthCheckJob: instance {Id} not found", instanceId);
            return;
        }

        var abbrev = instance.Customer.Abbreviation;
        var worstStatus = HealthStatus.Healthy;

        foreach (var check in checks)
        {
            // Skip the AuthentikGlobalHealthCheck — it runs on its own schedule
            if (check.CheckName == "AuthentikGlobal")
                continue;

            HealthCheckResult result;
            try
            {
                result = await check.RunAsync(instance, ct);
            }
            // A cancelled job must stop, not be recorded as a Critical result (which could
            // also trigger remediation), so cancellation is left to propagate.
            catch (Exception ex) when (!IsJobCancellation(ex, ct))
            {
                _logger.LogError(ex, "Health check {Check} failed for {Abbrev}", check.CheckName, abbrev);
                result = new HealthCheckResult(HealthStatus.Critical, $"Check threw exception: {ex.Message}");
            }

            // Persist HealthEvent
            var healthEvent = new HealthEvent
            {
                Id = Guid.NewGuid(),
                InstanceId = instanceId,
                CheckName = check.CheckName,
                Status = ToEventStatus(result.Status),
                Message = result.Message,
                Remediated = false,
                OccurredAt = DateTime.UtcNow,
            };

            // Auto-remediation
            if (check.AutoRemediate && result.Status == HealthStatus.Critical)
            {
                try
                {
                    var remediated = await check.RemediateAsync(instance, ct);
                    healthEvent.Remediated = remediated;

                    // Append-only audit log
                    db.AuditLogs.Add(NewRemediationAudit(
                        instanceId, check.CheckName, abbrev,
                        remediated ? "Success" : "Failed", result.Detail));

                    if (remediated)
                    {
                        _logger.LogInformation(
                            "Auto-remediated {Check} for {Abbrev}", check.CheckName, abbrev);
                        // Downgrade severity since we fixed it
                        healthEvent.Status = HealthEventStatus.Healthy;
                    }
                }
                catch (Exception ex) when (!IsJobCancellation(ex, ct))
                {
                    _logger.LogError(ex, "Remediation for {Check} failed on {Abbrev}", check.CheckName, abbrev);
                    db.AuditLogs.Add(NewRemediationAudit(
                        instanceId, check.CheckName, abbrev, "Error", ex.Message));
                }
            }

            db.HealthEvents.Add(healthEvent);

            // Track worst severity (only from non-remediated results)
            if (!healthEvent.Remediated)
            {
                var status = FromEventStatus(healthEvent.Status);
                if (status > worstStatus)
                    worstStatus = status;
            }
        }

        // Update instance health status
        var previousStatus = instance.HealthStatus;
        instance.HealthStatus = worstStatus;
        instance.LastHealthCheck = DateTime.UtcNow;

        await db.SaveChangesAsync(ct);

        // Broadcast status change
        if (previousStatus != worstStatus)
        {
            await hub.Clients.All.SendInstanceStatusChanged(
                instance.CustomerId.ToString(), worstStatus.ToString());
        }
    }

    /// <summary>
    /// True when <paramref name="ex"/> is an <see cref="OperationCanceledException"/> raised
    /// while the job's own token is cancelled. Cancellation-type exceptions from other
    /// sources (token not cancelled) are still treated as ordinary failures.
    /// </summary>
    private static bool IsJobCancellation(Exception ex, CancellationToken ct)
        => ex is OperationCanceledException && ct.IsCancellationRequested;

    /// <summary>Builds the <c>AutoRemediate</c> audit row for one remediation attempt.</summary>
    private static AuditLog NewRemediationAudit(
        Guid instanceId, string checkName, string target, string outcome, string? detail)
        => new AuditLog
        {
            Id = Guid.NewGuid(),
            InstanceId = instanceId,
            Actor = $"HealthCheckEngine:{checkName}",
            Action = "AutoRemediate",
            Target = target,
            Outcome = outcome,
            Detail = detail,
            OccurredAt = DateTime.UtcNow,
        };

    /// <summary>Maps a check status to the persisted event status; unknown values map to Critical.</summary>
    private static HealthEventStatus ToEventStatus(HealthStatus status) => status switch
    {
        HealthStatus.Healthy => HealthEventStatus.Healthy,
        HealthStatus.Degraded => HealthEventStatus.Degraded,
        HealthStatus.Critical => HealthEventStatus.Critical,
        _ => HealthEventStatus.Critical,
    };

    /// <summary>Maps a persisted event status back to a check status; unknown values map to Critical.</summary>
    private static HealthStatus FromEventStatus(HealthEventStatus status) => status switch
    {
        HealthEventStatus.Healthy => HealthStatus.Healthy,
        HealthEventStatus.Degraded => HealthStatus.Degraded,
        HealthEventStatus.Critical => HealthStatus.Critical,
        _ => HealthStatus.Critical,
    };
}

View File

@@ -0,0 +1,32 @@
using OTSSignsOrchestrator.Server.Data.Entities;
namespace OTSSignsOrchestrator.Server.Health;
/// <summary>
/// Result of a single health check execution.
/// </summary>
/// <param name="Status">Severity reported by the check.</param>
/// <param name="Message">Short description of the outcome.</param>
/// <param name="Detail">Optional additional information; null when omitted.</param>
public record HealthCheckResult(HealthStatus Status, string Message, string? Detail = null);
/// <summary>
/// Contract for an individual health check that runs against a specific <see cref="Instance"/>.
/// </summary>
public interface IHealthCheck
{
    /// <summary>Human-readable name written to <see cref="HealthEvent.CheckName"/>.</summary>
    string CheckName { get; }

    /// <summary>
    /// When true the engine will automatically call <see cref="RemediateAsync"/>
    /// if the check returns <see cref="HealthStatus.Critical"/>.
    /// </summary>
    bool AutoRemediate { get; }

    /// <summary>Execute the check for <paramref name="instance"/>.</summary>
    /// <param name="instance">The instance to inspect.</param>
    /// <param name="ct">Token signalling that the run should be abandoned.</param>
    /// <returns>The status, message and optional detail produced by the check.</returns>
    Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct);

    /// <summary>
    /// Attempt automatic remediation. Return true if the issue was fixed.
    /// The default implementation does nothing and returns false.
    /// </summary>
    /// <param name="instance">The instance to repair.</param>
    /// <param name="ct">Token signalling that the attempt should be abandoned.</param>
    /// <returns>True when the issue was fixed; otherwise false.</returns>
    Task<bool> RemediateAsync(Instance instance, CancellationToken ct) => Task.FromResult(false);
}