feat: Implement provisioning pipelines for subscription management
- Add ReactivatePipeline to handle subscription reactivation, including scaling Docker services, health verification, status updates, audit logging, and broadcasting status changes.
- Introduce RotateCredentialsPipeline for OAuth2 credential rotation, managing the deletion of old apps, creation of new ones, credential storage, access verification, and audit logging.
- Create StepRunner to manage job step execution, including lifecycle management and progress broadcasting via SignalR.
- Implement SuspendPipeline for subscription suspension, scaling down services, updating statuses, logging audits, and broadcasting changes.
- Add UpdateScreenLimitPipeline to update Xibo CMS screen limits and record snapshots.
- Introduce XiboFeatureManifests for hardcoded feature ACLs per role.
- Add docker-compose.dev.yml for local development with PostgreSQL setup.
This commit is contained in:
@@ -0,0 +1,107 @@
|
||||
using System.Diagnostics;
|
||||
using Microsoft.AspNetCore.SignalR;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using OTSSignsOrchestrator.Server.Clients;
|
||||
using OTSSignsOrchestrator.Server.Data;
|
||||
using OTSSignsOrchestrator.Server.Data.Entities;
|
||||
using OTSSignsOrchestrator.Server.Hubs;
|
||||
|
||||
namespace OTSSignsOrchestrator.Server.Health.Checks;
|
||||
|
||||
/// <summary>
/// Probes the central Authentik instance at <c>GET /api/v3/-/health/ready/</c>.
/// Measures latency and writes an <see cref="AuthentikMetrics"/> row.
/// If down: <c>Severity = Critical</c>, message "Central Authentik is DOWN — all customer web UI logins failing".
/// This is a fleet-wide P1 alert. Runs every 2 minutes on a separate schedule.
///
/// This check is NOT per-instance — it runs once globally. The engine skips it for
/// per-instance checks. Instead it is scheduled independently as a Quartz job.
/// </summary>
/// <remarks>
/// Recording the metrics row and broadcasting the alert are both best-effort:
/// a failure in either is logged and does not prevent the other from running,
/// nor does it change the <see cref="HealthCheckResult"/> returned to the caller.
/// </remarks>
public sealed class AuthentikGlobalHealthCheck : IHealthCheck
{
    /// <summary>
    /// Message used for the health result (and therefore the broadcast alert)
    /// whenever the probe fails or returns a non-success status.
    /// </summary>
    private const string DownMessage =
        "Central Authentik is DOWN — all customer web UI logins failing";

    private readonly IAuthentikClient _authentikClient;
    private readonly IServiceProvider _services;
    private readonly ILogger<AuthentikGlobalHealthCheck> _logger;

    /// <summary>Name under which this check is reported.</summary>
    public string CheckName => "AuthentikGlobal";

    /// <summary>Always <c>false</c> for this check.</summary>
    public bool AutoRemediate => false;

    /// <summary>
    /// Creates the check.
    /// </summary>
    /// <param name="authentikClient">Client used to call the Authentik health endpoint.</param>
    /// <param name="services">Root provider used to create a scope for the DbContext and hub context.</param>
    /// <param name="logger">Logger for probe, persistence and broadcast failures.</param>
    public AuthentikGlobalHealthCheck(
        IAuthentikClient authentikClient,
        IServiceProvider services,
        ILogger<AuthentikGlobalHealthCheck> logger)
    {
        _authentikClient = authentikClient;
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// <see cref="IHealthCheck"/> entry point. Delegates to <see cref="RunGlobalAsync"/>.
    /// </summary>
    /// <param name="instance">Ignored — this check is global, not per-instance.</param>
    /// <param name="ct">Cancellation token forwarded to <see cref="RunGlobalAsync"/>.</param>
    /// <returns>The result of the global Authentik probe.</returns>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        // This check doesn't use the instance parameter — it checks global Authentik health.
        return await RunGlobalAsync(ct);
    }

    /// <summary>
    /// Core logic — callable from the Quartz job without an instance context.
    /// Probes Authentik, records an <see cref="AuthentikMetrics"/> row and, when the
    /// outcome is critical, broadcasts an alert to all connected fleet clients.
    /// </summary>
    /// <param name="ct">Cancellation token passed to the metrics write.</param>
    /// <returns>
    /// <see cref="HealthStatus.Healthy"/> when the endpoint returns a success status code;
    /// otherwise <see cref="HealthStatus.Critical"/> with the failure detail.
    /// </returns>
    public async Task<HealthCheckResult> RunGlobalAsync(CancellationToken ct)
    {
        var sw = Stopwatch.StartNew();
        AuthentikMetricsStatus metricsStatus;
        string? errorMessage = null;
        HealthCheckResult result;

        try
        {
            var response = await _authentikClient.CheckHealthAsync();
            sw.Stop();

            if (response.IsSuccessStatusCode)
            {
                metricsStatus = AuthentikMetricsStatus.Healthy;
                result = new HealthCheckResult(HealthStatus.Healthy,
                    $"Authentik healthy (latency: {sw.ElapsedMilliseconds}ms)");
            }
            else
            {
                _logger.LogError(
                    "Authentik health endpoint returned {StatusCode} after {LatencyMs}ms",
                    response.StatusCode, sw.ElapsedMilliseconds);

                metricsStatus = AuthentikMetricsStatus.Critical;
                errorMessage = $"HTTP {response.StatusCode}";
                result = new HealthCheckResult(HealthStatus.Critical,
                    DownMessage,
                    $"Health endpoint returned {response.StatusCode}");
            }
        }
        catch (Exception ex)
        {
            // Any failure to reach the endpoint (network error, timeout, ...) counts as DOWN.
            sw.Stop();
            _logger.LogError(ex,
                "Authentik health probe failed after {LatencyMs}ms", sw.ElapsedMilliseconds);

            metricsStatus = AuthentikMetricsStatus.Critical;
            errorMessage = ex.Message;
            result = new HealthCheckResult(HealthStatus.Critical,
                DownMessage,
                ex.Message);
        }

        await using var scope = _services.CreateAsyncScope();

        // Write metrics row. A database failure must not stop the alert below from going out.
        await TryWriteMetricsAsync(
            scope.ServiceProvider, metricsStatus, (int)sw.ElapsedMilliseconds, errorMessage, ct);

        // Broadcast alert if critical
        if (result.Status == HealthStatus.Critical)
        {
            await TryBroadcastAlertAsync(scope.ServiceProvider, result.Message);
        }

        return result;
    }

    /// <summary>
    /// Inserts one <see cref="AuthentikMetrics"/> row; logs instead of throwing on failure.
    /// Cancellation is not swallowed and still propagates to the caller.
    /// </summary>
    private async Task TryWriteMetricsAsync(
        IServiceProvider scopedServices,
        AuthentikMetricsStatus status,
        int latencyMs,
        string? errorMessage,
        CancellationToken ct)
    {
        try
        {
            var db = scopedServices.GetRequiredService<OrchestratorDbContext>();
            db.AuthentikMetrics.Add(new AuthentikMetrics
            {
                Id = Guid.NewGuid(),
                CheckedAt = DateTime.UtcNow,
                Status = status,
                LatencyMs = latencyMs,
                ErrorMessage = errorMessage,
            });
            await db.SaveChangesAsync(ct);
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            _logger.LogError(ex, "Failed to persist Authentik metrics row");
        }
    }

    /// <summary>
    /// Sends a "Critical" alert to every connected fleet client; logs instead of throwing on failure
    /// so the caller still receives the health result.
    /// </summary>
    private async Task TryBroadcastAlertAsync(IServiceProvider scopedServices, string message)
    {
        try
        {
            var hub = scopedServices.GetRequiredService<IHubContext<FleetHub, IFleetClient>>();
            await hub.Clients.All.SendAlertRaised("Critical", message);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Failed to broadcast Authentik critical alert");
        }
    }
}
|
||||
Reference in New Issue
Block a user