108 lines
4.0 KiB
C#
108 lines
4.0 KiB
C#
|
|
using System.Diagnostics;
|
||
|
|
using Microsoft.AspNetCore.SignalR;
|
||
|
|
using Microsoft.EntityFrameworkCore;
|
||
|
|
using OTSSignsOrchestrator.Server.Clients;
|
||
|
|
using OTSSignsOrchestrator.Server.Data;
|
||
|
|
using OTSSignsOrchestrator.Server.Data.Entities;
|
||
|
|
using OTSSignsOrchestrator.Server.Hubs;
|
||
|
|
|
||
|
|
namespace OTSSignsOrchestrator.Server.Health.Checks;
|
||
|
|
|
||
|
|
/// <summary>
/// Probes the central Authentik instance at <c>GET /api/v3/-/health/ready/</c>.
/// Measures latency and writes an <see cref="AuthentikMetrics"/> row.
/// If down: <c>Severity = Critical</c>, message "Central Authentik is DOWN — all customer web UI logins failing".
/// This is a fleet-wide P1 alert. Runs every 2 minutes on a separate schedule.
///
/// This check is NOT per-instance — it runs once globally. The engine skips it for
/// per-instance checks. Instead it is scheduled independently as a Quartz job.
/// </summary>
/// <remarks>
/// Probe failures are logged with the full exception. A failure to persist the metrics row is
/// logged and tolerated so that it can never prevent the Critical alert from being broadcast.
/// </remarks>
public sealed class AuthentikGlobalHealthCheck : IHealthCheck
{
    /// <summary>Message attached to every Critical result and broadcast to connected clients.</summary>
    private const string DownMessage = "Central Authentik is DOWN — all customer web UI logins failing";

    private readonly IAuthentikClient _authentikClient;
    private readonly IServiceProvider _services;
    private readonly ILogger<AuthentikGlobalHealthCheck> _logger;

    /// <summary>Name under which this check is reported.</summary>
    public string CheckName => "AuthentikGlobal";

    /// <summary>Always <c>false</c>: this check never requests automatic remediation.</summary>
    public bool AutoRemediate => false;

    /// <summary>
    /// Creates the check.
    /// </summary>
    /// <param name="authentikClient">Client used to call the Authentik health endpoint.</param>
    /// <param name="services">Root provider used to create a scope for the database context and hub context.</param>
    /// <param name="logger">Logger for probe and persistence failures.</param>
    public AuthentikGlobalHealthCheck(
        IAuthentikClient authentikClient,
        IServiceProvider services,
        ILogger<AuthentikGlobalHealthCheck> logger)
    {
        _authentikClient = authentikClient;
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// <see cref="IHealthCheck"/> entry point. Delegates to <see cref="RunGlobalAsync"/>.
    /// </summary>
    /// <param name="instance">Ignored — this check is global, not per-instance.</param>
    /// <param name="ct">Token forwarded to <see cref="RunGlobalAsync"/>.</param>
    /// <returns>The result produced by <see cref="RunGlobalAsync"/>.</returns>
    public async Task<HealthCheckResult> RunAsync(Instance instance, CancellationToken ct)
    {
        // This check doesn't use the instance parameter — it checks global Authentik health.
        return await RunGlobalAsync(ct);
    }

    /// <summary>
    /// Core logic — callable from the Quartz job without an instance context.
    /// Calls the Authentik health endpoint, records one <see cref="AuthentikMetrics"/> row with the
    /// measured latency, and broadcasts an alert through the fleet hub when the result is Critical.
    /// </summary>
    /// <param name="ct">Token passed to the metrics database write.</param>
    /// <returns>
    /// A Healthy result when the endpoint returns a success status code; otherwise a Critical result
    /// (non-success status code, or any exception thrown by the client call).
    /// </returns>
    public async Task<HealthCheckResult> RunGlobalAsync(CancellationToken ct)
    {
        var sw = Stopwatch.StartNew();
        AuthentikMetricsStatus metricsStatus;
        string? errorMessage = null;
        HealthCheckResult result;

        try
        {
            var response = await _authentikClient.CheckHealthAsync();
            sw.Stop();

            if (response.IsSuccessStatusCode)
            {
                metricsStatus = AuthentikMetricsStatus.Healthy;
                result = new HealthCheckResult(HealthStatus.Healthy,
                    $"Authentik healthy (latency: {sw.ElapsedMilliseconds}ms)");
            }
            else
            {
                metricsStatus = AuthentikMetricsStatus.Critical;
                errorMessage = $"HTTP {response.StatusCode}";
                result = new HealthCheckResult(HealthStatus.Critical,
                    DownMessage,
                    $"Health endpoint returned {response.StatusCode}");

                _logger.LogError(
                    "Authentik health endpoint returned {StatusCode} after {LatencyMs}ms",
                    response.StatusCode,
                    sw.ElapsedMilliseconds);
            }
        }
        catch (Exception ex)
        {
            // Any failure to reach the endpoint (connection refused, timeout, DNS, ...) counts as DOWN.
            sw.Stop();
            metricsStatus = AuthentikMetricsStatus.Critical;
            errorMessage = ex.Message;
            result = new HealthCheckResult(HealthStatus.Critical,
                DownMessage,
                ex.Message);

            // Keep the full exception (type + stack trace); the metrics row only stores the message.
            _logger.LogError(ex, "Authentik health probe failed after {LatencyMs}ms", sw.ElapsedMilliseconds);
        }

        await using var scope = _services.CreateAsyncScope();
        var db = scope.ServiceProvider.GetRequiredService<OrchestratorDbContext>();

        // Write metrics row. A persistence failure must not suppress the alert below, so it is
        // logged and tolerated; cancellation is still allowed to propagate.
        try
        {
            db.AuthentikMetrics.Add(new AuthentikMetrics
            {
                Id = Guid.NewGuid(),
                CheckedAt = DateTime.UtcNow,
                Status = metricsStatus,
                LatencyMs = (int)sw.ElapsedMilliseconds,
                ErrorMessage = errorMessage,
            });
            await db.SaveChangesAsync(ct);
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            _logger.LogError(ex, "Failed to persist Authentik metrics row (status {Status})", metricsStatus);
        }

        // Broadcast alert if critical
        if (result.Status == HealthStatus.Critical)
        {
            var hub = scope.ServiceProvider.GetRequiredService<IHubContext<FleetHub, IFleetClient>>();
            await hub.Clients.All.SendAlertRaised("Critical", result.Message);
        }

        return result;
    }
}
|