Files
OTSSignsOrchestrator/OTSSignsOrchestrator.Server/Health/HealthCheckEngine.cs

290 lines
11 KiB
C#
Raw Normal View History

using Microsoft.AspNetCore.SignalR;
using Microsoft.EntityFrameworkCore;
using Quartz;
using OTSSignsOrchestrator.Server.Data;
using OTSSignsOrchestrator.Server.Data.Entities;
using OTSSignsOrchestrator.Server.Hubs;
namespace OTSSignsOrchestrator.Server.Health;
/// <summary>
/// Background service that, once per <see cref="DefaultCheckInterval"/>, schedules one
/// <see cref="InstanceHealthCheckJob"/> for every <see cref="Instance"/> whose customer is active.
/// The scheduled job runs the <see cref="IHealthCheck"/> implementations, persists
/// <see cref="HealthEvent"/> rows, aggregates worst-severity to update <see cref="Instance.HealthStatus"/>,
/// broadcasts changes via <see cref="FleetHub"/>, and triggers auto-remediation when applicable.
///
/// Uses Quartz to stagger per-instance jobs across the check interval (avoids thundering herd).
/// Concurrency is capped at 4 simultaneous check runs via a <see cref="SemaphoreSlim"/> owned by the job.
/// </summary>
public sealed class HealthCheckEngine : BackgroundService
{
    private readonly IServiceProvider _services;
    private readonly ISchedulerFactory _schedulerFactory;
    private readonly ILogger<HealthCheckEngine> _logger;

    /// <summary>Default interval between full health-check sweeps.</summary>
    internal static readonly TimeSpan DefaultCheckInterval = TimeSpan.FromMinutes(5);

    /// <summary>
    /// Creates the engine.
    /// </summary>
    /// <param name="services">Root provider used to create a DI scope for each sweep.</param>
    /// <param name="schedulerFactory">Factory that supplies the Quartz scheduler the jobs are registered with.</param>
    /// <param name="logger">Logger for scheduling diagnostics.</param>
    public HealthCheckEngine(
        IServiceProvider services,
        ISchedulerFactory schedulerFactory,
        ILogger<HealthCheckEngine> logger)
    {
        _services = services;
        _schedulerFactory = schedulerFactory;
        _logger = logger;
    }

    /// <summary>
    /// Main loop: after a short start-up delay, schedules a sweep, waits
    /// <see cref="DefaultCheckInterval"/>, and repeats until <paramref name="stoppingToken"/> is cancelled.
    /// </summary>
    /// <param name="stoppingToken">Signals that the host is shutting down.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        // Wait briefly for the rest of the app to start
        await Task.Delay(TimeSpan.FromSeconds(5), stoppingToken);

        var scheduler = await _schedulerFactory.GetScheduler(stoppingToken);

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                await ScheduleInstanceChecks(scheduler, stoppingToken);
            }
            catch (Exception ex) when (ex is not OperationCanceledException)
            {
                // A failed sweep must not kill the service; log it and try again next interval.
                // Cancellation is deliberately not caught so shutdown propagates.
                _logger.LogError(ex, "Error scheduling health check sweep");
            }

            await Task.Delay(DefaultCheckInterval, stoppingToken);
        }
    }

    /// <summary>
    /// Load the ids of all instances whose customer is active and schedule staggered
    /// Quartz jobs so that check start times are spread across the interval.
    /// </summary>
    /// <param name="scheduler">Scheduler the one-shot jobs are registered with.</param>
    /// <param name="ct">Cancellation token flowed to the database query and scheduler calls.</param>
    private async Task ScheduleInstanceChecks(IScheduler scheduler, CancellationToken ct)
    {
        await using var scope = _services.CreateAsyncScope();
        var db = scope.ServiceProvider.GetRequiredService<OrchestratorDbContext>();

        // Only the instance id is needed to build a job, so project to it instead of
        // materialising whole Instance + Customer entities on every sweep. The filter
        // still goes through the Customer navigation, so the same rows are selected.
        var instanceIds = await db.Instances
            .AsNoTracking()
            .Where(i => i.Customer.Status == CustomerStatus.Active)
            .Select(i => i.Id)
            .ToListAsync(ct);

        if (instanceIds.Count == 0)
        {
            return;
        }

        // Spread jobs across 80 % of the check interval to leave a buffer
        var spreadMs = (int)(DefaultCheckInterval.TotalMilliseconds * 0.8);

        // First job starts immediately and the last one at spreadMs; a single instance gets no offset.
        var stepMs = instanceIds.Count > 1 ? spreadMs / (instanceIds.Count - 1) : 0;

        for (var i = 0; i < instanceIds.Count; i++)
        {
            var instanceId = instanceIds[i];
            var delay = TimeSpan.FromMilliseconds(stepMs * i);
            var jobKey = new JobKey($"health-{instanceId}", "health-checks");

            // Remove previous trigger if it still exists (idempotent reschedule)
            if (await scheduler.CheckExists(jobKey, ct))
            {
                await scheduler.DeleteJob(jobKey, ct);
            }

            // The id travels as a string in the job data map; the job parses it back.
            var job = JobBuilder.Create<InstanceHealthCheckJob>()
                .WithIdentity(jobKey)
                .UsingJobData("instanceId", instanceId.ToString())
                .Build();

            // No repeat schedule is configured on the trigger: each sweep registers a fresh one.
            var trigger = TriggerBuilder.Create()
                .WithIdentity($"health-{instanceId}-trigger", "health-checks")
                .StartAt(DateTimeOffset.UtcNow.Add(delay))
                .Build();

            await scheduler.ScheduleJob(job, trigger, ct);
        }

        _logger.LogInformation(
            "Scheduled health checks for {Count} active instance(s)", instanceIds.Count);
    }
}
/// <summary>
/// Quartz job that executes all <see cref="IHealthCheck"/> implementations for a single instance.
/// For each check it records a <see cref="HealthEvent"/>, optionally attempts auto-remediation
/// (writing an <see cref="AuditLog"/> entry for every attempt), then stores the worst
/// non-remediated severity on the instance and broadcasts the change when the status differs.
/// </summary>
[DisallowConcurrentExecution]
public sealed class InstanceHealthCheckJob : IJob
{
    /// <summary>Global concurrency limiter — max 4 parallel health check runs.</summary>
    private static readonly SemaphoreSlim s_concurrency = new(4);

    /// <summary>Name of the check that is skipped here because it runs on its own schedule.</summary>
    private const string GlobalAuthentikCheckName = "AuthentikGlobal";

    private readonly IServiceProvider _services;
    private readonly ILogger<InstanceHealthCheckJob> _logger;

    /// <summary>
    /// Creates the job.
    /// </summary>
    /// <param name="services">Root provider used to create a DI scope per execution.</param>
    /// <param name="logger">Logger for check and remediation diagnostics.</param>
    public InstanceHealthCheckJob(
        IServiceProvider services,
        ILogger<InstanceHealthCheckJob> logger)
    {
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// Quartz entry point. Reads the <c>instanceId</c> entry from the job data map and,
    /// while holding a slot of the global concurrency limiter, runs the checks for that instance.
    /// </summary>
    /// <param name="context">Execution context supplying the job data and cancellation token.</param>
    public async Task Execute(IJobExecutionContext context)
    {
        var instanceIdStr = context.MergedJobDataMap.GetString("instanceId");
        if (!Guid.TryParse(instanceIdStr, out var instanceId))
        {
            _logger.LogWarning("InstanceHealthCheckJob: invalid instanceId {Id}", instanceIdStr);
            return;
        }

        await s_concurrency.WaitAsync(context.CancellationToken);
        try
        {
            await RunChecksForInstanceAsync(instanceId, context.CancellationToken);
        }
        finally
        {
            // Always give the slot back, including when the run throws or is cancelled.
            s_concurrency.Release();
        }
    }

    /// <summary>
    /// Runs every registered check (except the global Authentik one) against the instance,
    /// queues the resulting events and audit entries, and saves them together with the
    /// updated instance status in a single <c>SaveChangesAsync</c> call.
    /// </summary>
    /// <param name="instanceId">Primary key of the instance to check.</param>
    /// <param name="ct">
    /// Cancellation token of the job. When it is cancelled, the run is aborted and the
    /// cancellation propagates instead of being recorded as a failed check.
    /// </param>
    private async Task RunChecksForInstanceAsync(Guid instanceId, CancellationToken ct)
    {
        await using var scope = _services.CreateAsyncScope();
        var db = scope.ServiceProvider.GetRequiredService<OrchestratorDbContext>();
        var hub = scope.ServiceProvider.GetRequiredService<IHubContext<FleetHub, IFleetClient>>();
        var checks = scope.ServiceProvider.GetServices<IHealthCheck>();

        // Tracked (not AsNoTracking) because the status fields are updated and saved below.
        var instance = await db.Instances
            .Include(i => i.Customer)
            .Include(i => i.OauthAppRegistries)
            .Include(i => i.ByoiConfigs)
            .FirstOrDefaultAsync(i => i.Id == instanceId, ct);

        if (instance is null)
        {
            _logger.LogWarning("InstanceHealthCheckJob: instance {Id} not found", instanceId);
            return;
        }

        var abbrev = instance.Customer.Abbreviation;
        var worstStatus = HealthStatus.Healthy;

        foreach (var check in checks)
        {
            // Skip the AuthentikGlobalHealthCheck — it runs on its own schedule
            if (check.CheckName == GlobalAuthentikCheckName)
            {
                continue;
            }

            HealthCheckResult result;
            try
            {
                result = await check.RunAsync(instance, ct);
            }
            catch (OperationCanceledException) when (ct.IsCancellationRequested)
            {
                // The job itself is being cancelled: this is not a health failure, so do not
                // log it as one or feed a synthetic Critical result into auto-remediation.
                throw;
            }
            catch (Exception ex)
            {
                // Any other failure (including a cancellation exception raised while the job
                // token is NOT cancelled) is reported as a critical result.
                _logger.LogError(ex, "Health check {Check} failed for {Abbrev}", check.CheckName, abbrev);
                result = new HealthCheckResult(HealthStatus.Critical, $"Check threw exception: {ex.Message}");
            }

            // Persist HealthEvent
            var healthEvent = new HealthEvent
            {
                Id = Guid.NewGuid(),
                InstanceId = instanceId,
                CheckName = check.CheckName,
                Status = ToEventStatus(result.Status),
                Message = result.Message,
                Remediated = false,
                OccurredAt = DateTime.UtcNow,
            };

            // Auto-remediation
            if (check.AutoRemediate && result.Status == HealthStatus.Critical)
            {
                try
                {
                    var remediated = await check.RemediateAsync(instance, ct);
                    healthEvent.Remediated = remediated;

                    // Append-only audit log
                    db.AuditLogs.Add(new AuditLog
                    {
                        Id = Guid.NewGuid(),
                        InstanceId = instanceId,
                        Actor = $"HealthCheckEngine:{check.CheckName}",
                        Action = "AutoRemediate",
                        Target = abbrev,
                        Outcome = remediated ? "Success" : "Failed",
                        Detail = result.Detail,
                        OccurredAt = DateTime.UtcNow,
                    });

                    if (remediated)
                    {
                        _logger.LogInformation(
                            "Auto-remediated {Check} for {Abbrev}", check.CheckName, abbrev);

                        // Downgrade severity since we fixed it
                        healthEvent.Status = HealthEventStatus.Healthy;
                    }
                }
                catch (OperationCanceledException) when (ct.IsCancellationRequested)
                {
                    // Job cancelled mid-remediation: abort the run rather than auditing an "Error".
                    throw;
                }
                catch (Exception ex)
                {
                    _logger.LogError(ex, "Remediation for {Check} failed on {Abbrev}", check.CheckName, abbrev);
                    db.AuditLogs.Add(new AuditLog
                    {
                        Id = Guid.NewGuid(),
                        InstanceId = instanceId,
                        Actor = $"HealthCheckEngine:{check.CheckName}",
                        Action = "AutoRemediate",
                        Target = abbrev,
                        Outcome = "Error",
                        Detail = ex.Message,
                        OccurredAt = DateTime.UtcNow,
                    });
                }
            }

            db.HealthEvents.Add(healthEvent);

            // Track worst severity (only from non-remediated results)
            if (!healthEvent.Remediated)
            {
                // NOTE(review): the '>' comparison assumes HealthStatus members are declared
                // in ascending order of severity — confirm against the enum definition.
                var status = FromEventStatus(healthEvent.Status);
                if (status > worstStatus)
                {
                    worstStatus = status;
                }
            }
        }

        // Update instance health status
        var previousStatus = instance.HealthStatus;
        instance.HealthStatus = worstStatus;
        instance.LastHealthCheck = DateTime.UtcNow;

        // One save covers the queued events, the audit entries and the instance update.
        await db.SaveChangesAsync(ct);

        // Broadcast status change
        if (previousStatus != worstStatus)
        {
            await hub.Clients.All.SendInstanceStatusChanged(
                instance.CustomerId.ToString(), worstStatus.ToString());
        }
    }

    /// <summary>
    /// Maps a check result status to the persisted event status; any value other than
    /// Healthy/Degraded/Critical is treated as Critical.
    /// </summary>
    private static HealthEventStatus ToEventStatus(HealthStatus status) => status switch
    {
        HealthStatus.Healthy => HealthEventStatus.Healthy,
        HealthStatus.Degraded => HealthEventStatus.Degraded,
        HealthStatus.Critical => HealthEventStatus.Critical,
        _ => HealthEventStatus.Critical,
    };

    /// <summary>
    /// Maps a persisted event status back to the instance-level status; any value other than
    /// Healthy/Degraded/Critical is treated as Critical.
    /// </summary>
    private static HealthStatus FromEventStatus(HealthEventStatus status) => status switch
    {
        HealthEventStatus.Healthy => HealthStatus.Healthy,
        HealthEventStatus.Degraded => HealthStatus.Degraded,
        HealthEventStatus.Critical => HealthStatus.Critical,
        _ => HealthStatus.Critical,
    };
}