using Microsoft.AspNetCore.SignalR;
using Microsoft.EntityFrameworkCore;
using Quartz;
using OTSSignsOrchestrator.Server.Data;
using OTSSignsOrchestrator.Server.Data.Entities;
using OTSSignsOrchestrator.Server.Hubs;

namespace OTSSignsOrchestrator.Server.Health;

/// <summary>
/// Background service that schedules and runs all health-check implementations
/// against every active instance. Persists <c>HealthEvent</c> rows, aggregates
/// worst-severity to update the instance's <c>HealthStatus</c>, broadcasts
/// changes via the SignalR hub, and triggers auto-remediation when applicable.
///
/// Uses Quartz to stagger per-instance jobs across the check interval (avoids
/// thundering herd). Concurrency is capped at 4 simultaneous check runs via
/// <see cref="InstanceHealthCheckJob"/>'s internal semaphore.
/// </summary>
/// <remarks>
/// NOTE(review): generic type arguments throughout this file were stripped from
/// the original text (angle-bracket sanitization); the arguments below were
/// reconstructed from usage — confirm against version control.
/// </remarks>
public sealed class HealthCheckEngine : BackgroundService
{
    private readonly IServiceProvider _services;
    private readonly ISchedulerFactory _schedulerFactory;
    private readonly ILogger<HealthCheckEngine> _logger;

    /// <summary>Default interval between full health-check sweeps.</summary>
    internal static readonly TimeSpan DefaultCheckInterval = TimeSpan.FromMinutes(5);

    public HealthCheckEngine(
        IServiceProvider services,
        ISchedulerFactory schedulerFactory,
        ILogger<HealthCheckEngine> logger)
    {
        _services = services;
        _schedulerFactory = schedulerFactory;
        _logger = logger;
    }

    /// <summary>
    /// Main loop: waits briefly for application startup, then repeatedly
    /// schedules a staggered sweep of per-instance check jobs every
    /// <see cref="DefaultCheckInterval"/>.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        // Wait briefly for the rest of the app to start
        await Task.Delay(TimeSpan.FromSeconds(5), stoppingToken);

        var scheduler = await _schedulerFactory.GetScheduler(stoppingToken);

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                await ScheduleInstanceChecks(scheduler, stoppingToken);
            }
            catch (Exception ex) when (ex is not OperationCanceledException)
            {
                // A failed sweep must not kill the service; log and retry next cycle.
                _logger.LogError(ex, "Error scheduling health check sweep");
            }

            await Task.Delay(DefaultCheckInterval, stoppingToken);
        }
    }

    /// <summary>
    /// Load all active instances and schedule staggered Quartz jobs so that
    /// check start times are spread across the interval.
    /// </summary>
    private async Task ScheduleInstanceChecks(IScheduler scheduler, CancellationToken ct)
    {
        await using var scope = _services.CreateAsyncScope();
        // TODO(review): the DbContext type argument was stripped from the original
        // text — confirm the concrete context type name.
        var db = scope.ServiceProvider.GetRequiredService<AppDbContext>();

        var instances = await db.Instances
            .AsNoTracking()
            .Include(i => i.Customer)
            .Where(i => i.Customer.Status == CustomerStatus.Active)
            .ToListAsync(ct);

        if (instances.Count == 0)
            return;

        // Spread jobs across 80 % of the check interval to leave a buffer
        var spreadMs = (int)(DefaultCheckInterval.TotalMilliseconds * 0.8);
        var stepMs = instances.Count > 1 ? spreadMs / (instances.Count - 1) : 0;

        for (var i = 0; i < instances.Count; i++)
        {
            var instance = instances[i];
            var delay = TimeSpan.FromMilliseconds(stepMs * i);
            var jobKey = new JobKey($"health-{instance.Id}", "health-checks");

            // Remove previous trigger if it still exists (idempotent reschedule)
            if (await scheduler.CheckExists(jobKey, ct))
                await scheduler.DeleteJob(jobKey, ct);

            var job = JobBuilder.Create<InstanceHealthCheckJob>()
                .WithIdentity(jobKey)
                .UsingJobData("instanceId", instance.Id.ToString())
                .Build();

            var trigger = TriggerBuilder.Create()
                .WithIdentity($"health-{instance.Id}-trigger", "health-checks")
                .StartAt(DateTimeOffset.UtcNow.Add(delay))
                .Build();

            await scheduler.ScheduleJob(job, trigger, ct);
        }

        _logger.LogInformation(
            "Scheduled health checks for {Count} active instance(s)", instances.Count);
    }
}

/// <summary>
/// Quartz job that executes all health-check implementations for a single instance.
/// </summary>
[DisallowConcurrentExecution]
public sealed class InstanceHealthCheckJob : IJob
{
    /// <summary>Global concurrency limiter — max 4 parallel health check runs.</summary>
private static readonly SemaphoreSlim s_concurrency = new(4); private readonly IServiceProvider _services; private readonly ILogger _logger; public InstanceHealthCheckJob( IServiceProvider services, ILogger logger) { _services = services; _logger = logger; } public async Task Execute(IJobExecutionContext context) { var instanceIdStr = context.MergedJobDataMap.GetString("instanceId"); if (!Guid.TryParse(instanceIdStr, out var instanceId)) { _logger.LogWarning("InstanceHealthCheckJob: invalid instanceId {Id}", instanceIdStr); return; } await s_concurrency.WaitAsync(context.CancellationToken); try { await RunChecksForInstanceAsync(instanceId, context.CancellationToken); } finally { s_concurrency.Release(); } } private async Task RunChecksForInstanceAsync(Guid instanceId, CancellationToken ct) { await using var scope = _services.CreateAsyncScope(); var db = scope.ServiceProvider.GetRequiredService(); var hub = scope.ServiceProvider.GetRequiredService>(); var checks = scope.ServiceProvider.GetServices(); var instance = await db.Instances .Include(i => i.Customer) .Include(i => i.OauthAppRegistries) .Include(i => i.ByoiConfigs) .FirstOrDefaultAsync(i => i.Id == instanceId, ct); if (instance is null) { _logger.LogWarning("InstanceHealthCheckJob: instance {Id} not found", instanceId); return; } var abbrev = instance.Customer.Abbreviation; var worstStatus = HealthStatus.Healthy; foreach (var check in checks) { // Skip the AuthentikGlobalHealthCheck — it runs on its own schedule if (check.CheckName == "AuthentikGlobal") continue; HealthCheckResult result; try { result = await check.RunAsync(instance, ct); } catch (Exception ex) { _logger.LogError(ex, "Health check {Check} failed for {Abbrev}", check.CheckName, abbrev); result = new HealthCheckResult(HealthStatus.Critical, $"Check threw exception: {ex.Message}"); } // Persist HealthEvent var healthEvent = new HealthEvent { Id = Guid.NewGuid(), InstanceId = instanceId, CheckName = check.CheckName, Status = 
ToEventStatus(result.Status), Message = result.Message, Remediated = false, OccurredAt = DateTime.UtcNow, }; // Auto-remediation if (check.AutoRemediate && result.Status == HealthStatus.Critical) { try { var fixed_ = await check.RemediateAsync(instance, ct); healthEvent.Remediated = fixed_; // Append-only audit log db.AuditLogs.Add(new AuditLog { Id = Guid.NewGuid(), InstanceId = instanceId, Actor = $"HealthCheckEngine:{check.CheckName}", Action = "AutoRemediate", Target = abbrev, Outcome = fixed_ ? "Success" : "Failed", Detail = result.Detail, OccurredAt = DateTime.UtcNow, }); if (fixed_) { _logger.LogInformation( "Auto-remediated {Check} for {Abbrev}", check.CheckName, abbrev); // Downgrade severity since we fixed it healthEvent.Status = HealthEventStatus.Healthy; } } catch (Exception ex) { _logger.LogError(ex, "Remediation for {Check} failed on {Abbrev}", check.CheckName, abbrev); db.AuditLogs.Add(new AuditLog { Id = Guid.NewGuid(), InstanceId = instanceId, Actor = $"HealthCheckEngine:{check.CheckName}", Action = "AutoRemediate", Target = abbrev, Outcome = "Error", Detail = ex.Message, OccurredAt = DateTime.UtcNow, }); } } db.HealthEvents.Add(healthEvent); // Track worst severity (only from non-remediated results) if (!healthEvent.Remediated) { var status = FromEventStatus(healthEvent.Status); if (status > worstStatus) worstStatus = status; } } // Update instance health status var previousStatus = instance.HealthStatus; instance.HealthStatus = worstStatus; instance.LastHealthCheck = DateTime.UtcNow; await db.SaveChangesAsync(ct); // Broadcast status change if (previousStatus != worstStatus) { await hub.Clients.All.SendInstanceStatusChanged( instance.CustomerId.ToString(), worstStatus.ToString()); } } private static HealthEventStatus ToEventStatus(HealthStatus status) => status switch { HealthStatus.Healthy => HealthEventStatus.Healthy, HealthStatus.Degraded => HealthEventStatus.Degraded, HealthStatus.Critical => HealthEventStatus.Critical, _ => 
HealthEventStatus.Critical, }; private static HealthStatus FromEventStatus(HealthEventStatus status) => status switch { HealthEventStatus.Healthy => HealthStatus.Healthy, HealthEventStatus.Degraded => HealthStatus.Degraded, HealthEventStatus.Critical => HealthStatus.Critical, _ => HealthStatus.Critical, }; }