- Add ReactivatePipeline to handle subscription reactivation, including scaling Docker services, health verification, status updates, audit logging, and broadcasting status changes. - Introduce RotateCredentialsPipeline for OAuth2 credential rotation, managing the deletion of old apps, creation of new ones, credential storage, access verification, and audit logging. - Create StepRunner to manage job step execution, including lifecycle management and progress broadcasting via SignalR. - Implement SuspendPipeline for subscription suspension, scaling down services, updating statuses, logging audits, and broadcasting changes. - Add UpdateScreenLimitPipeline to update Xibo CMS screen limits and record snapshots. - Introduce XiboFeatureManifests for hardcoded feature ACLs per role. - Add docker-compose.dev.yml for local development with PostgreSQL setup.
290 lines
11 KiB
C#
290 lines
11 KiB
C#
using Microsoft.AspNetCore.SignalR;
|
|
using Microsoft.EntityFrameworkCore;
|
|
using Quartz;
|
|
using OTSSignsOrchestrator.Server.Data;
|
|
using OTSSignsOrchestrator.Server.Data.Entities;
|
|
using OTSSignsOrchestrator.Server.Hubs;
|
|
|
|
namespace OTSSignsOrchestrator.Server.Health;
|
|
|
|
/// <summary>
/// Long-running background worker that fans every registered <see cref="IHealthCheck"/>
/// out over each active <see cref="Instance"/>. Each sweep persists <see cref="HealthEvent"/>
/// rows, rolls the worst observed severity into <see cref="Instance.HealthStatus"/>,
/// pushes updates to clients through <see cref="FleetHub"/>, and kicks off auto-remediation
/// where a check supports it.
///
/// Per-instance work is handed to Quartz with staggered start times so checks are spread
/// across the interval instead of firing all at once (avoids thundering herd), and
/// simultaneous check runs are capped at 4 via <see cref="SemaphoreSlim"/>.
/// </summary>
public sealed class HealthCheckEngine : BackgroundService
{
    /// <summary>Default interval between full health-check sweeps.</summary>
    internal static readonly TimeSpan DefaultCheckInterval = TimeSpan.FromMinutes(5);

    private readonly IServiceProvider _services;
    private readonly ISchedulerFactory _schedulerFactory;
    private readonly ILogger<HealthCheckEngine> _logger;

    public HealthCheckEngine(
        IServiceProvider services,
        ISchedulerFactory schedulerFactory,
        ILogger<HealthCheckEngine> logger)
    {
        _services = services;
        _schedulerFactory = schedulerFactory;
        _logger = logger;
    }

    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        // Give the rest of the application a short head start before the first sweep.
        await Task.Delay(TimeSpan.FromSeconds(5), stoppingToken);

        var scheduler = await _schedulerFactory.GetScheduler(stoppingToken);

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                await ScheduleInstanceChecks(scheduler, stoppingToken);
            }
            catch (Exception ex) when (ex is not OperationCanceledException)
            {
                _logger.LogError(ex, "Error scheduling health check sweep");
            }

            await Task.Delay(DefaultCheckInterval, stoppingToken);
        }
    }

    /// <summary>
    /// Queries every instance whose customer is active and enqueues one staggered,
    /// one-shot Quartz job per instance so that check start times are evenly
    /// distributed across the interval.
    /// </summary>
    private async Task ScheduleInstanceChecks(IScheduler scheduler, CancellationToken ct)
    {
        await using var scope = _services.CreateAsyncScope();
        var dbContext = scope.ServiceProvider.GetRequiredService<OrchestratorDbContext>();

        var targets = await dbContext.Instances
            .AsNoTracking()
            .Include(i => i.Customer)
            .Where(i => i.Customer.Status == CustomerStatus.Active)
            .ToListAsync(ct);

        if (targets.Count == 0)
            return;

        // Only use 80 % of the interval for staggering so the sweep finishes with headroom.
        var windowMs = (int)(DefaultCheckInterval.TotalMilliseconds * 0.8);
        int gapMs;
        if (targets.Count > 1)
            gapMs = windowMs / (targets.Count - 1);
        else
            gapMs = 0;

        var index = 0;
        foreach (var target in targets)
        {
            var startDelay = TimeSpan.FromMilliseconds(gapMs * index);
            index++;

            var key = new JobKey($"health-{target.Id}", "health-checks");

            // Rescheduling must be idempotent: drop any job left over from a prior sweep.
            if (await scheduler.CheckExists(key, ct))
                await scheduler.DeleteJob(key, ct);

            var jobDetail = JobBuilder.Create<InstanceHealthCheckJob>()
                .WithIdentity(key)
                .UsingJobData("instanceId", target.Id.ToString())
                .Build();

            var oneShotTrigger = TriggerBuilder.Create()
                .WithIdentity($"health-{target.Id}-trigger", "health-checks")
                .StartAt(DateTimeOffset.UtcNow.Add(startDelay))
                .Build();

            await scheduler.ScheduleJob(jobDetail, oneShotTrigger, ct);
        }

        _logger.LogInformation(
            "Scheduled health checks for {Count} active instance(s)", targets.Count);
    }
}
|
|
|
|
/// <summary>
/// Quartz job that executes all <see cref="IHealthCheck"/> implementations for a single instance.
/// For each check it persists a <see cref="HealthEvent"/>, attempts auto-remediation for
/// Critical results when the check opts in (writing an append-only <see cref="AuditLog"/> row
/// for every attempt), aggregates the worst non-remediated severity into
/// <see cref="Instance.HealthStatus"/>, and broadcasts status transitions via <see cref="FleetHub"/>.
/// </summary>
[DisallowConcurrentExecution]
public sealed class InstanceHealthCheckJob : IJob
{
    /// <summary>Global concurrency limiter — max 4 parallel health check runs.</summary>
    private static readonly SemaphoreSlim s_concurrency = new(4);

    private readonly IServiceProvider _services;
    private readonly ILogger<InstanceHealthCheckJob> _logger;

    public InstanceHealthCheckJob(
        IServiceProvider services,
        ILogger<InstanceHealthCheckJob> logger)
    {
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// Quartz entry point. Parses the target instance id from job data, then runs the
    /// checks under the global concurrency gate. Invalid ids are logged and skipped.
    /// </summary>
    public async Task Execute(IJobExecutionContext context)
    {
        var instanceIdStr = context.MergedJobDataMap.GetString("instanceId");
        if (!Guid.TryParse(instanceIdStr, out var instanceId))
        {
            _logger.LogWarning("InstanceHealthCheckJob: invalid instanceId {Id}", instanceIdStr);
            return;
        }

        await s_concurrency.WaitAsync(context.CancellationToken);
        try
        {
            await RunChecksForInstanceAsync(instanceId, context.CancellationToken);
        }
        finally
        {
            s_concurrency.Release();
        }
    }

    /// <summary>
    /// Runs every registered check (except the globally-scheduled Authentik check) against
    /// the given instance, persists results and remediation audits in one SaveChanges,
    /// and broadcasts a status change when the aggregated severity differs from the
    /// previously stored one.
    /// </summary>
    private async Task RunChecksForInstanceAsync(Guid instanceId, CancellationToken ct)
    {
        await using var scope = _services.CreateAsyncScope();
        var db = scope.ServiceProvider.GetRequiredService<OrchestratorDbContext>();
        var hub = scope.ServiceProvider.GetRequiredService<IHubContext<FleetHub, IFleetClient>>();
        var checks = scope.ServiceProvider.GetServices<IHealthCheck>();

        // Tracked load (no AsNoTracking) — HealthStatus/LastHealthCheck are updated below.
        var instance = await db.Instances
            .Include(i => i.Customer)
            .Include(i => i.OauthAppRegistries)
            .Include(i => i.ByoiConfigs)
            .FirstOrDefaultAsync(i => i.Id == instanceId, ct);

        if (instance is null)
        {
            _logger.LogWarning("InstanceHealthCheckJob: instance {Id} not found", instanceId);
            return;
        }

        var abbrev = instance.Customer.Abbreviation;
        var worstStatus = HealthStatus.Healthy;

        foreach (var check in checks)
        {
            // Skip the AuthentikGlobalHealthCheck — it runs on its own schedule
            if (check.CheckName == "AuthentikGlobal")
                continue;

            HealthCheckResult result;
            try
            {
                result = await check.RunAsync(instance, ct);
            }
            // FIX: filter out cancellation — a check cancelled during host shutdown must
            // not be recorded as a Critical health event. (Matches the filter already
            // used by HealthCheckEngine's sweep loop.)
            catch (Exception ex) when (ex is not OperationCanceledException)
            {
                _logger.LogError(ex, "Health check {Check} failed for {Abbrev}", check.CheckName, abbrev);
                result = new HealthCheckResult(HealthStatus.Critical, $"Check threw exception: {ex.Message}");
            }

            // Persist HealthEvent
            var healthEvent = new HealthEvent
            {
                Id = Guid.NewGuid(),
                InstanceId = instanceId,
                CheckName = check.CheckName,
                Status = ToEventStatus(result.Status),
                Message = result.Message,
                Remediated = false,
                OccurredAt = DateTime.UtcNow,
            };

            // Auto-remediation: only attempted for Critical results on opted-in checks.
            if (check.AutoRemediate && result.Status == HealthStatus.Critical)
            {
                try
                {
                    var fixed_ = await check.RemediateAsync(instance, ct);
                    healthEvent.Remediated = fixed_;

                    // Append-only audit log
                    db.AuditLogs.Add(new AuditLog
                    {
                        Id = Guid.NewGuid(),
                        InstanceId = instanceId,
                        Actor = $"HealthCheckEngine:{check.CheckName}",
                        Action = "AutoRemediate",
                        Target = abbrev,
                        Outcome = fixed_ ? "Success" : "Failed",
                        Detail = result.Detail,
                        OccurredAt = DateTime.UtcNow,
                    });

                    if (fixed_)
                    {
                        _logger.LogInformation(
                            "Auto-remediated {Check} for {Abbrev}", check.CheckName, abbrev);
                        // Downgrade severity since we fixed it
                        healthEvent.Status = HealthEventStatus.Healthy;
                    }
                }
                // FIX: same cancellation filter — shutdown must not produce an
                // "Error" remediation audit row.
                catch (Exception ex) when (ex is not OperationCanceledException)
                {
                    _logger.LogError(ex, "Remediation for {Check} failed on {Abbrev}", check.CheckName, abbrev);
                    db.AuditLogs.Add(new AuditLog
                    {
                        Id = Guid.NewGuid(),
                        InstanceId = instanceId,
                        Actor = $"HealthCheckEngine:{check.CheckName}",
                        Action = "AutoRemediate",
                        Target = abbrev,
                        Outcome = "Error",
                        Detail = ex.Message,
                        OccurredAt = DateTime.UtcNow,
                    });
                }
            }

            db.HealthEvents.Add(healthEvent);

            // Track worst severity (only from non-remediated results)
            if (!healthEvent.Remediated)
            {
                var status = FromEventStatus(healthEvent.Status);
                if (status > worstStatus)
                    worstStatus = status;
            }
        }

        // Update instance health status
        var previousStatus = instance.HealthStatus;
        instance.HealthStatus = worstStatus;
        instance.LastHealthCheck = DateTime.UtcNow;

        await db.SaveChangesAsync(ct);

        // Broadcast status change only on actual transitions to avoid noisy clients.
        if (previousStatus != worstStatus)
        {
            await hub.Clients.All.SendInstanceStatusChanged(
                instance.CustomerId.ToString(), worstStatus.ToString());
        }
    }

    /// <summary>Maps engine severity to the persisted event enum; unknown values map to Critical.</summary>
    private static HealthEventStatus ToEventStatus(HealthStatus status) => status switch
    {
        HealthStatus.Healthy => HealthEventStatus.Healthy,
        HealthStatus.Degraded => HealthEventStatus.Degraded,
        HealthStatus.Critical => HealthEventStatus.Critical,
        _ => HealthEventStatus.Critical,
    };

    /// <summary>Inverse of <see cref="ToEventStatus"/>; unknown values map to Critical.</summary>
    private static HealthStatus FromEventStatus(HealthEventStatus status) => status switch
    {
        HealthEventStatus.Healthy => HealthStatus.Healthy,
        HealthEventStatus.Degraded => HealthStatus.Degraded,
        HealthEventStatus.Critical => HealthStatus.Critical,
        _ => HealthStatus.Critical,
    };
}
|