- Add ReactivatePipeline to handle subscription reactivation, including scaling Docker services, health verification, status updates, audit logging, and broadcasting status changes. - Introduce RotateCredentialsPipeline for OAuth2 credential rotation, managing the deletion of old apps, creation of new ones, credential storage, access verification, and audit logging. - Create StepRunner to manage job step execution, including lifecycle management and progress broadcasting via SignalR. - Implement SuspendPipeline for subscription suspension, scaling down services, updating statuses, logging audits, and broadcasting changes. - Add UpdateScreenLimitPipeline to update Xibo CMS screen limits and record snapshots. - Introduce XiboFeatureManifests for hardcoded feature ACLs per role. - Add docker-compose.dev.yml for local development with PostgreSQL setup.
290 lines
11 KiB
C#
290 lines
11 KiB
C#
using Microsoft.AspNetCore.SignalR;
|
|
using Microsoft.EntityFrameworkCore;
|
|
using Quartz;
|
|
using OTSSignsOrchestrator.Server.Data;
|
|
using OTSSignsOrchestrator.Server.Data.Entities;
|
|
using OTSSignsOrchestrator.Server.Hubs;
|
|
|
|
namespace OTSSignsOrchestrator.Server.Health;
|
|
|
|
/// <summary>
/// Long-running background worker that fans every registered <see cref="IHealthCheck"/>
/// out over each active <see cref="Instance"/>. Each sweep persists <see cref="HealthEvent"/>
/// rows, rolls the worst observed severity into <see cref="Instance.HealthStatus"/>,
/// pushes updates to clients through <see cref="FleetHub"/>, and kicks off auto-remediation
/// where a check supports it.
///
/// Per-instance work is handed to Quartz with staggered start times so checks are spread
/// across the interval instead of firing all at once (avoids thundering herd), and
/// simultaneous check runs are capped at 4 via <see cref="SemaphoreSlim"/>.
/// </summary>
public sealed class HealthCheckEngine : BackgroundService
{
    /// <summary>Default interval between full health-check sweeps.</summary>
    internal static readonly TimeSpan DefaultCheckInterval = TimeSpan.FromMinutes(5);

    private readonly IServiceProvider _services;
    private readonly ISchedulerFactory _schedulerFactory;
    private readonly ILogger<HealthCheckEngine> _logger;

    public HealthCheckEngine(
        IServiceProvider services,
        ISchedulerFactory schedulerFactory,
        ILogger<HealthCheckEngine> logger)
    {
        _services = services;
        _schedulerFactory = schedulerFactory;
        _logger = logger;
    }

    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        // Give the rest of the application a short head start before the first sweep.
        await Task.Delay(TimeSpan.FromSeconds(5), stoppingToken);

        var scheduler = await _schedulerFactory.GetScheduler(stoppingToken);

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                await ScheduleInstanceChecks(scheduler, stoppingToken);
            }
            catch (Exception ex) when (ex is not OperationCanceledException)
            {
                _logger.LogError(ex, "Error scheduling health check sweep");
            }

            await Task.Delay(DefaultCheckInterval, stoppingToken);
        }
    }

    /// <summary>
    /// Queries every instance whose customer is active and enqueues one staggered,
    /// one-shot Quartz job per instance so that check start times are evenly
    /// distributed across the interval.
    /// </summary>
    private async Task ScheduleInstanceChecks(IScheduler scheduler, CancellationToken ct)
    {
        await using var scope = _services.CreateAsyncScope();
        var dbContext = scope.ServiceProvider.GetRequiredService<OrchestratorDbContext>();

        var targets = await dbContext.Instances
            .AsNoTracking()
            .Include(i => i.Customer)
            .Where(i => i.Customer.Status == CustomerStatus.Active)
            .ToListAsync(ct);

        if (targets.Count == 0)
            return;

        // Only use 80 % of the interval for staggering so the sweep finishes with headroom.
        var windowMs = (int)(DefaultCheckInterval.TotalMilliseconds * 0.8);
        int gapMs;
        if (targets.Count > 1)
            gapMs = windowMs / (targets.Count - 1);
        else
            gapMs = 0;

        var index = 0;
        foreach (var target in targets)
        {
            var startDelay = TimeSpan.FromMilliseconds(gapMs * index);
            index++;

            var key = new JobKey($"health-{target.Id}", "health-checks");

            // Rescheduling must be idempotent: drop any job left over from a prior sweep.
            if (await scheduler.CheckExists(key, ct))
                await scheduler.DeleteJob(key, ct);

            var jobDetail = JobBuilder.Create<InstanceHealthCheckJob>()
                .WithIdentity(key)
                .UsingJobData("instanceId", target.Id.ToString())
                .Build();

            var oneShotTrigger = TriggerBuilder.Create()
                .WithIdentity($"health-{target.Id}-trigger", "health-checks")
                .StartAt(DateTimeOffset.UtcNow.Add(startDelay))
                .Build();

            await scheduler.ScheduleJob(jobDetail, oneShotTrigger, ct);
        }

        _logger.LogInformation(
            "Scheduled health checks for {Count} active instance(s)", targets.Count);
    }
}
|
|
|
|
/// <summary>
/// Quartz job that executes all <see cref="IHealthCheck"/> implementations for a single instance.
/// For each check it persists a <see cref="HealthEvent"/>, attempts auto-remediation for
/// Critical results when the check opts in (writing an append-only <see cref="AuditLog"/> row
/// for every attempt), aggregates the worst non-remediated severity into
/// <see cref="Instance.HealthStatus"/>, and broadcasts status transitions via <see cref="FleetHub"/>.
/// </summary>
[DisallowConcurrentExecution]
public sealed class InstanceHealthCheckJob : IJob
{
    /// <summary>Global concurrency limiter — max 4 parallel health check runs.</summary>
    private static readonly SemaphoreSlim s_concurrency = new(4);

    private readonly IServiceProvider _services;
    private readonly ILogger<InstanceHealthCheckJob> _logger;

    public InstanceHealthCheckJob(
        IServiceProvider services,
        ILogger<InstanceHealthCheckJob> logger)
    {
        _services = services;
        _logger = logger;
    }

    /// <summary>
    /// Quartz entry point. Parses the target instance id from job data, then runs the
    /// checks under the global concurrency gate. Invalid ids are logged and skipped.
    /// </summary>
    public async Task Execute(IJobExecutionContext context)
    {
        var instanceIdStr = context.MergedJobDataMap.GetString("instanceId");
        if (!Guid.TryParse(instanceIdStr, out var instanceId))
        {
            _logger.LogWarning("InstanceHealthCheckJob: invalid instanceId {Id}", instanceIdStr);
            return;
        }

        await s_concurrency.WaitAsync(context.CancellationToken);
        try
        {
            await RunChecksForInstanceAsync(instanceId, context.CancellationToken);
        }
        finally
        {
            s_concurrency.Release();
        }
    }

    /// <summary>
    /// Runs every registered check (except the globally-scheduled Authentik check) against
    /// the given instance, persists results and remediation audits in one SaveChanges,
    /// and broadcasts a status change when the aggregated severity differs from the
    /// previously stored one.
    /// </summary>
    private async Task RunChecksForInstanceAsync(Guid instanceId, CancellationToken ct)
    {
        await using var scope = _services.CreateAsyncScope();
        var db = scope.ServiceProvider.GetRequiredService<OrchestratorDbContext>();
        var hub = scope.ServiceProvider.GetRequiredService<IHubContext<FleetHub, IFleetClient>>();
        var checks = scope.ServiceProvider.GetServices<IHealthCheck>();

        // Tracked load (no AsNoTracking) — HealthStatus/LastHealthCheck are updated below.
        var instance = await db.Instances
            .Include(i => i.Customer)
            .Include(i => i.OauthAppRegistries)
            .Include(i => i.ByoiConfigs)
            .FirstOrDefaultAsync(i => i.Id == instanceId, ct);

        if (instance is null)
        {
            _logger.LogWarning("InstanceHealthCheckJob: instance {Id} not found", instanceId);
            return;
        }

        var abbrev = instance.Customer.Abbreviation;
        var worstStatus = HealthStatus.Healthy;

        foreach (var check in checks)
        {
            // Skip the AuthentikGlobalHealthCheck — it runs on its own schedule
            if (check.CheckName == "AuthentikGlobal")
                continue;

            HealthCheckResult result;
            try
            {
                result = await check.RunAsync(instance, ct);
            }
            // FIX: filter out cancellation — a check cancelled during host shutdown must
            // not be recorded as a Critical health event. (Matches the filter already
            // used by HealthCheckEngine's sweep loop.)
            catch (Exception ex) when (ex is not OperationCanceledException)
            {
                _logger.LogError(ex, "Health check {Check} failed for {Abbrev}", check.CheckName, abbrev);
                result = new HealthCheckResult(HealthStatus.Critical, $"Check threw exception: {ex.Message}");
            }

            // Persist HealthEvent
            var healthEvent = new HealthEvent
            {
                Id = Guid.NewGuid(),
                InstanceId = instanceId,
                CheckName = check.CheckName,
                Status = ToEventStatus(result.Status),
                Message = result.Message,
                Remediated = false,
                OccurredAt = DateTime.UtcNow,
            };

            // Auto-remediation: only attempted for Critical results on opted-in checks.
            if (check.AutoRemediate && result.Status == HealthStatus.Critical)
            {
                try
                {
                    var fixed_ = await check.RemediateAsync(instance, ct);
                    healthEvent.Remediated = fixed_;

                    // Append-only audit log
                    db.AuditLogs.Add(new AuditLog
                    {
                        Id = Guid.NewGuid(),
                        InstanceId = instanceId,
                        Actor = $"HealthCheckEngine:{check.CheckName}",
                        Action = "AutoRemediate",
                        Target = abbrev,
                        Outcome = fixed_ ? "Success" : "Failed",
                        Detail = result.Detail,
                        OccurredAt = DateTime.UtcNow,
                    });

                    if (fixed_)
                    {
                        _logger.LogInformation(
                            "Auto-remediated {Check} for {Abbrev}", check.CheckName, abbrev);
                        // Downgrade severity since we fixed it
                        healthEvent.Status = HealthEventStatus.Healthy;
                    }
                }
                // FIX: same cancellation filter — shutdown must not produce an
                // "Error" remediation audit row.
                catch (Exception ex) when (ex is not OperationCanceledException)
                {
                    _logger.LogError(ex, "Remediation for {Check} failed on {Abbrev}", check.CheckName, abbrev);
                    db.AuditLogs.Add(new AuditLog
                    {
                        Id = Guid.NewGuid(),
                        InstanceId = instanceId,
                        Actor = $"HealthCheckEngine:{check.CheckName}",
                        Action = "AutoRemediate",
                        Target = abbrev,
                        Outcome = "Error",
                        Detail = ex.Message,
                        OccurredAt = DateTime.UtcNow,
                    });
                }
            }

            db.HealthEvents.Add(healthEvent);

            // Track worst severity (only from non-remediated results)
            if (!healthEvent.Remediated)
            {
                var status = FromEventStatus(healthEvent.Status);
                if (status > worstStatus)
                    worstStatus = status;
            }
        }

        // Update instance health status
        var previousStatus = instance.HealthStatus;
        instance.HealthStatus = worstStatus;
        instance.LastHealthCheck = DateTime.UtcNow;

        await db.SaveChangesAsync(ct);

        // Broadcast status change only on actual transitions to avoid noisy clients.
        if (previousStatus != worstStatus)
        {
            await hub.Clients.All.SendInstanceStatusChanged(
                instance.CustomerId.ToString(), worstStatus.ToString());
        }
    }

    /// <summary>Maps engine severity to the persisted event enum; unknown values map to Critical.</summary>
    private static HealthEventStatus ToEventStatus(HealthStatus status) => status switch
    {
        HealthStatus.Healthy => HealthEventStatus.Healthy,
        HealthStatus.Degraded => HealthEventStatus.Degraded,
        HealthStatus.Critical => HealthEventStatus.Critical,
        _ => HealthEventStatus.Critical,
    };

    /// <summary>Inverse of <see cref="ToEventStatus"/>; unknown values map to Critical.</summary>
    private static HealthStatus FromEventStatus(HealthEventStatus status) => status switch
    {
        HealthEventStatus.Healthy => HealthStatus.Healthy,
        HealthEventStatus.Degraded => HealthStatus.Degraded,
        HealthEventStatus.Critical => HealthStatus.Critical,
        _ => HealthStatus.Critical,
    };
}
|