Reclaim stale resource leases and refresh heartbeats
Adds ResourceRegistry::sweep_stale and runs it on each supervisor tick so a panicked or stuck segment task can't keep a shared resource locked indefinitely. The per-segment task refreshes heartbeat on every iteration for each key in runtime.held_resources, distinguishing live owners from dead ones. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
e3e7917078
commit
ed1067f6e5
|
|
@ -31,6 +31,10 @@ use crate::{
|
||||||
const APP_NAME: &str = "operation-system";
|
const APP_NAME: &str = "operation-system";
|
||||||
const SUPERVISOR_INTERVAL_SECS: u64 = 10;
|
const SUPERVISOR_INTERVAL_SECS: u64 = 10;
|
||||||
const FAULT_TICK_MS: u64 = 500;
|
const FAULT_TICK_MS: u64 = 500;
|
||||||
|
/// Resource leases older than this with no heartbeat are reclaimed by the
|
||||||
|
/// supervisor. Three supervisor ticks is enough headroom for a slow segment
|
||||||
|
/// task to refresh, but short enough to recover quickly from panics.
|
||||||
|
const RESOURCE_LEASE_MAX_AGE_SECS: i64 = 30;
|
||||||
|
|
||||||
/// Start the engine supervisor. Mirrors the feeder entry point.
|
/// Start the engine supervisor. Mirrors the feeder entry point.
|
||||||
pub fn start(state: AppState, store: Arc<SegmentRuntimeStore>) {
|
pub fn start(state: AppState, store: Arc<SegmentRuntimeStore>) {
|
||||||
|
|
@ -66,6 +70,19 @@ async fn supervise(state: AppState, store: Arc<SegmentRuntimeStore>) {
|
||||||
}
|
}
|
||||||
Err(err) => tracing::error!("Engine supervisor: list_segments failed: {}", err),
|
Err(err) => tracing::error!("Engine supervisor: list_segments failed: {}", err),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reclaim stale resource leases (design doc §7 recovery path).
|
||||||
|
let reclaimed = state
|
||||||
|
.resource_registry
|
||||||
|
.sweep_stale(chrono::Duration::seconds(RESOURCE_LEASE_MAX_AGE_SECS))
|
||||||
|
.await;
|
||||||
|
for (key, owner) in reclaimed {
|
||||||
|
tracing::warn!(
|
||||||
|
"Engine: reclaimed stale resource '{}' previously held by segment {}",
|
||||||
|
key,
|
||||||
|
owner
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -190,6 +207,13 @@ async fn segment_task(state: AppState, store: Arc<SegmentRuntimeStore>, segment_
|
||||||
None => false,
|
None => false,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Refresh heartbeat on every tick we hold resources. Keeps the
|
||||||
|
// supervisor sweep from reclaiming a live but slow segment.
|
||||||
|
let snapshot_for_heartbeat = store.get_or_init(segment_id).await;
|
||||||
|
for key in &snapshot_for_heartbeat.held_resources {
|
||||||
|
state.resource_registry.heartbeat(key, segment_id).await;
|
||||||
|
}
|
||||||
|
|
||||||
// 5. Decide how long to sleep based on next state.
|
// 5. Decide how long to sleep based on next state.
|
||||||
let snapshot = store.get_or_init(segment_id).await;
|
let snapshot = store.get_or_init(segment_id).await;
|
||||||
if !runtime_changed && should_wait(&snapshot, segment.mode.as_str()) {
|
if !runtime_changed && should_wait(&snapshot, segment.mode.as_str()) {
|
||||||
|
|
|
||||||
|
|
@ -87,6 +87,29 @@ impl ResourceRegistry {
|
||||||
pub async fn snapshot(&self) -> HashMap<String, ResourceLease> {
|
pub async fn snapshot(&self) -> HashMap<String, ResourceLease> {
|
||||||
self.inner.read().await.clone()
|
self.inner.read().await.clone()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Drop any lease whose heartbeat is older than `max_age`. Returns the keys
|
||||||
|
/// that were reclaimed so the caller can log or alarm.
|
||||||
|
///
|
||||||
|
/// Recovery path from design doc §7 — a panicked or stuck segment task can
|
||||||
|
/// otherwise keep a public resource locked indefinitely.
|
||||||
|
pub async fn sweep_stale(
|
||||||
|
&self,
|
||||||
|
max_age: chrono::Duration,
|
||||||
|
) -> Vec<(String, Uuid)> {
|
||||||
|
let cutoff = Utc::now() - max_age;
|
||||||
|
let mut reclaimed = Vec::new();
|
||||||
|
let mut inner = self.inner.write().await;
|
||||||
|
inner.retain(|key, lease| {
|
||||||
|
if lease.heartbeat_at < cutoff {
|
||||||
|
reclaimed.push((key.clone(), lease.owner_segment_id));
|
||||||
|
false
|
||||||
|
} else {
|
||||||
|
true
|
||||||
|
}
|
||||||
|
});
|
||||||
|
reclaimed
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|
@ -124,4 +147,55 @@ mod tests {
|
||||||
assert!(!snap.contains_key("r2"));
|
assert!(!snap.contains_key("r2"));
|
||||||
assert_eq!(snap.get("r3").unwrap().owner_segment_id, seg_b);
|
assert_eq!(snap.get("r3").unwrap().owner_segment_id, seg_b);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn heartbeat_refreshes_only_owner_lease() {
|
||||||
|
let registry = ResourceRegistry::new();
|
||||||
|
let seg_a = Uuid::new_v4();
|
||||||
|
let seg_b = Uuid::new_v4();
|
||||||
|
registry.try_acquire("r1", seg_a).await;
|
||||||
|
|
||||||
|
// Force the existing lease to look ancient.
|
||||||
|
{
|
||||||
|
let mut inner = registry.inner.write().await;
|
||||||
|
if let Some(lease) = inner.get_mut("r1") {
|
||||||
|
lease.heartbeat_at = Utc::now() - chrono::Duration::seconds(120);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Other-owner heartbeat is rejected.
|
||||||
|
registry.heartbeat("r1", seg_b).await;
|
||||||
|
let before = registry.snapshot().await.get("r1").unwrap().heartbeat_at;
|
||||||
|
assert!(Utc::now() - before > chrono::Duration::seconds(60));
|
||||||
|
|
||||||
|
// Owner heartbeat updates timestamp.
|
||||||
|
registry.heartbeat("r1", seg_a).await;
|
||||||
|
let after = registry.snapshot().await.get("r1").unwrap().heartbeat_at;
|
||||||
|
assert!(Utc::now() - after < chrono::Duration::seconds(10));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn sweep_stale_reclaims_only_old_leases() {
|
||||||
|
let registry = ResourceRegistry::new();
|
||||||
|
let seg_a = Uuid::new_v4();
|
||||||
|
let seg_b = Uuid::new_v4();
|
||||||
|
registry.try_acquire("stale", seg_a).await;
|
||||||
|
registry.try_acquire("fresh", seg_b).await;
|
||||||
|
|
||||||
|
// Age out the "stale" lease.
|
||||||
|
{
|
||||||
|
let mut inner = registry.inner.write().await;
|
||||||
|
if let Some(lease) = inner.get_mut("stale") {
|
||||||
|
lease.heartbeat_at = Utc::now() - chrono::Duration::seconds(60);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let reclaimed = registry.sweep_stale(chrono::Duration::seconds(30)).await;
|
||||||
|
assert_eq!(reclaimed.len(), 1);
|
||||||
|
assert_eq!(reclaimed[0].0, "stale");
|
||||||
|
assert_eq!(reclaimed[0].1, seg_a);
|
||||||
|
|
||||||
|
let snap = registry.snapshot().await;
|
||||||
|
assert!(!snap.contains_key("stale"));
|
||||||
|
assert!(snap.contains_key("fresh"));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue