//! Flow DAG: builds a validated dependency graph for a flow's jobs and
//! tracks runtime execution state (started / completed / failed).
use serde::{Deserialize, Serialize};
|
|
use std::collections::{HashMap, HashSet, VecDeque};
|
|
use std::fmt;
|
|
|
|
use crate::{
|
|
models::{Flow, Job, ScriptType},
|
|
storage::RedisDriver,
|
|
};
|
|
|
|
pub type DagResult<T> = Result<T, DagError>;
|
|
|
|
/// Errors produced while building a flow DAG or mutating its execution state.
#[derive(Debug)]
pub enum DagError {
    /// Underlying storage failure while loading the flow or one of its jobs.
    Storage(Box<dyn std::error::Error + Send + Sync>),
    /// `job` lists `depends_on` as a dependency, but `depends_on` is not in the flow's job list.
    MissingDependency { job: u32, depends_on: u32 },
    /// The dependency graph is not acyclic; `remaining` holds the job ids left unresolved.
    CycleDetected { remaining: Vec<u32> },
    /// The referenced job id is not a node of this DAG.
    UnknownJob { job: u32 },
    /// `job` cannot start: the ids in `missing` are dependencies not yet completed.
    DependenciesIncomplete { job: u32, missing: Vec<u32> },
    /// The flow as a whole is failed because `failed_job` failed.
    FlowFailed { failed_job: u32 },
    /// `job` cannot be completed because it was never marked as started.
    JobNotStarted { job: u32 },
}
impl fmt::Display for DagError {
|
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
match self {
|
|
DagError::Storage(e) => write!(f, "Storage error: {}", e),
|
|
DagError::MissingDependency { job, depends_on } => write!(
|
|
f,
|
|
"Job {} depends on {}, which is not part of the flow.jobs list",
|
|
job, depends_on
|
|
),
|
|
DagError::CycleDetected { remaining } => {
|
|
write!(f, "Cycle detected; unresolved nodes: {:?}", remaining)
|
|
}
|
|
DagError::UnknownJob { job } => write!(f, "Unknown job id: {}", job),
|
|
DagError::DependenciesIncomplete { job, missing } => write!(
|
|
f,
|
|
"Job {} cannot start; missing completed deps: {:?}",
|
|
job, missing
|
|
),
|
|
DagError::FlowFailed { failed_job } => {
|
|
write!(f, "Flow failed due to job {}", failed_job)
|
|
}
|
|
DagError::JobNotStarted { job } => write!(
|
|
f,
|
|
"Job {} cannot be completed because it is not marked as started",
|
|
job
|
|
),
|
|
}
|
|
}
|
|
}
|
|
|
|
// Marker impl: lets DagError be boxed/propagated as a standard error trait object.
impl std::error::Error for DagError {}
impl From<Box<dyn std::error::Error + Send + Sync>> for DagError {
|
|
fn from(e: Box<dyn std::error::Error + Send + Sync>) -> Self {
|
|
DagError::Storage(e)
|
|
}
|
|
}
|
|
|
|
/// Minimal, serializable view of a job used as a DAG node.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct JobSummary {
    /// Job id — the same key used in `FlowDag::nodes`.
    pub id: u32,
    /// Ids of jobs that must complete before this one may start.
    pub depends: Vec<u32>,
    /// Prerequisite strings copied verbatim from `Job::prerequisites`.
    pub prerequisites: Vec<String>,
    /// Script type copied from `Job::script_type`.
    pub script_type: ScriptType,
}
/// A validated dependency DAG for one flow, plus its runtime execution state.
///
/// Built by `build_flow_dag`. The whole structure (graph + `started` /
/// `completed` / `failed_job` state) is serializable, so execution progress
/// travels with the DAG.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FlowDag {
    pub flow_id: u32,
    pub caller_id: u32,
    pub context_id: u32,
    /// Per-job summaries, keyed by job id.
    pub nodes: HashMap<u32, JobSummary>,
    pub edges: Vec<(u32, u32)>, // (from prerequisite, to job)
    pub reverse_edges: Vec<(u32, u32)>, // (from job, to prerequisite)
    pub roots: Vec<u32>, // in_degree == 0
    pub leaves: Vec<u32>, // out_degree == 0
    pub levels: Vec<Vec<u32>>, // topological layers for parallel execution
    // Runtime execution state
    /// Jobs marked started and not yet completed.
    pub started: HashSet<u32>,
    /// Jobs that completed successfully (removed from `started` on completion).
    pub completed: HashSet<u32>,
    /// First job recorded as failed, if any; once set, the whole flow is failed.
    pub failed_job: Option<u32>,
}
pub async fn build_flow_dag(
|
|
redis: &RedisDriver,
|
|
context_id: u32,
|
|
flow_id: u32,
|
|
) -> DagResult<FlowDag> {
|
|
// Load flow
|
|
let flow: Flow = redis
|
|
.load_flow(context_id, flow_id)
|
|
.await
|
|
.map_err(DagError::from)?;
|
|
let caller_id = flow.caller_id();
|
|
let flow_job_ids = flow.jobs();
|
|
|
|
// Build a set for faster membership tests
|
|
let job_id_set: HashSet<u32> = flow_job_ids.iter().copied().collect();
|
|
|
|
// Load all jobs
|
|
let mut jobs: HashMap<u32, Job> = HashMap::with_capacity(flow_job_ids.len());
|
|
for jid in flow_job_ids {
|
|
let job = redis
|
|
.load_job(context_id, caller_id, *jid)
|
|
.await
|
|
.map_err(DagError::from)?;
|
|
jobs.insert(*jid, job);
|
|
}
|
|
|
|
// Validate dependencies and construct adjacency
|
|
let mut edges: Vec<(u32, u32)> = Vec::new();
|
|
let mut reverse_edges: Vec<(u32, u32)> = Vec::new();
|
|
let mut adj: HashMap<u32, Vec<u32>> = HashMap::with_capacity(jobs.len());
|
|
let mut rev_adj: HashMap<u32, Vec<u32>> = HashMap::with_capacity(jobs.len());
|
|
let mut in_degree: HashMap<u32, usize> = HashMap::with_capacity(jobs.len());
|
|
|
|
for &jid in flow_job_ids {
|
|
adj.entry(jid).or_default();
|
|
rev_adj.entry(jid).or_default();
|
|
in_degree.entry(jid).or_insert(0);
|
|
}
|
|
|
|
for (&jid, job) in &jobs {
|
|
for &dep in job.depends() {
|
|
if !job_id_set.contains(&dep) {
|
|
return Err(DagError::MissingDependency {
|
|
job: jid,
|
|
depends_on: dep,
|
|
});
|
|
}
|
|
// edge: dep -> jid
|
|
edges.push((dep, jid));
|
|
reverse_edges.push((jid, dep));
|
|
adj.get_mut(&dep).unwrap().push(jid);
|
|
rev_adj.get_mut(&jid).unwrap().push(dep);
|
|
*in_degree.get_mut(&jid).unwrap() += 1;
|
|
}
|
|
}
|
|
|
|
// Kahn's algorithm for topological sorting, with level construction
|
|
let mut zero_in: VecDeque<u32> = in_degree
|
|
.iter()
|
|
.filter_map(|(k, v)| if *v == 0 { Some(*k) } else { None })
|
|
.collect();
|
|
|
|
let mut processed_count = 0usize;
|
|
let mut levels: Vec<Vec<u32>> = Vec::new();
|
|
|
|
// To make deterministic, sort initial zero_in
|
|
{
|
|
let mut tmp: Vec<u32> = zero_in.iter().copied().collect();
|
|
tmp.sort_unstable();
|
|
zero_in = tmp.into_iter().collect();
|
|
}
|
|
|
|
while !zero_in.is_empty() {
|
|
let mut level: Vec<u32> = Vec::new();
|
|
// drain current frontier
|
|
let mut next_zero: Vec<u32> = Vec::new();
|
|
let mut current_frontier: Vec<u32> = zero_in.drain(..).collect();
|
|
current_frontier.sort_unstable();
|
|
for u in current_frontier {
|
|
level.push(u);
|
|
processed_count += 1;
|
|
if let Some(children) = adj.get(&u) {
|
|
let mut sorted_children = children.clone();
|
|
sorted_children.sort_unstable();
|
|
for &v in &sorted_children {
|
|
let d = in_degree.get_mut(&v).unwrap();
|
|
*d -= 1;
|
|
if *d == 0 {
|
|
next_zero.push(v);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
next_zero.sort_unstable();
|
|
zero_in = next_zero.into_iter().collect();
|
|
levels.push(level);
|
|
}
|
|
|
|
if processed_count != jobs.len() {
|
|
let remaining: Vec<u32> = in_degree
|
|
.into_iter()
|
|
.filter_map(|(k, v)| if v > 0 { Some(k) } else { None })
|
|
.collect();
|
|
return Err(DagError::CycleDetected { remaining });
|
|
}
|
|
|
|
// Roots and leaves
|
|
let roots: Vec<u32> = levels.first().cloned().unwrap_or_default();
|
|
let leaves: Vec<u32> = adj
|
|
.iter()
|
|
.filter_map(|(k, v)| if v.is_empty() { Some(*k) } else { None })
|
|
.collect();
|
|
|
|
// Nodes map (JobSummary)
|
|
let mut nodes: HashMap<u32, JobSummary> = HashMap::with_capacity(jobs.len());
|
|
for (&jid, job) in &jobs {
|
|
let summary = JobSummary {
|
|
id: jid,
|
|
depends: job.depends().to_vec(),
|
|
prerequisites: job.prerequisites().to_vec(),
|
|
script_type: job.script_type(),
|
|
};
|
|
nodes.insert(jid, summary);
|
|
}
|
|
|
|
// Sort edges deterministically
|
|
edges.sort_unstable();
|
|
reverse_edges.sort_unstable();
|
|
|
|
let dag = FlowDag {
|
|
flow_id,
|
|
caller_id,
|
|
context_id,
|
|
nodes,
|
|
edges,
|
|
reverse_edges,
|
|
roots,
|
|
leaves,
|
|
levels,
|
|
started: HashSet::new(),
|
|
completed: HashSet::new(),
|
|
failed_job: None,
|
|
};
|
|
|
|
Ok(dag)
|
|
}
|
|
|
|
impl FlowDag {
|
|
/// Return all jobs that are ready to be processed.
|
|
/// A job is ready if:
|
|
/// - it exists in the DAG
|
|
/// - it is not already started or completed
|
|
/// - it has no dependencies, or all dependencies are completed
|
|
///
|
|
/// If any job has failed, the entire flow is considered failed and an error is returned.
|
|
pub fn ready_jobs(&self) -> DagResult<Vec<u32>> {
|
|
if let Some(failed_job) = self.failed_job {
|
|
return Err(DagError::FlowFailed { failed_job });
|
|
}
|
|
|
|
let mut ready: Vec<u32> = Vec::new();
|
|
for (&jid, summary) in &self.nodes {
|
|
if self.completed.contains(&jid) || self.started.contains(&jid) {
|
|
continue;
|
|
}
|
|
let mut deps_ok = true;
|
|
for dep in &summary.depends {
|
|
if !self.completed.contains(dep) {
|
|
deps_ok = false;
|
|
break;
|
|
}
|
|
}
|
|
if deps_ok {
|
|
ready.push(jid);
|
|
}
|
|
}
|
|
ready.sort_unstable();
|
|
Ok(ready)
|
|
}
|
|
|
|
/// Mark a job as started.
|
|
/// Strict validation rules:
|
|
/// - Unknown jobs are rejected with UnknownJob
|
|
/// - If the flow has already failed, return FlowFailed
|
|
/// - If the job is already started or completed, this is a no-op (idempotent)
|
|
/// - If any dependency is not completed, return DependenciesIncomplete with the missing deps
|
|
pub fn mark_job_started(&mut self, job: u32) -> DagResult<()> {
|
|
if !self.nodes.contains_key(&job) {
|
|
return Err(DagError::UnknownJob { job });
|
|
}
|
|
if self.completed.contains(&job) || self.started.contains(&job) {
|
|
return Ok(());
|
|
}
|
|
if let Some(failed_job) = self.failed_job {
|
|
return Err(DagError::FlowFailed { failed_job });
|
|
}
|
|
|
|
let summary = self.nodes.get(&job).expect("checked contains_key");
|
|
let missing: Vec<u32> = summary
|
|
.depends
|
|
.iter()
|
|
.copied()
|
|
.filter(|d| !self.completed.contains(d))
|
|
.collect();
|
|
|
|
if !missing.is_empty() {
|
|
return Err(DagError::DependenciesIncomplete { job, missing });
|
|
}
|
|
|
|
self.started.insert(job);
|
|
Ok(())
|
|
}
|
|
|
|
/// Mark a job as completed.
|
|
/// Strict validation rules:
|
|
/// - Unknown jobs are rejected with UnknownJob
|
|
/// - If the job is already completed, this is a no-op (idempotent)
|
|
/// - If the flow has already failed, return FlowFailed
|
|
/// - If the job was not previously started, return JobNotStarted
|
|
pub fn mark_job_completed(&mut self, job: u32) -> DagResult<()> {
|
|
if !self.nodes.contains_key(&job) {
|
|
return Err(DagError::UnknownJob { job });
|
|
}
|
|
if self.completed.contains(&job) {
|
|
return Ok(());
|
|
}
|
|
if let Some(failed_job) = self.failed_job {
|
|
return Err(DagError::FlowFailed { failed_job });
|
|
}
|
|
if !self.started.contains(&job) {
|
|
return Err(DagError::JobNotStarted { job });
|
|
}
|
|
|
|
self.started.remove(&job);
|
|
self.completed.insert(job);
|
|
Ok(())
|
|
}
|
|
|
|
/// Mark a job as failed.
|
|
/// Behavior:
|
|
/// - Unknown jobs are rejected with UnknownJob
|
|
/// - If a failure is already recorded:
|
|
/// - If it is the same job, no-op (idempotent)
|
|
/// - If it is a different job, return FlowFailed with the already-failed job
|
|
/// - Otherwise record this job as the failed job
|
|
pub fn mark_job_failed(&mut self, job: u32) -> DagResult<()> {
|
|
if !self.nodes.contains_key(&job) {
|
|
return Err(DagError::UnknownJob { job });
|
|
}
|
|
match self.failed_job {
|
|
Some(existing) if existing == job => Ok(()),
|
|
Some(existing) => Err(DagError::FlowFailed {
|
|
failed_job: existing,
|
|
}),
|
|
None => {
|
|
self.failed_job = Some(job);
|
|
Ok(())
|
|
}
|
|
}
|
|
}
|
|
}
|