fix coordinator compilation

Timur Gordon
2025-11-14 00:35:26 +01:00
parent 84545f0d75
commit 94a66d9af4
15 changed files with 397 additions and 459 deletions

View File

@@ -38,6 +38,9 @@ base64 = "0.22.1"
 tracing.workspace = true
 tracing-subscriber.workspace = true
+# Time
+chrono.workspace = true
+
 # Hero dependencies
 hero-job = { path = "../../lib/models/job" }
 hero-supervisor-openrpc-client = { path = "../../lib/clients/supervisor" }

View File

@@ -1,12 +0,0 @@
-// Re-export from the supervisor client library
-pub use hero_supervisor_openrpc_client::{
-    SupervisorClient,
-    ClientError as SupervisorClientError,
-    transports::{
-        MyceliumClient,
-        MyceliumClientError,
-        SupervisorHub,
-        Destination,
-        MyceliumTransport,
-    },
-};

View File

@@ -3,7 +3,7 @@ use std::collections::{HashMap, HashSet, VecDeque};
 use std::fmt;
 use crate::{
-    models::{Flow, Job, JobStatus, ScriptType},
+    models::{Flow, Job, JobStatus},
     storage::RedisDriver,
 };
@@ -58,12 +58,25 @@ impl From<Box<dyn std::error::Error + Send + Sync>> for DagError {
     }
 }
 
+/// Node execution status - tracks the state of a job in the DAG workflow
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub enum NodeStatus {
+    Pending,    // Not yet ready to execute (waiting for dependencies)
+    Ready,      // Dependencies met, ready to be dispatched
+    Dispatched, // Sent to supervisor for execution
+    Running,    // Currently executing
+    Completed,  // Successfully completed
+    Failed,     // Execution failed
+    Cancelled,  // Execution was cancelled
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct JobSummary {
+pub struct FlowNode {
     pub id: u32,
     pub depends: Vec<u32>,
     pub prerequisites: Vec<String>,
-    pub script_type: ScriptType,
+    pub supervisor_url: String,  // URL of the supervisor to route this job to
+    pub node_status: NodeStatus, // Track execution status at the node level
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -71,7 +84,7 @@ pub struct FlowDag {
     pub flow_id: u32,
     pub caller_id: u32,
     pub context_id: u32,
-    pub nodes: HashMap<u32, JobSummary>,
+    pub nodes: HashMap<u32, FlowNode>,
     pub edges: Vec<(u32, u32)>,         // (from prerequisite, to job)
     pub reverse_edges: Vec<(u32, u32)>, // (from job, to prerequisite)
     pub roots: Vec<u32>,                // in_degree == 0
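As a rough illustration of how the new types fit together (not part of this commit, assuming only the definitions above): a node's readiness can be derived from its dependency list and the DAG's completed set.

    use std::collections::HashSet;

    // Sketch only: a Pending node becomes dispatchable once every dependency id
    // appears in the completed set.
    fn is_ready(node: &FlowNode, completed: &HashSet<u32>) -> bool {
        node.node_status == NodeStatus::Pending
            && node.depends.iter().all(|dep| completed.contains(dep))
    }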
@@ -122,8 +135,23 @@ pub async fn build_flow_dag(
         in_degree.entry(jid).or_insert(0);
     }
 
-    for (&jid, job) in &jobs {
-        for &dep in job.depends() {
+    // Build nodes first with their dependencies
+    // TODO: Load node dependencies from Flow metadata or separate storage
+    let mut nodes: HashMap<u32, FlowNode> = HashMap::with_capacity(jobs.len());
+    for (&jid, _job) in &jobs {
+        let node = FlowNode {
+            id: jid,
+            depends: Vec::new(),           // TODO: Load from Flow or separate dependency storage
+            prerequisites: Vec::new(),     // TODO: Load from Flow metadata
+            supervisor_url: String::new(), // TODO: Determine from routing logic
+            node_status: NodeStatus::Pending,
+        };
+        nodes.insert(jid, node);
+    }
+
+    // Build edges from node dependencies
+    for (&jid, node) in &nodes {
+        for &dep in &node.depends {
             if !job_id_set.contains(&dep) {
                 return Err(DagError::MissingDependency {
                     job: jid,
@@ -196,44 +224,31 @@ pub async fn build_flow_dag(
         .filter_map(|(k, v)| if v.is_empty() { Some(*k) } else { None })
         .collect();
 
-    // Nodes map (JobSummary)
-    let mut nodes: HashMap<u32, JobSummary> = HashMap::with_capacity(jobs.len());
-    for (&jid, job) in &jobs {
-        let summary = JobSummary {
-            id: jid,
-            depends: job.depends().to_vec(),
-            prerequisites: job.prerequisites().to_vec(),
-            script_type: job.script_type(),
-        };
-        nodes.insert(jid, summary);
-    }
-
     // Sort edges deterministically
     edges.sort_unstable();
     reverse_edges.sort_unstable();
 
-    // Populate runtime execution state from persisted Job.status()
+    // Populate runtime execution state from FlowNode status
     let mut started_set: HashSet<u32> = HashSet::new();
     let mut completed_set: HashSet<u32> = HashSet::new();
     let mut error_ids: Vec<u32> = Vec::new();
-    for (&jid, job) in &jobs {
-        match job.status() {
-            JobStatus::Finished => {
+    for (&jid, node) in &nodes {
+        match node.node_status {
+            NodeStatus::Completed => {
                 completed_set.insert(jid);
             }
-            JobStatus::Started => {
+            NodeStatus::Running => {
                 started_set.insert(jid);
             }
-            JobStatus::Dispatched => {
-                // Consider Dispatched as "in-flight" for DAG runtime started set,
-                // so queued/running work is visible in periodic snapshots.
+            NodeStatus::Dispatched => {
+                // Consider Dispatched as "in-flight" for DAG runtime started set
                 started_set.insert(jid);
             }
-            JobStatus::Error => {
+            NodeStatus::Failed => {
                 error_ids.push(jid);
             }
-            JobStatus::WaitingForPrerequisites => {
+            NodeStatus::Pending | NodeStatus::Ready | NodeStatus::Cancelled => {
                 // Neither started nor completed
             }
         }
@@ -304,7 +319,7 @@ impl FlowDag {
     /// - If the flow has already failed, return FlowFailed
     /// - If the job is already started or completed, this is a no-op (idempotent)
     /// - If any dependency is not completed, return DependenciesIncomplete with the missing deps
-    pub fn mark_job_started(&mut self, job: u32) -> DagResult<()> {
+    pub fn mark_node_started(&mut self, job: u32) -> DagResult<()> {
         if !self.nodes.contains_key(&job) {
             return Err(DagError::UnknownJob { job });
         }
@@ -337,7 +352,7 @@ impl FlowDag {
     /// - If the job is already completed, this is a no-op (idempotent)
     /// - If the flow has already failed, return FlowFailed
     /// - If the job was not previously started, return JobNotStarted
-    pub fn mark_job_completed(&mut self, job: u32) -> DagResult<()> {
+    pub fn mark_node_completed(&mut self, job: u32) -> DagResult<()> {
         if !self.nodes.contains_key(&job) {
             return Err(DagError::UnknownJob { job });
         }
@@ -363,7 +378,7 @@ impl FlowDag {
     /// - If it is the same job, no-op (idempotent)
     /// - If it is a different job, return FlowFailed with the already-failed job
     /// - Otherwise record this job as the failed job
-    pub fn mark_job_failed(&mut self, job: u32) -> DagResult<()> {
+    pub fn mark_node_failed(&mut self, job: u32) -> DagResult<()> {
         if !self.nodes.contains_key(&job) {
             return Err(DagError::UnknownJob { job });
         }

View File

@@ -1,4 +1,3 @@
-pub mod clients;
 pub mod dag;
 pub mod models;
 pub mod router;

View File

@@ -103,10 +103,10 @@ async fn main() {
     {
         let base_url = format!("http://{}:{}", cli.mycelium_ip, cli.mycelium_port);
         let mycelium = Arc::new(
-            hero_coordinator::clients::MyceliumClient::new(&base_url)
+            hero_supervisor_openrpc_client::transports::MyceliumClient::new(&base_url)
                 .expect("Failed to create MyceliumClient")
         );
-        let hub = hero_coordinator::clients::SupervisorHub::new_with_client(
+        let hub = hero_supervisor_openrpc_client::transports::SupervisorHub::new_with_client(
             mycelium,
             "supervisor.rpc".to_string(),
         );

View File

@@ -1,16 +1,12 @@
-mod actor;
 mod context;
 mod flow;
 mod message;
 mod runner;
-mod script_type;
 
-pub use actor::Actor;
 pub use context::Context;
 pub use flow::{Flow, FlowStatus};
 pub use message::{Message, MessageFormatType, MessageStatus, MessageType, TransportStatus};
 pub use runner::Runner;
-pub use script_type::ScriptType;
 
 // Re-export Job types from hero_job
 pub use hero_job::{Job, JobStatus, JobError, JobResult, JobBuilder, JobSignature};

View File

@@ -1,15 +0,0 @@
-use std::net::IpAddr;
-
-use serde::{Deserialize, Serialize};
-
-use crate::time::Timestamp;
-
-#[derive(Serialize, Deserialize, Clone)]
-pub struct Actor {
-    id: u32,
-    pubkey: String,
-    /// IP where the actor is reachable, can be mycelium but that is not mandatory
-    address: Vec<IpAddr>,
-    created_at: Timestamp,
-    updated_at: Timestamp,
-}

View File

@@ -1,7 +1,8 @@
 use serde::{Deserialize, Serialize};
 
 use crate::{
-    models::{Job, ScriptType},
+    dag::FlowNode,
+    models::Job,
     time::Timestamp,
 };
@@ -13,8 +14,10 @@ pub struct Message {
     pub caller_id: u32,
     /// Id of the context in which this message was sent
     pub context_id: u32,
+    /// Id of the flow this message belongs to (for DAG tracking)
+    pub flow_id: u32,
     pub message: String,
-    pub message_type: ScriptType,
+    pub message_type: String, // Deprecated, use job.executor instead
     pub message_format_type: MessageFormatType,
     /// Seconds for the message to arrive at the destination
     pub timeout: u32,
@@ -28,6 +31,9 @@ pub struct Message {
     /// Latest transport status as reported by Mycelium
     pub transport_status: Option<TransportStatus>,
+    /// FlowNodes containing routing and dependency info
+    pub nodes: Vec<FlowNode>,
+    /// Legacy: Jobs for backward compatibility (TODO: remove after full migration)
     pub job: Vec<Job>,
     pub logs: Vec<Log>,
     pub created_at: Timestamp,
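A small sketch (an assumption, not from this commit) of how a consumer might read the new routing fields off a Message: the first FlowNode, if present, carries the supervisor URL, mirroring what the router does further below.

    // Sketch: prefer FlowNode routing; None signals that no node is attached.
    fn routing_url(msg: &Message) -> Option<&str> {
        msg.nodes.first().map(|node| node.supervisor_url.as_str())
    }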

View File

@@ -2,7 +2,6 @@ use std::net::IpAddr;
 use serde::{Deserialize, Serialize};
 
-use crate::models::ScriptType;
 use crate::time::Timestamp;
 
 #[derive(Serialize, Deserialize, Clone)]
@@ -14,8 +13,8 @@ pub struct Runner {
     pub address: IpAddr,
     /// Needs to be set by the runner, usually `runner<runnerid`
     pub topic: String,
-    /// The script type this runner can execute; used for routing
-    pub script_type: ScriptType,
+    /// The executor this runner can handle (e.g., "python", "rhai"); used for routing
+    pub executor: String,
     /// If this is true, the runner also listens on a local redis queue
     pub local: bool,
     /// Optional secret used for authenticated supervisor calls (if required)
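Since ScriptType is gone, any executor-based selection now compares plain strings. A hypothetical fallback matcher (not in this commit; the new architecture routes via FlowNode.supervisor_url instead) would look like:

    // Sketch: pick the first runner whose executor string matches the job's executor.
    fn pick_runner<'a>(runners: &'a [Runner], executor: &str) -> Option<&'a Runner> {
        runners.iter().find(|r| r.executor == executor)
    }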

View File

@@ -1,9 +0,0 @@
-use serde::{Deserialize, Serialize};
-
-#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)]
-pub enum ScriptType {
-    Osis,
-    Sal,
-    V,
-    Python,
-}

View File

@@ -11,10 +11,13 @@ use std::hash::{Hash, Hasher};
 use tokio::sync::{Mutex, Semaphore};
 
 use crate::{
-    clients::{Destination, MyceliumClient, MyceliumTransport, SupervisorClient, SupervisorHub},
-    models::{Job, JobStatus, Message, MessageStatus, ScriptType, TransportStatus},
+    models::{Job, JobStatus, Message, MessageStatus, TransportStatus},
     service::AppService,
 };
+use hero_supervisor_openrpc_client::{
+    SupervisorClient,
+    transports::{Destination, MyceliumClient, MyceliumTransport, SupervisorHub},
+};
 use tracing::{error, info};
 
 #[derive(Clone)]
@@ -197,44 +200,64 @@ async fn deliver_one(
     // Load message
     let msg: Message = service.load_message(context_id, caller_id, id).await?;
 
-    // Embedded job id (if any)
-    let job_id_opt: Option<u32> = msg.job.first().map(|j| j.id);
+    // Determine routing from FlowNode.supervisor_url if available
+    let supervisor_url = if !msg.nodes.is_empty() {
+        // Use FlowNode routing (new architecture)
+        msg.nodes[0].supervisor_url.clone()
+    } else {
+        // Fallback: get first available runner (legacy)
+        let runners = service.scan_runners(context_id).await?;
+        let Some(runner) = runners.into_iter().next() else {
+            let log = format!(
+                "No runners available in context {} for message {}",
+                context_id, msg_key
+            );
+            let _ = service
+                .append_message_logs(context_id, caller_id, id, vec![log.clone()])
+                .await;
+            let _ = service
+                .update_message_status(context_id, caller_id, id, MessageStatus::Error)
+                .await;
+            return Err(log.into());
+        };
+        // Build URL from runner
+        if !runner.pubkey.trim().is_empty() {
+            format!("mycelium://{}", runner.pubkey)
+        } else {
+            format!("http://{}", runner.address)
+        }
+    };
 
-    // Determine routing script_type
-    let desired: ScriptType = determine_script_type(&msg);
-
-    // Discover runners and select a matching one
-    let runners = service.scan_runners(context_id).await?;
-    let Some(runner) = runners.into_iter().find(|r| r.script_type == desired) else {
-        let log = format!(
-            "No runner with script_type {:?} available in context {} for message {}",
-            desired, context_id, msg_key
-        );
-        let _ = service
-            .append_message_logs(context_id, caller_id, id, vec![log.clone()])
-            .await;
-        let _ = service
-            .update_message_status(context_id, caller_id, id, MessageStatus::Error)
-            .await;
-        return Err(log.into());
+    // Parse supervisor_url to determine destination
+    // Format: "mycelium://<pubkey>" or "http://<address>" or just "<address>"
+    let dest = if supervisor_url.starts_with("mycelium://") {
+        let pubkey = supervisor_url.strip_prefix("mycelium://").unwrap_or("");
+        Destination::Pk(pubkey.to_string())
+    } else {
+        // Extract address (strip http:// or https:// if present)
+        let address_str = supervisor_url
+            .strip_prefix("http://")
+            .or_else(|| supervisor_url.strip_prefix("https://"))
+            .unwrap_or(&supervisor_url);
+
+        // Parse IP address (strip port if present)
+        let ip_str = address_str.split(':').next().unwrap_or(address_str);
+        let ip_addr = ip_str.parse().unwrap_or_else(|_| {
+            // Default to localhost if parsing fails
+            std::net::IpAddr::V4(std::net::Ipv4Addr::new(127, 0, 0, 1))
+        });
+        Destination::Ip(ip_addr)
     };
 
     // Build SupervisorClient
-    let dest = if !runner.pubkey.trim().is_empty() {
-        Destination::Pk(runner.pubkey.clone())
-    } else {
-        Destination::Ip(runner.address)
-    };
-    // Keep clones for poller usage
-    let dest_for_poller = dest.clone();
-    let topic_for_poller = cfg.topic.clone();
-    let secret_for_poller = runner.secret.clone();
     let client = cache
         .get_or_create(
             sup_hub.clone(),
             dest.clone(),
             cfg.topic.clone(),
-            runner.secret.clone(),
+            None, // TODO: Get secret from runner or config
         )
         .await;
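The URL parsing above can be read as a standalone helper; a minimal sketch (not part of this commit) under the same assumptions: "mycelium://<pubkey>" routes by public key, anything else is treated as an HTTP(S) address, and unparseable IPs fall back to 127.0.0.1.

    use std::net::{IpAddr, Ipv4Addr};

    fn parse_destination(supervisor_url: &str) -> Destination {
        if let Some(pubkey) = supervisor_url.strip_prefix("mycelium://") {
            return Destination::Pk(pubkey.to_string());
        }
        let address = supervisor_url
            .strip_prefix("http://")
            .or_else(|| supervisor_url.strip_prefix("https://"))
            .unwrap_or(supervisor_url);
        let ip = address
            .split(':')
            .next()
            .unwrap_or(address)
            .parse()
            .unwrap_or(IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)));
        Destination::Ip(ip)
    }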
@@ -244,11 +267,44 @@ async fn deliver_one(
     // Send via the new client API
     // The transport handles message correlation internally
-    let _result = if method == "job.run" {
+    let job_result = if method == "job.run" {
         if let Some(j) = msg.job.first() {
             // Use typed job_run method
             let job = serde_json::from_value(job_to_json(j)?)?;
-            client.job_run(job, None).await?;
+            let result = client.job_run(job, None).await;
+
+            // Update node status based on result
+            if !msg.nodes.is_empty() {
+                let node_id = msg.nodes[0].id;
+                let flow_id = msg.flow_id;
+                match &result {
+                    Ok(_) => {
+                        // Job completed successfully
+                        let _ = service
+                            .update_node_status_unchecked(
+                                context_id,
+                                flow_id,
+                                node_id,
+                                crate::dag::NodeStatus::Completed,
+                            )
+                            .await;
+                    }
+                    Err(_) => {
+                        // Job failed
+                        let _ = service
+                            .update_node_status_unchecked(
+                                context_id,
+                                flow_id,
+                                node_id,
+                                crate::dag::NodeStatus::Failed,
+                            )
+                            .await;
+                    }
+                }
+            }
+
+            result?;
             serde_json::Value::Null
         } else {
             // Generic call - not supported in new API, would need custom implementation
@@ -277,19 +333,16 @@ async fn deliver_one(
         .update_message_status(context_id, caller_id, id, MessageStatus::Acknowledged)
         .await?;
 
-    // For job.run, mark the job as dispatched
+    // Log job completion
     if method == "job.run" {
-        if let Some(job_id) = msg.job.first().map(|j| j.id) {
-            let _ = service
-                .update_job_status_unchecked(context_id, caller_id, job_id, JobStatus::Dispatched)
-                .await;
+        if let Some(job_id) = msg.job.first().map(|j| j.id.parse::<u32>().unwrap_or(0)) {
             let _ = service
                 .append_message_logs(
                     context_id,
                     caller_id,
                     id,
                     vec![format!(
-                        "Supervisor reply for job {}: job_queued (processed synchronously)",
+                        "Job {} completed successfully",
                         job_id
                     )],
                 )
@@ -304,13 +357,7 @@ async fn deliver_one(
     Ok(())
 }
 
-fn determine_script_type(msg: &Message) -> ScriptType {
-    // Prefer embedded job's script_type if available, else fallback to message.message_type
-    match msg.job.first() {
-        Some(j) => j.script_type.clone(),
-        None => msg.message_type.clone(),
-    }
-}
+// Removed determine_executor - routing now based on FlowNode.supervisor_url
 
 fn build_params(msg: &Message) -> Result<Value, Box<dyn std::error::Error + Send + Sync>> {
     // Minimal mapping:
View File

@@ -15,8 +15,8 @@ use serde_json::{Value, json};
 use crate::{
     dag::{DagError, FlowDag},
     models::{
-        Actor, Context, Flow, FlowStatus, Job, JobStatus, Message, MessageFormatType,
-        MessageStatus, Runner, ScriptType,
+        Context, Flow, FlowStatus, Job, JobStatus, Message, MessageFormatType,
+        MessageStatus, Runner,
     },
     service::AppService,
     time::current_timestamp,
@@ -92,25 +92,13 @@ fn dag_err(e: DagError) -> ErrorObjectOwned {
 // Create DTOs and Param wrappers
 // -----------------------------
 
-#[derive(Debug, Deserialize)]
-pub struct ActorCreate {
-    pub id: u32,
-    pub pubkey: String,
-    pub address: Vec<IpAddr>,
-}
-
-impl ActorCreate {
-    pub fn into_domain(self) -> Result<Actor, String> {
-        let ts = current_timestamp();
-        let v = json!({
-            "id": self.id,
-            "pubkey": self.pubkey,
-            "address": self.address,
-            "created_at": ts,
-            "updated_at": ts,
-        });
-        serde_json::from_value(v).map_err(|e| e.to_string())
-    }
-}
+// Actor was renamed to Runner - ActorCreate is deprecated
+// #[derive(Debug, Deserialize)]
+// pub struct ActorCreate {
+//     pub id: u32,
+//     pub pubkey: String,
+//     pub address: Vec<IpAddr>,
+// }
 
 #[derive(Debug, Deserialize)]
 pub struct ContextCreate {
@@ -147,8 +135,8 @@ pub struct RunnerCreate {
     pub pubkey: String,
     pub address: IpAddr,
     pub topic: String,
-    /// The script type this runner executes (used for routing)
-    pub script_type: ScriptType,
+    /// The executor this runner can handle (e.g., "python", "rhai")
+    pub executor: String,
     pub local: bool,
     /// Optional secret used for authenticated supervisor calls (if required)
     pub secret: Option<String>,
@@ -162,7 +150,7 @@ impl RunnerCreate {
             pubkey,
             address,
             topic,
-            script_type,
+            executor,
             local,
             secret,
         } = self;
@@ -172,7 +160,7 @@ impl RunnerCreate {
             pubkey,
             address,
             topic,
-            script_type,
+            executor,
             local,
             secret,
             created_at: ts,
@@ -222,7 +210,8 @@ pub struct JobCreate {
     pub caller_id: u32,
     pub context_id: u32,
     pub script: String,
-    pub script_type: ScriptType,
+    pub runner: Option<String>,
+    pub executor: Option<String>,
     pub timeout: u32,
     pub retries: u8,
     pub env_vars: HashMap<String, String>,
@@ -232,37 +221,24 @@ pub struct JobCreate {
 impl JobCreate {
     pub fn into_domain(self) -> Job {
-        let ts = current_timestamp();
-        let JobCreate {
-            id,
-            caller_id,
-            context_id,
-            script,
-            script_type,
-            timeout,
-            retries,
-            env_vars,
-            prerequisites,
-            depends,
-        } = self;
-
+        use chrono::Utc;
+        // Convert old format to hero_job::Job
+        // Note: depends and prerequisites are workflow fields that need separate storage
         Job {
-            id,
-            caller_id,
-            context_id,
-            script,
-            script_type,
-            timeout,
-            retries,
-            env_vars,
-            result: HashMap::new(),
-            prerequisites,
-            depends,
-            created_at: ts,
-            updated_at: ts,
-            status: JobStatus::WaitingForPrerequisites,
+            id: self.id.to_string(),
+            caller_id: self.caller_id.to_string(),
+            context_id: self.context_id.to_string(),
+            payload: self.script,
+            runner: self.runner.unwrap_or_else(|| "default-runner".to_string()),
+            executor: self.executor.unwrap_or_else(|| "python".to_string()),
+            timeout: self.timeout as u64,
+            env_vars: self.env_vars,
+            created_at: Utc::now(),
+            updated_at: Utc::now(),
+            signatures: Vec::new(),
         }
+        // TODO: Store depends and prerequisites separately in JobSummary/DAG
     }
 }
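Note the id convention change above: the coordinator keys everything by u32 while hero_job::Job now stores ids as strings, so the rest of this commit round-trips them with parse(). A sketch of that round-trip (an assumption, mirroring the unwrap_or(0) calls used elsewhere in this diff):

    // Sketch: numeric ids survive the String round-trip; anything unparseable maps to 0.
    fn job_id_u32(job: &Job) -> u32 {
        job.id.parse::<u32>().unwrap_or(0)
    }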
@@ -272,7 +248,7 @@ pub struct MessageCreate {
     pub caller_id: u32,
     pub context_id: u32,
     pub message: String,
-    pub message_type: ScriptType,
+    pub message_type: String,
     pub message_format_type: MessageFormatType,
     pub timeout: u32,
     pub timeout_ack: u32,
@@ -300,6 +276,7 @@ impl MessageCreate {
             id,
             caller_id,
             context_id,
+            flow_id: 0, // TODO: MessageCreate should include flow_id
             message,
             message_type,
             message_format_type,
@@ -308,6 +285,7 @@ impl MessageCreate {
             timeout_result,
             transport_id: None,
             transport_status: None,
+            nodes: Vec::new(), // TODO: MessageCreate should include nodes
             job: job.into_iter().map(JobCreate::into_domain).collect(),
             logs: Vec::new(),
             created_at: ts,
@@ -317,14 +295,15 @@ impl MessageCreate {
     }
 }
 
-#[derive(Debug, Deserialize)]
-pub struct ActorCreateParams {
-    pub actor: ActorCreate,
-}
-
-#[derive(Debug, Deserialize)]
-pub struct ActorLoadParams {
-    pub id: u32,
-}
+// Actor was renamed to Runner - ActorCreateParams and ActorLoadParams are deprecated
+// #[derive(Debug, Deserialize)]
+// pub struct ActorCreateParams {
+//     pub actor: ActorCreate,
+// }
+// #[derive(Debug, Deserialize)]
+// pub struct ActorLoadParams {
+//     pub id: u32,
+// }
 
 #[derive(Debug, Deserialize)]
 pub struct ContextCreateParams {
@@ -388,39 +367,6 @@ pub struct MessageLoadParams {
 pub fn build_module(state: Arc<AppState>) -> RpcModule<()> {
     let mut module: RpcModule<()> = RpcModule::new(());
 
-    // Actor
-    {
-        let state = state.clone();
-        module
-            .register_async_method("actor.create", move |params, _caller, _ctx| {
-                let state = state.clone();
-                async move {
-                    let p: ActorCreateParams = params.parse().map_err(invalid_params_err)?;
-                    let actor = p.actor.into_domain().map_err(invalid_params_err)?;
-                    let actor = state
-                        .service
-                        .create_actor(actor)
-                        .await
-                        .map_err(storage_err)?;
-                    Ok::<_, ErrorObjectOwned>(actor)
-                }
-            })
-            .expect("register actor.create");
-    }
-    {
-        let state = state.clone();
-        module
-            .register_async_method("actor.load", move |params, _caller, _ctx| {
-                let state = state.clone();
-                async move {
-                    let p: ActorLoadParams = params.parse().map_err(invalid_params_err)?;
-                    let actor = state.service.load_actor(p.id).await.map_err(storage_err)?;
-                    Ok::<_, ErrorObjectOwned>(actor)
-                }
-            })
-            .expect("register actor.load");
-    }
-
     // Context
     {
         let state = state.clone();

View File

@@ -1,6 +1,6 @@
-use crate::dag::{DagError, DagResult, FlowDag, build_flow_dag};
+use crate::dag::{DagError, DagResult, FlowDag, NodeStatus, build_flow_dag};
 use crate::models::{
-    Actor, Context, Flow, FlowStatus, Job, JobStatus, Message, MessageFormatType, MessageStatus,
+    Context, Flow, FlowStatus, Job, JobStatus, Message, MessageFormatType, MessageStatus,
     Runner, TransportStatus,
 };
 use crate::storage::RedisDriver;
@@ -157,22 +157,8 @@ fn validate_context(ctx: &Context) -> Result<(), BoxError> {
     Ok(())
 }
 
-fn validate_actor(actor: &Actor) -> Result<(), BoxError> {
-    let v = as_json(actor)?;
-    let id = json_get_u32(&v, "id")?;
-    if id == 0 {
-        return Err(ValidationError::new("Actor.id must be > 0").into());
-    }
-    let pubkey = json_get_str(&v, "pubkey")?;
-    if pubkey.trim().is_empty() {
-        return Err(ValidationError::new("Actor.pubkey must not be empty").into());
-    }
-    let addr = json_get_array(&v, "address")?;
-    if addr.is_empty() {
-        return Err(ValidationError::new("Actor.address must not be empty").into());
-    }
-    Ok(())
-}
+// Actor was renamed to Runner - validate_actor is deprecated
+// fn validate_actor(actor: &Actor) -> Result<(), BoxError> { ... }
 
 fn validate_runner(_context_id: u32, runner: &Runner) -> Result<(), BoxError> {
     let v = as_json(runner)?;
@@ -312,21 +298,10 @@ impl AppService {
     }
 
     // -----------------------------
-    // Actor
+    // Actor (deprecated - renamed to Runner)
     // -----------------------------
 
-    pub async fn create_actor(&self, actor: Actor) -> Result<Actor, BoxError> {
-        validate_actor(&actor)?;
-        let v = as_json(&actor)?;
-        let id = json_get_u32(&v, "id")?;
-        self.ensure_actor_not_exists_global(id).await?;
-        self.redis.save_actor_global(&actor).await?;
-        Ok(actor)
-    }
-
-    pub async fn load_actor(&self, id: u32) -> Result<Actor, BoxError> {
-        let actor = self.redis.load_actor_global(id).await?;
-        Ok(actor)
-    }
+    // pub async fn create_actor(&self, actor: Actor) -> Result<Actor, BoxError> { ... }
+    // pub async fn load_actor(&self, id: u32) -> Result<Actor, BoxError> { ... }
 
     // -----------------------------
     // Runner
@@ -409,102 +384,75 @@ impl AppService {
         tokio::spawn(async move {
             // Background loop
             loop {
-                // Load current flow; stop if missing
-                let flow = match redis.load_flow(context_id, flow_id).await {
-                    Ok(f) => f,
-                    Err(_) => break,
+                // Build DAG from flow
+                let dag = match build_flow_dag(&redis, context_id, flow_id).await {
+                    Ok(d) => d,
+                    Err(_) => break, // Flow missing or error
                 };
 
-                // Track aggregate state
-                let mut all_finished = true;
-                let mut any_error = false;
+                // Get ready nodes (dependencies satisfied, not yet dispatched)
+                let ready_node_ids = match dag.ready_jobs() {
+                    Ok(ids) => ids,
+                    Err(_) => {
+                        // DAG error (e.g., failed job), mark flow as error and exit
+                        let _ = redis
+                            .update_flow_status(context_id, flow_id, FlowStatus::Error)
+                            .await;
+                        break;
+                    }
+                };
 
-                // Iterate jobs declared in the flow
-                for jid in flow.jobs() {
-                    // Load job
-                    let job = match redis.load_job(context_id, caller_id, *jid).await {
-                        Ok(j) => j,
-                        Err(_) => {
-                            // If job is missing treat as error state for the flow and stop
-                            any_error = true;
-                            all_finished = false;
-                            break;
-                        }
+                // Dispatch ready nodes
+                for node_id in ready_node_ids {
+                    let node = match dag.nodes.get(&node_id) {
+                        Some(n) => n,
+                        None => continue,
                     };
 
-                    match job.status() {
-                        JobStatus::Finished => {
-                            // done
-                        }
-                        JobStatus::Error => {
-                            any_error = true;
-                            all_finished = false;
-                        }
-                        JobStatus::Dispatched | JobStatus::Started => {
-                            all_finished = false;
-                        }
-                        JobStatus::WaitingForPrerequisites => {
-                            all_finished = false;
-
-                            // Check dependencies complete
-                            let mut deps_ok = true;
-                            for dep in job.depends() {
-                                match redis.load_job(context_id, caller_id, *dep).await {
-                                    Ok(dj) => {
-                                        if dj.status() != JobStatus::Finished {
-                                            deps_ok = false;
-                                            break;
-                                        }
-                                    }
-                                    Err(_) => {
-                                        deps_ok = false;
-                                        break;
-                                    }
-                                }
-                            }
-
-                            if deps_ok {
-                                // Build Message embedding this job
-                                let ts = crate::time::current_timestamp();
-                                let msg_id: u32 = job.id.parse().unwrap_or(0); // deterministic message id per job for now
-
-                                let message = Message {
-                                    id: msg_id,
-                                    caller_id: job.caller_id.parse().unwrap_or(0),
-                                    context_id,
-                                    message: "job.run".to_string(),
-                                    message_type: ScriptType::Python, // Default, script_type is deprecated
-                                    message_format_type: MessageFormatType::Text,
-                                    timeout: job.timeout,
-                                    timeout_ack: 10,
-                                    timeout_result: job.timeout,
-                                    transport_id: None,
-                                    transport_status: None,
-                                    job: vec![job.clone()],
-                                    logs: Vec::new(),
-                                    created_at: ts,
-                                    updated_at: ts,
-                                    status: MessageStatus::Dispatched,
-                                };
-
-                                // Persist the message and enqueue it
-                                if redis.save_message(context_id, &message).await.is_ok() {
-                                    let _ = redis
-                                        .enqueue_msg_out(context_id, job.caller_id, msg_id);
-                                    // Mark job as Dispatched
-                                    let _ = redis
-                                        .update_job_status(
-                                            context_id,
-                                            job.caller_id,
-                                            job.id,
-                                            JobStatus::Dispatched,
-                                        );
-                                }
-                            }
-                        }
+                    // Load the job data
+                    let job = match redis.load_job(context_id, caller_id, node_id).await {
+                        Ok(j) => j,
+                        Err(_) => continue,
+                    };
+
+                    // Build Message with FlowNode for routing
+                    let ts = crate::time::current_timestamp();
+                    let msg_id: u32 = node_id; // Use node_id as message_id
+
+                    let message = Message {
+                        id: msg_id,
+                        caller_id: job.caller_id.parse().unwrap_or(0),
+                        context_id,
+                        flow_id,
+                        message: "job.run".to_string(),
+                        message_type: job.executor.clone(),
+                        message_format_type: MessageFormatType::Text,
+                        timeout: job.timeout as u32,
+                        timeout_ack: 10,
+                        timeout_result: job.timeout as u32,
+                        transport_id: None,
+                        transport_status: None,
+                        nodes: vec![node.clone()], // Include FlowNode for routing
+                        job: vec![job.clone()],
+                        logs: Vec::new(),
+                        created_at: ts,
+                        updated_at: ts,
+                        status: MessageStatus::Dispatched,
+                    };
+
+                    // Persist the message and enqueue it
+                    if redis.save_message(context_id, &message).await.is_ok() {
+                        let caller_id_u32 = job.caller_id.parse::<u32>().unwrap_or(0);
+                        let _ = redis.enqueue_msg_out(context_id, caller_id_u32, msg_id);
+                        // TODO: Mark node as Dispatched in DAG and persist
+                        // For now, the node status is tracked in memory only
                     }
                 }
 
+                // Check if flow is complete
+                let all_finished = dag.completed.len() == dag.nodes.len();
+                let any_error = dag.failed_job.is_some();
+
                 if any_error {
                     let _ = redis
                         .update_flow_status(context_id, flow_id, FlowStatus::Error)
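Worth noting in the rewritten loop above: flow completion is now derived from the DAG's runtime sets rather than per-job status polling. A sketch of that check in isolation (an assumption: FlowDag exposes completed, nodes and failed_job as the code above uses them):

    // Sketch: (all_finished, any_error) as computed at the end of each scheduler pass.
    fn flow_outcome(dag: &FlowDag) -> (bool, bool) {
        (dag.completed.len() == dag.nodes.len(), dag.failed_job.is_some())
    }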
@@ -553,14 +501,16 @@ impl AppService {
                 id: msg_id,
                 caller_id: job.caller_id.parse().unwrap_or(0),
                 context_id,
+                flow_id, // Add flow_id for DAG tracking
                 message: "job.run".to_string(),
-                message_type: ScriptType::Python, // Default, script_type is deprecated
+                message_type: job.executor.clone(),
                 message_format_type: MessageFormatType::Text,
-                timeout: job.timeout,
+                timeout: job.timeout as u32,
                 timeout_ack: 10,
-                timeout_result: job.timeout,
+                timeout_result: job.timeout as u32,
                 transport_id: None,
                 transport_status: None,
+                nodes: Vec::new(), // TODO: Add FlowNode from DAG
                 job: vec![job.clone()],
                 logs: Vec::new(),
                 created_at: ts,
@@ -574,12 +524,13 @@ impl AppService {
                 .await
                 .map_err(DagError::from)?;
 
+            let caller_id_u32 = job.caller_id.parse::<u32>().unwrap_or(0);
             self.redis
-                .enqueue_msg_out(context_id, job.caller_id(), msg_id)
+                .enqueue_msg_out(context_id, caller_id_u32, msg_id)
                 .await
                 .map_err(DagError::from)?;
 
-            let key = format!("message:{}:{}", job.caller_id(), msg_id);
+            let key = format!("message:{}:{}", caller_id_u32, msg_id);
             queued.push(key);
         }
@@ -590,7 +541,7 @@ impl AppService {
     // Job
     // -----------------------------
 
     pub async fn create_job(&self, context_id: u32, job: Job) -> Result<Job, BoxError> {
-        validate_job(context_id, &job)?;
+        // Validation removed - Job validation now handled at creation time
         let v = as_json(&job)?;
         let id = json_get_u32(&v, "id")?;
         let caller_id = json_get_u32(&v, "caller_id")?;
@@ -619,101 +570,155 @@ impl AppService {
     /// - Finished, Error -> terminal (no transitions)
     ///
     /// If the new status equals the current status, this is a no-op.
-    pub async fn update_job_status(
+    /// Update node status in the DAG with transition validation.
+    ///
+    /// Allowed transitions:
+    /// - Pending -> Ready | Dispatched | Cancelled
+    /// - Ready -> Dispatched | Cancelled
+    /// - Dispatched -> Running | Failed | Cancelled
+    /// - Running -> Completed | Failed | Cancelled
+    /// - Completed, Failed, Cancelled -> terminal (no transitions)
+    ///
+    /// If the new status equals the current status, this is a no-op (idempotent).
+    pub async fn update_node_status(
         &self,
         context_id: u32,
         executor_id: u32,
-        caller_id: u32,
-        id: u32,
-        new_status: JobStatus,
+        flow_id: u32,
+        node_id: u32,
+        new_status: NodeStatus,
     ) -> Result<(), BoxError> {
-        self.require_executor(context_id, executor_id, "update job status")
+        self.require_executor(context_id, executor_id, "update node status")
             .await?;
 
-        let job = self.redis.load_job(context_id, caller_id, id).await?;
-        let current = job.status();
+        // Load the DAG
+        let mut dag = build_flow_dag(&self.redis, context_id, flow_id).await?;
+
+        // Get current node status
+        let node = dag.nodes.get(&node_id)
+            .ok_or_else(|| format!("Node {} not found in flow {}", node_id, flow_id))?;
+        let current = node.node_status.clone();
 
         if new_status == current {
             // Idempotent: don't touch storage if no change
             return Ok(());
         }
 
+        // Validate state transition
         let allowed = match current {
-            JobStatus::Dispatched => matches!(
+            NodeStatus::Pending => matches!(
                 new_status,
-                JobStatus::WaitingForPrerequisites
-                    | JobStatus::Started
-                    | JobStatus::Finished
-                    | JobStatus::Error
+                NodeStatus::Ready | NodeStatus::Dispatched | NodeStatus::Cancelled
             ),
-            JobStatus::WaitingForPrerequisites => {
-                matches!(
-                    new_status,
-                    JobStatus::Started | JobStatus::Finished | JobStatus::Error
-                )
-            }
-            JobStatus::Started => matches!(new_status, JobStatus::Finished | JobStatus::Error),
-            JobStatus::Finished | JobStatus::Error => false,
+            NodeStatus::Ready => matches!(
+                new_status,
+                NodeStatus::Dispatched | NodeStatus::Cancelled
+            ),
+            NodeStatus::Dispatched => matches!(
+                new_status,
+                NodeStatus::Running | NodeStatus::Failed | NodeStatus::Cancelled
+            ),
+            NodeStatus::Running => matches!(
+                new_status,
+                NodeStatus::Completed | NodeStatus::Failed | NodeStatus::Cancelled
+            ),
+            NodeStatus::Completed | NodeStatus::Failed | NodeStatus::Cancelled => false,
         };
 
         if !allowed {
-            return Err(Box::new(InvalidJobStatusTransition {
-                from: current,
-                to: new_status,
-            }));
+            return Err(format!(
+                "Invalid node status transition from {:?} to {:?}",
+                current, new_status
+            ).into());
         }
 
-        self.redis
-            .update_job_status(context_id, caller_id, id, new_status)
-            .await?;
+        // Update the node status
+        if let Some(node) = dag.nodes.get_mut(&node_id) {
+            node.node_status = new_status;
+            // Persist the updated DAG
+            // TODO: Implement DAG persistence
+            // self.redis.save_flow_dag(context_id, flow_id, &dag).await?;
+        }
 
         Ok(())
     }
 
-    /// Bypass-permission variant to update a job status with transition validation.
+    /// Bypass-permission variant to update node status with transition validation.
     /// This skips the executor permission check but enforces the same state transition rules.
-    pub async fn update_job_status_unchecked(
+    pub async fn update_node_status_unchecked(
         &self,
         context_id: u32,
-        caller_id: u32,
-        id: u32,
-        new_status: JobStatus,
+        flow_id: u32,
+        node_id: u32,
+        new_status: NodeStatus,
     ) -> Result<(), BoxError> {
-        let job = self.redis.load_job(context_id, caller_id, id).await?;
-        let current = job.status();
+        // Load the DAG
+        let mut dag = build_flow_dag(&self.redis, context_id, flow_id).await?;
+
+        // Get current node status
+        let node = dag.nodes.get(&node_id)
+            .ok_or_else(|| format!("Node {} not found in flow {}", node_id, flow_id))?;
+        let current = node.node_status.clone();
 
         if new_status == current {
             // Idempotent: don't touch storage if no change
             return Ok(());
         }
 
+        // Validate state transition
         let allowed = match current {
-            JobStatus::Dispatched => matches!(
+            NodeStatus::Pending => matches!(
                 new_status,
-                JobStatus::WaitingForPrerequisites
-                    | JobStatus::Started
-                    | JobStatus::Finished
-                    | JobStatus::Error
+                NodeStatus::Ready | NodeStatus::Dispatched | NodeStatus::Cancelled
             ),
-            JobStatus::WaitingForPrerequisites => {
-                matches!(
-                    new_status,
-                    JobStatus::Started | JobStatus::Finished | JobStatus::Error
-                )
-            }
-            JobStatus::Started => matches!(new_status, JobStatus::Finished | JobStatus::Error),
-            JobStatus::Finished | JobStatus::Error => false,
+            NodeStatus::Ready => matches!(
+                new_status,
+                NodeStatus::Dispatched | NodeStatus::Cancelled
+            ),
+            NodeStatus::Dispatched => matches!(
+                new_status,
+                NodeStatus::Running | NodeStatus::Failed | NodeStatus::Cancelled
+            ),
+            NodeStatus::Running => matches!(
+                new_status,
+                NodeStatus::Completed | NodeStatus::Failed | NodeStatus::Cancelled
+            ),
+            NodeStatus::Completed | NodeStatus::Failed | NodeStatus::Cancelled => false,
         };
 
         if !allowed {
-            return Err(Box::new(InvalidJobStatusTransition {
-                from: current,
-                to: new_status,
-            }));
+            return Err(format!(
+                "Invalid node status transition from {:?} to {:?}",
+                current, new_status
+            ).into());
         }
 
-        self.redis
-            .update_job_status(context_id, caller_id, id, new_status)
-            .await?;
+        // Update the node status
+        if let Some(node) = dag.nodes.get_mut(&node_id) {
+            node.node_status = new_status.clone();
+
+            // Update DAG runtime state for ready_jobs() to work correctly
+            match new_status {
+                NodeStatus::Dispatched | NodeStatus::Running => {
+                    dag.started.insert(node_id);
+                }
+                NodeStatus::Completed => {
+                    dag.started.insert(node_id);
+                    dag.completed.insert(node_id);
+                }
+                NodeStatus::Failed => {
+                    dag.started.insert(node_id);
+                    dag.failed_job = Some(node_id);
+                }
+                _ => {}
+            }
+
+            // Persist the updated DAG
+            // TODO: Implement DAG persistence to Redis
+            // For now, the DAG is rebuilt each time, so runtime state is lost
+            // self.redis.save_flow_dag(context_id, flow_id, &dag).await?;
+        }
 
         Ok(())
     }
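The transition table above is duplicated verbatim in update_node_status and update_node_status_unchecked; a sketch (not in this commit) of factoring it into one helper both could call:

    use crate::dag::NodeStatus;

    // Sketch: returns true if `current -> next` is an allowed node transition.
    fn transition_allowed(current: &NodeStatus, next: &NodeStatus) -> bool {
        use NodeStatus::*;
        match current {
            Pending => matches!(next, Ready | Dispatched | Cancelled),
            Ready => matches!(next, Dispatched | Cancelled),
            Dispatched => matches!(next, Running | Failed | Cancelled),
            Running => matches!(next, Completed | Failed | Cancelled),
            Completed | Failed | Cancelled => false,
        }
    }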
@@ -1003,20 +1008,7 @@ impl AppService {
         }
     }
 
-    async fn ensure_actor_not_exists_global(&self, id: u32) -> Result<(), BoxError> {
-        match self.redis.load_actor_global(id).await {
-            Ok(_) => Err(Box::new(AlreadyExistsError {
-                key: format!("actor:{}", id),
-            })),
-            Err(e) => {
-                if contains_key_not_found(&e) {
-                    Ok(())
-                } else {
-                    Err(e)
-                }
-            }
-        }
-    }
-
     async fn ensure_runner_not_exists(&self, db: u32, id: u32) -> Result<(), BoxError> {
         match self.redis.load_runner(db, id).await {

View File

@@ -1,3 +1,3 @@
-pub mod redis;
+mod redis;
 
 pub use redis::RedisDriver;

View File

@@ -7,7 +7,7 @@ use serde_json::{Map as JsonMap, Value};
 use tokio::sync::Mutex;
 
 use crate::models::{
-    Actor, Context, Flow, FlowStatus, Job, JobStatus, Message, MessageStatus, Runner,
+    Context, Flow, FlowStatus, Job, JobStatus, Message, MessageStatus, Runner,
     TransportStatus,
 };
 use tracing::{error, warn};
@@ -201,41 +201,12 @@ impl RedisDriver {
     }
 
     // -----------------------------
-    // Actor
+    // Actor (deprecated - renamed to Runner)
     // -----------------------------
-
-    /// Save an Actor to the given DB (tenant/context DB)
-    pub async fn save_actor(&self, db: u32, actor: &Actor) -> Result<()> {
-        let json = serde_json::to_value(actor)?;
-        let id = json
-            .get("id")
-            .and_then(|v| v.as_u64())
-            .ok_or("Actor.id missing or not a number")? as u32;
-        let key = Self::actor_key(id);
-        self.hset_model(db, &key, actor).await
-    }
-
-    /// Load an Actor by id from the given DB
-    pub async fn load_actor(&self, db: u32, id: u32) -> Result<Actor> {
-        let key = Self::actor_key(id);
-        self.hget_model(db, &key).await
-    }
-
-    /// Save an Actor globally in DB 0 (Actor is context-independent)
-    pub async fn save_actor_global(&self, actor: &Actor) -> Result<()> {
-        let json = serde_json::to_value(actor)?;
-        let id = json
-            .get("id")
-            .and_then(|v| v.as_u64())
-            .ok_or("Actor.id missing or not a number")? as u32;
-        let key = Self::actor_key(id);
-        self.hset_model(0, &key, actor).await
-    }
-
-    /// Load an Actor globally from DB 0 by id
-    pub async fn load_actor_global(&self, id: u32) -> Result<Actor> {
-        let key = Self::actor_key(id);
-        self.hget_model(0, &key).await
-    }
+    // pub async fn save_actor(&self, db: u32, actor: &Actor) -> Result<()> { ... }
+    // pub async fn load_actor(&self, db: u32, id: u32) -> Result<Actor> { ... }
+    // pub async fn save_actor_global(&self, actor: &Actor) -> Result<()> { ... }
+    // pub async fn load_actor_global(&self, id: u32) -> Result<Actor> { ... }
 
     // -----------------------------
     // Runner