597 lines
22 KiB
Rust
597 lines
22 KiB
Rust
use log::{debug, error, info, warn};
|
|
use redis::AsyncCommands;
|
|
use std::collections::HashMap;
|
|
use std::time::Duration;
|
|
use hero_job::NAMESPACE_PREFIX;
|
|
use zinit_client::ZinitClient;
|
|
|
|
mod job;
|
|
mod error;
|
|
mod lifecycle;
|
|
|
|
pub use crate::error::SupervisorError;
|
|
pub use crate::job::JobBuilder;
|
|
pub use crate::lifecycle::WorkerConfig;
|
|
// Re-export types from hero_job for public API
|
|
pub use hero_job::{Job, JobStatus, ScriptType};
|
|
|
|
pub struct Supervisor {
|
|
redis_client: redis::Client,
|
|
zinit_client: ZinitClient,
|
|
builder_data: Option<SupervisorBuilderData>,
|
|
}
|
|
|
|
/// Collects the settings needed to construct a `Supervisor`.
pub struct SupervisorBuilder {
    /// Redis connection URL; `build` falls back to `redis://127.0.0.1/` when
    /// unset.
    redis_url: Option<String>,
    /// Path of the zinit socket; `build` falls back to `/var/run/zinit.sock`
    /// when unset.
    zinit_socket_path: Option<String>,
    /// Binary path of the OSIS worker, if one is configured.
    osis_worker: Option<String>,
    /// Binary path of the SAL worker, if one is configured.
    sal_worker: Option<String>,
    /// Binary path of the V worker, if one is configured.
    v_worker: Option<String>,
    /// Binary path of the Python worker, if one is configured.
    python_worker: Option<String>,
    /// Environment variables recorded for the workers.
    worker_env_vars: HashMap<String, String>,
}
|
|
|
|
/// Worker-related subset of the builder settings, kept on the `Supervisor`
/// so it can be handed to the worker launch method later.
struct SupervisorBuilderData {
    /// Binary path of the OSIS worker, if one is configured.
    osis_worker: Option<String>,
    /// Binary path of the SAL worker, if one is configured.
    sal_worker: Option<String>,
    /// Binary path of the V worker, if one is configured.
    v_worker: Option<String>,
    /// Binary path of the Python worker, if one is configured.
    python_worker: Option<String>,
    /// Environment variables recorded for the workers.
    worker_env_vars: HashMap<String, String>,
}
|
|
|
|
impl SupervisorBuilder {
|
|
pub fn new() -> Self {
|
|
Self {
|
|
redis_url: None,
|
|
zinit_socket_path: Some("/var/run/zinit.sock".to_string()),
|
|
osis_worker: None,
|
|
sal_worker: None,
|
|
v_worker: None,
|
|
python_worker: None,
|
|
worker_env_vars: HashMap::new(),
|
|
}
|
|
}
|
|
|
|
pub fn redis_url(mut self, url: &str) -> Self {
|
|
self.redis_url = Some(url.to_string());
|
|
self
|
|
}
|
|
|
|
pub fn zinit_socket_path(mut self, path: &str) -> Self {
|
|
self.zinit_socket_path = Some(path.to_string());
|
|
self
|
|
}
|
|
|
|
pub fn osis_worker(mut self, binary_path: &str) -> Self {
|
|
self.osis_worker = Some(binary_path.to_string());
|
|
self
|
|
}
|
|
|
|
pub fn sal_worker(mut self, binary_path: &str) -> Self {
|
|
self.sal_worker = Some(binary_path.to_string());
|
|
self
|
|
}
|
|
|
|
pub fn v_worker(mut self, binary_path: &str) -> Self {
|
|
self.v_worker = Some(binary_path.to_string());
|
|
self
|
|
}
|
|
|
|
pub fn python_worker(mut self, binary_path: &str) -> Self {
|
|
self.python_worker = Some(binary_path.to_string());
|
|
self
|
|
}
|
|
|
|
pub fn worker_env_var(mut self, key: &str, value: &str) -> Self {
|
|
self.worker_env_vars.insert(key.to_string(), value.to_string());
|
|
self
|
|
}
|
|
|
|
pub fn worker_env_vars(mut self, env_vars: HashMap<String, String>) -> Self {
|
|
self.worker_env_vars.extend(env_vars);
|
|
self
|
|
}
|
|
|
|
/// Builds the final `Supervisor` instance synchronously.
|
|
///
|
|
/// This method validates the configuration and creates the Redis client.
|
|
/// Worker launching is deferred to the `start_workers()` method.
|
|
///
|
|
/// # Returns
|
|
///
|
|
/// * `Ok(Supervisor)` - Successfully configured client
|
|
/// * `Err(SupervisorError)` - Configuration or connection error
|
|
pub fn build(self) -> Result<Supervisor, SupervisorError> {
|
|
let url = self.redis_url
|
|
.unwrap_or_else(|| "redis://127.0.0.1/".to_string());
|
|
let client = redis::Client::open(url)?;
|
|
|
|
let zinit_socket = self.zinit_socket_path
|
|
.unwrap_or_else(|| "/var/run/zinit.sock".to_string());
|
|
let zinit_client = ZinitClient::new(&zinit_socket);
|
|
|
|
// Store builder data for later use in start_workers()
|
|
let builder_data = SupervisorBuilderData {
|
|
osis_worker: self.osis_worker,
|
|
sal_worker: self.sal_worker,
|
|
v_worker: self.v_worker,
|
|
python_worker: self.python_worker,
|
|
worker_env_vars: self.worker_env_vars,
|
|
};
|
|
|
|
let supervisor = Supervisor {
|
|
redis_client: client,
|
|
zinit_client,
|
|
builder_data: Some(builder_data),
|
|
};
|
|
|
|
Ok(supervisor)
|
|
}
|
|
}
|
|
|
|
impl Supervisor {
|
|
/// Start all configured workers asynchronously.
|
|
/// This method should be called after build() to launch the workers.
|
|
pub async fn start_workers(&self) -> Result<(), SupervisorError> {
|
|
// Clean up any existing worker services first
|
|
self.cleanup_existing_workers().await?;
|
|
|
|
// Launch configured workers if builder data is available
|
|
if let Some(builder_data) = &self.builder_data {
|
|
self.launch_configured_workers(builder_data).await?;
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Clean up all worker services from zinit on program exit
|
|
pub async fn cleanup_and_shutdown(&self) -> Result<(), SupervisorError> {
|
|
info!("Cleaning up worker services before shutdown...");
|
|
|
|
let worker_names = vec![
|
|
"osis_worker_1",
|
|
"sal_worker_1",
|
|
"v_worker_1",
|
|
"python_worker_1"
|
|
];
|
|
|
|
for worker_name in worker_names {
|
|
if let Err(e) = self.stop_and_delete_worker(worker_name).await {
|
|
warn!("Failed to cleanup worker {}: {}", worker_name, e);
|
|
}
|
|
}
|
|
|
|
info!("Worker cleanup completed");
|
|
Ok(())
|
|
}
|
|
|
|
/// Clean up any existing worker services on startup
|
|
async fn cleanup_existing_workers(&self) -> Result<(), SupervisorError> {
|
|
info!("Cleaning up any existing worker services...");
|
|
|
|
let worker_names = vec![
|
|
"osis_worker_1",
|
|
"sal_worker_1",
|
|
"v_worker_1",
|
|
"python_worker_1"
|
|
];
|
|
|
|
for worker_name in worker_names {
|
|
// Try to stop and delete, but don't fail if they don't exist
|
|
let _ = self.stop_and_delete_worker(worker_name).await;
|
|
}
|
|
|
|
info!("Existing worker cleanup completed");
|
|
Ok(())
|
|
}
|
|
|
|
/// Stop and delete a worker service from zinit
|
|
async fn stop_and_delete_worker(&self, worker_name: &str) -> Result<(), SupervisorError> {
|
|
// First try to stop the worker
|
|
if let Err(e) = self.zinit_client.stop(worker_name).await {
|
|
debug!("Worker {} was not running or failed to stop: {}", worker_name, e);
|
|
}
|
|
|
|
// Then try to delete the service
|
|
if let Err(e) = self.zinit_client.delete(worker_name).await {
|
|
debug!("Worker {} service did not exist or failed to delete: {}", worker_name, e);
|
|
} else {
|
|
info!("Successfully deleted worker service: {}", worker_name);
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Get the hardcoded worker queue key for the script type
|
|
fn get_worker_queue_key(&self, script_type: &ScriptType) -> String {
|
|
format!("{}worker_queue:{}", NAMESPACE_PREFIX, script_type.worker_queue_suffix())
|
|
}
|
|
|
|
pub fn new_job(&self) -> JobBuilder {
|
|
JobBuilder::new(self)
|
|
}
|
|
|
|
// Internal helper to submit script details and push to work queue
|
|
async fn create_job_using_connection(
|
|
&self,
|
|
conn: &mut redis::aio::MultiplexedConnection,
|
|
job: &Job,
|
|
) -> Result<(), SupervisorError> {
|
|
debug!(
|
|
"Submitting play request: {} for script type: {:?} with namespace prefix: {}",
|
|
job.id, job.script_type, NAMESPACE_PREFIX
|
|
);
|
|
|
|
// Use the shared Job struct's Redis storage method
|
|
job.store_in_redis(conn).await
|
|
.map_err(|e| SupervisorError::InvalidInput(format!("Failed to store job in Redis: {}", e)))?;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
// Internal helper to submit script details and push to work queue
|
|
async fn start_job_using_connection(
|
|
&self,
|
|
conn: &mut redis::aio::MultiplexedConnection,
|
|
job_id: String,
|
|
script_type: &ScriptType
|
|
) -> Result<(), SupervisorError> {
|
|
let worker_queue_key = self.get_worker_queue_key(script_type);
|
|
|
|
// lpush also infers its types, RV is typically i64 (length of list) or () depending on exact command variant
|
|
// For `redis::AsyncCommands::lpush`, it's `RedisResult<R>` where R: FromRedisValue
|
|
// Often this is the length of the list. Let's allow inference or specify if needed.
|
|
let _: redis::RedisResult<i64> =
|
|
conn.lpush(&worker_queue_key, job_id.clone()).await;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
// Internal helper to await response from worker
|
|
async fn await_response_from_connection(
|
|
&self,
|
|
conn: &mut redis::aio::MultiplexedConnection,
|
|
job_key: &String,
|
|
reply_queue_key: &String,
|
|
timeout: Duration,
|
|
) -> Result<String, SupervisorError> {
|
|
// BLPOP on the reply queue
|
|
// The timeout for BLPOP is in seconds (integer)
|
|
let blpop_timeout_secs = timeout.as_secs().max(1); // Ensure at least 1 second for BLPOP timeout
|
|
|
|
match conn
|
|
.blpop::<&String, Option<(String, String)>>(reply_queue_key, blpop_timeout_secs as f64)
|
|
.await
|
|
{
|
|
Ok(Some((_queue, result_message_str))) => {
|
|
Ok(result_message_str)
|
|
}
|
|
Ok(None) => {
|
|
// BLPOP timed out
|
|
warn!(
|
|
"Timeout waiting for result on reply queue {} for job {}",
|
|
reply_queue_key, job_key
|
|
);
|
|
// Optionally, delete the reply queue
|
|
let _: redis::RedisResult<i32> = conn.del(&reply_queue_key).await;
|
|
Err(SupervisorError::Timeout(job_key.clone()))
|
|
}
|
|
Err(e) => {
|
|
// Redis error
|
|
error!(
|
|
"Redis error on BLPOP for reply queue {}: {}",
|
|
reply_queue_key, e
|
|
);
|
|
// Optionally, delete the reply queue
|
|
let _: redis::RedisResult<i32> = conn.del(&reply_queue_key).await;
|
|
Err(SupervisorError::RedisError(e))
|
|
}
|
|
}
|
|
}
|
|
|
|
// New method using dedicated reply queue
|
|
pub async fn create_job(
|
|
&self,
|
|
job: &Job,
|
|
) -> Result<(), SupervisorError> {
|
|
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
|
|
|
|
self.create_job_using_connection(
|
|
&mut conn,
|
|
&job, // Pass the job_id parameter
|
|
)
|
|
.await?;
|
|
Ok(())
|
|
}
|
|
|
|
// Method to start a previously created job
|
|
pub async fn start_job(
|
|
&self,
|
|
job_id: &str,
|
|
) -> Result<(), SupervisorError> {
|
|
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
|
|
|
|
// Load the job to get its script type
|
|
let job = Job::load_from_redis(&mut conn, job_id).await?;
|
|
|
|
self.start_job_using_connection(&mut conn, job_id.to_string(), &job.script_type).await?;
|
|
Ok(())
|
|
}
|
|
|
|
// New method using dedicated reply queue with automatic worker selection
|
|
pub async fn run_job_and_await_result(
|
|
&self,
|
|
job: &Job
|
|
) -> Result<String, SupervisorError> {
|
|
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
|
|
|
|
let reply_queue_key = format!("{}:reply:{}", NAMESPACE_PREFIX, job.id); // Derived from the passed job_id
|
|
|
|
self.create_job_using_connection(
|
|
&mut conn,
|
|
&job, // Pass the job_id parameter
|
|
)
|
|
.await?;
|
|
|
|
self.start_job_using_connection(&mut conn, job.id.clone(), &job.script_type).await?;
|
|
|
|
info!(
|
|
"Task {} submitted. Waiting for result on queue {} with timeout {:?}...",
|
|
job.id, // This is the UUID
|
|
reply_queue_key,
|
|
job.timeout
|
|
);
|
|
|
|
self.await_response_from_connection(
|
|
&mut conn,
|
|
&job.id,
|
|
&reply_queue_key,
|
|
job.timeout,
|
|
)
|
|
.await
|
|
}
|
|
|
|
// Method to get job status
|
|
pub async fn get_job_status(
|
|
&self,
|
|
job_id: &str,
|
|
) -> Result<JobStatus, SupervisorError> {
|
|
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
|
|
let job_key = format!("{}{}", NAMESPACE_PREFIX, job_id);
|
|
|
|
let result_map: Option<std::collections::HashMap<String, String>> =
|
|
conn.hgetall(&job_key).await?;
|
|
|
|
match result_map {
|
|
Some(map) => {
|
|
let status_str = map.get("status").cloned().unwrap_or_else(|| {
|
|
warn!("Task {}: 'status' field missing from Redis hash, defaulting to empty.", job_id);
|
|
String::new()
|
|
});
|
|
|
|
let status = match status_str.as_str() {
|
|
"dispatched" => JobStatus::Dispatched,
|
|
"started" => JobStatus::Started,
|
|
"error" => JobStatus::Error,
|
|
"finished" => JobStatus::Finished,
|
|
_ => JobStatus::Dispatched, // default
|
|
};
|
|
|
|
Ok(status)
|
|
}
|
|
None => {
|
|
warn!("Job {} not found in Redis", job_id);
|
|
Ok(JobStatus::Dispatched) // default for missing jobs
|
|
}
|
|
}
|
|
}
|
|
|
|
// Method to get job output
|
|
pub async fn get_job_output(
|
|
&self,
|
|
job_id: &str,
|
|
) -> Result<Option<String>, SupervisorError> {
|
|
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
|
|
let job_key = format!("{}{}", NAMESPACE_PREFIX, job_id);
|
|
|
|
let result_map: Option<std::collections::HashMap<String, String>> =
|
|
conn.hgetall(&job_key).await?;
|
|
|
|
match result_map {
|
|
Some(map) => {
|
|
Ok(map.get("output").cloned())
|
|
}
|
|
None => {
|
|
warn!("Job {} not found in Redis", job_id);
|
|
Ok(None)
|
|
}
|
|
}
|
|
}
|
|
|
|
/// List all jobs in Redis
|
|
pub async fn list_jobs(&self) -> Result<Vec<String>, SupervisorError> {
|
|
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
|
|
|
|
// Use the shared Job struct's list method
|
|
Job::list_all_job_ids(&mut conn).await
|
|
.map_err(|e| SupervisorError::InvalidInput(format!("Failed to list jobs: {}", e)))
|
|
}
|
|
|
|
/// Stop a job by pushing its ID to the stop queue
|
|
pub async fn stop_job(&self, job_id: &str) -> Result<(), SupervisorError> {
|
|
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
|
|
|
|
// Get job details to determine script type and appropriate worker
|
|
let job_key = format!("{}job:{}", NAMESPACE_PREFIX, job_id);
|
|
let job_data: std::collections::HashMap<String, String> = conn.hgetall(&job_key).await?;
|
|
|
|
if job_data.is_empty() {
|
|
return Err(SupervisorError::InvalidInput(format!("Job {} not found", job_id)));
|
|
}
|
|
|
|
// Parse script type from job data
|
|
let script_type_str = job_data.get("script_type")
|
|
.ok_or_else(|| SupervisorError::InvalidInput("Job missing script_type field".to_string()))?;
|
|
|
|
let script_type: ScriptType = serde_json::from_str(&format!("\"{}\"", script_type_str))
|
|
.map_err(|e| SupervisorError::InvalidInput(format!("Invalid script type: {}", e)))?;
|
|
|
|
// Use hardcoded stop queue key for this script type
|
|
let stop_queue_key = format!("{}stop_queue:{}", NAMESPACE_PREFIX, script_type.worker_queue_suffix());
|
|
|
|
// Push job ID to the stop queue
|
|
conn.lpush::<_, _, ()>(&stop_queue_key, job_id).await?;
|
|
|
|
info!("Job {} added to stop queue {} for script type {:?}", job_id, stop_queue_key, script_type);
|
|
Ok(())
|
|
}
|
|
|
|
/// Get logs for a job by reading from its log file
|
|
pub async fn get_job_logs(&self, job_id: &str) -> Result<Option<String>, SupervisorError> {
|
|
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
|
|
let job_key = format!("{}job:{}", NAMESPACE_PREFIX, job_id);
|
|
|
|
// Get the job data to find the log path
|
|
let result_map: Option<std::collections::HashMap<String, String>> =
|
|
conn.hgetall(&job_key).await?;
|
|
|
|
match result_map {
|
|
Some(map) => {
|
|
if let Some(log_path) = map.get("log_path") {
|
|
// Try to read the log file
|
|
match std::fs::read_to_string(log_path) {
|
|
Ok(contents) => Ok(Some(contents)),
|
|
Err(e) => {
|
|
warn!("Failed to read log file {}: {}", log_path, e);
|
|
Ok(None)
|
|
}
|
|
}
|
|
} else {
|
|
// No log path configured for this job
|
|
Ok(None)
|
|
}
|
|
}
|
|
None => {
|
|
warn!("Job {} not found in Redis", job_id);
|
|
Ok(None)
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Delete a specific job by ID
|
|
pub async fn delete_job(&self, job_id: &str) -> Result<(), SupervisorError> {
|
|
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
|
|
|
|
// Use the shared Job struct's delete method
|
|
Job::delete_from_redis(&mut conn, job_id).await
|
|
.map_err(|e| SupervisorError::InvalidInput(format!("Failed to delete job: {}", e)))?;
|
|
|
|
info!("Job {} deleted successfully", job_id);
|
|
Ok(())
|
|
}
|
|
|
|
/// Clear all jobs from Redis
|
|
pub async fn clear_all_jobs(&self) -> Result<usize, SupervisorError> {
|
|
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
|
|
|
|
// Get all job IDs first
|
|
let job_ids = Job::list_all_job_ids(&mut conn).await
|
|
.map_err(|e| SupervisorError::InvalidInput(format!("Failed to list jobs: {}", e)))?;
|
|
|
|
let count = job_ids.len();
|
|
|
|
// Delete each job using the shared method
|
|
for job_id in job_ids {
|
|
Job::delete_from_redis(&mut conn, &job_id).await
|
|
.map_err(|e| SupervisorError::InvalidInput(format!("Failed to delete job {}: {}", job_id, e)))?;
|
|
}
|
|
|
|
Ok(count)
|
|
}
|
|
|
|
/// Check if all prerequisites for a job are completed
|
|
pub async fn check_prerequisites_completed(&self, job_id: &str) -> Result<bool, SupervisorError> {
|
|
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
|
|
|
|
// Load the job using the shared Job struct
|
|
let job = Job::load_from_redis(&mut conn, job_id).await
|
|
.map_err(|e| SupervisorError::InvalidInput(format!("Failed to load job: {}", e)))?;
|
|
|
|
// Check each prerequisite job status
|
|
for prereq_id in &job.prerequisites {
|
|
let status = Job::get_status(&mut conn, prereq_id).await
|
|
.map_err(|e| SupervisorError::InvalidInput(format!("Failed to get prerequisite status: {}", e)))?;
|
|
|
|
if status != JobStatus::Finished {
|
|
return Ok(false); // Prerequisite not completed
|
|
}
|
|
}
|
|
|
|
Ok(true) // All prerequisites completed (or no prerequisites)
|
|
}
|
|
|
|
/// Update job status and check dependent jobs for readiness
|
|
pub async fn update_job_status_and_check_dependents(&self, job_id: &str, new_status: JobStatus) -> Result<Vec<String>, SupervisorError> {
|
|
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
|
|
|
|
// Update job status using shared Job method
|
|
Job::update_status(&mut conn, job_id, new_status.clone()).await
|
|
.map_err(|e| SupervisorError::InvalidInput(format!("Failed to update job status: {}", e)))?;
|
|
|
|
let mut ready_jobs = Vec::new();
|
|
|
|
// If job finished, check dependent jobs
|
|
if new_status == JobStatus::Finished {
|
|
// Load the job to get its dependents
|
|
let job = Job::load_from_redis(&mut conn, job_id).await
|
|
.map_err(|e| SupervisorError::InvalidInput(format!("Failed to load job: {}", e)))?;
|
|
|
|
// Check each dependent job
|
|
for dependent_id in &job.dependents {
|
|
let dependent_status = Job::get_status(&mut conn, dependent_id).await
|
|
.map_err(|e| SupervisorError::InvalidInput(format!("Failed to get dependent status: {}", e)))?;
|
|
|
|
// Only check jobs that are waiting for prerequisites
|
|
if dependent_status == JobStatus::WaitingForPrerequisites {
|
|
// Check if all prerequisites are now completed
|
|
if self.check_prerequisites_completed(dependent_id).await? {
|
|
// Update status to dispatched and add to ready jobs
|
|
Job::update_status(&mut conn, dependent_id, JobStatus::Dispatched).await
|
|
.map_err(|e| SupervisorError::InvalidInput(format!("Failed to update dependent status: {}", e)))?;
|
|
ready_jobs.push(dependent_id.clone());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(ready_jobs)
|
|
}
|
|
|
|
/// Dispatch jobs that are ready (have all prerequisites completed)
|
|
pub async fn dispatch_ready_jobs(&self, ready_job_ids: Vec<String>) -> Result<(), SupervisorError> {
|
|
for job_id in ready_job_ids {
|
|
// Get job data to determine script type and select worker
|
|
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
|
|
let job_key = format!("{}job:{}", NAMESPACE_PREFIX, job_id);
|
|
let job_data: std::collections::HashMap<String, String> = conn.hgetall(&job_key).await?;
|
|
|
|
if let Some(script_type_str) = job_data.get("script_type") {
|
|
// Parse script type (stored as Debug format, e.g., "OSIS")
|
|
let script_type = match script_type_str.as_str() {
|
|
"OSIS" => ScriptType::OSIS,
|
|
"SAL" => ScriptType::SAL,
|
|
"V" => ScriptType::V,
|
|
"Python" => ScriptType::Python,
|
|
_ => return Err(SupervisorError::InvalidInput(format!("Unknown script type: {}", script_type_str))),
|
|
};
|
|
|
|
// Dispatch job using hardcoded queue
|
|
self.start_job_using_connection(&mut conn, job_id, &script_type).await?;
|
|
}
|
|
}
|
|
Ok(())
|
|
}
|
|
}
|
|
|