rename worker to actor
@@ -1,20 +1,20 @@
# Actor Lifecycle Management

The Hero Supervisor includes comprehensive actor lifecycle management functionality using [Zinit](https://github.com/threefoldtech/zinit) as the process manager. This enables the supervisor to manage actor processes, perform health monitoring, and implement load balancing.

## Overview

The lifecycle management system provides:

- **Actor Process Management**: Start, stop, restart, and monitor actor binaries
- **Health Monitoring**: Automatic ping jobs every 10 minutes for idle actors
- **Graceful Shutdown**: Clean termination of actor processes

## Architecture

```
┌─────────────────┐    ┌──────────────────┐    ┌─────────────────┐
│   Supervisor    │    │  ActorLifecycle  │    │      Zinit      │
│                 │◄──►│     Manager      │◄──►│    (Process     │
│  (Job Dispatch) │    │                  │    │     Manager)    │
└─────────────────┘    └──────────────────┘    └─────────────────┘
@@ -22,49 +22,49 @@ The lifecycle management system provides:
        │                       │                       │
        ▼                       ▼                       ▼
┌─────────────────┐    ┌──────────────────┐    ┌─────────────────┐
│      Redis      │    │  Health Monitor  │    │ Actor Binaries  │
│   (Job Queue)   │    │   (Ping Jobs)    │    │  (OSIS/SAL/V)   │
└─────────────────┘    └──────────────────┘    └─────────────────┘
```
## Components

### ActorConfig

Defines configuration for an actor binary:

```rust
use hero_supervisor::{ActorConfig, ScriptType};
use std::path::PathBuf;
use std::collections::HashMap;

let config = ActorConfig::new(
    "osis_actor_0".to_string(),
    PathBuf::from("/usr/local/bin/osis_actor"),
    ScriptType::OSIS,
)
.with_args(vec![
    "--redis-url".to_string(),
    "redis://localhost:6379".to_string(),
    "--actor-id".to_string(),
    "osis_actor_0".to_string(),
])
.with_env({
    let mut env = HashMap::new();
    env.insert("RUST_LOG".to_string(), "info".to_string());
    env.insert("ACTOR_TYPE".to_string(), "osis".to_string());
    env
})
.with_health_check("/usr/local/bin/osis_actor --health-check".to_string())
.with_dependencies(vec!["redis".to_string()]);
```

### ActorLifecycleManager

Main component for managing actor lifecycles:

```rust
use hero_supervisor::{ActorLifecycleManagerBuilder, Supervisor};

let supervisor = SupervisorBuilder::new()
    .redis_url("redis://localhost:6379")
@@ -72,11 +72,11 @@ let supervisor = SupervisorBuilder::new()
    .context_id("production")
    .build()?;

let mut lifecycle_manager = ActorLifecycleManagerBuilder::new("/var/run/zinit.sock".to_string())
    .with_supervisor(supervisor.clone())
    .add_actor(osis_actor_config)
    .add_actor(sal_actor_config)
    .add_actor(v_actor_config)
    .build();
```
@@ -84,45 +84,45 @@ let mut lifecycle_manager = ActorLifecycleManagerBuilder::new("/var/run/zinit.s

The lifecycle manager supports all Hero script types:

- **OSIS**: Rhai/HeroScript execution actors
- **SAL**: System Abstraction Layer actors
- **V**: HeroScript execution in V language
- **Python**: HeroScript execution in Python

## Key Features

### 1. Actor Management

```rust
// Start all configured actors
lifecycle_manager.start_all_actors().await?;

// Stop all actors
lifecycle_manager.stop_all_actors().await?;

// Restart specific actor
lifecycle_manager.restart_actor("osis_actor_0").await?;

// Get actor status
let status = lifecycle_manager.get_actor_status("osis_actor_0").await?;
println!("Actor state: {:?}, PID: {}", status.state, status.pid);
```

### 2. Health Monitoring

The system automatically monitors actor health:

- Tracks last job execution time for each actor
- Sends ping jobs to actors idle for 10+ minutes
- Restarts actors that fail ping checks 3 times
- Updates job times when actors receive tasks

```rust
// Manual health check
lifecycle_manager.monitor_actor_health().await?;

// Update job time (called automatically by supervisor)
lifecycle_manager.update_actor_job_time("osis_actor_0");

// Start continuous health monitoring
lifecycle_manager.start_health_monitoring().await; // Runs forever
@@ -130,26 +130,26 @@ lifecycle_manager.start_health_monitoring().await; // Runs forever

### 3. Dynamic Scaling

Scale actors up or down based on demand:

```rust
// Scale OSIS actors to 5 instances
lifecycle_manager.scale_actors(&ScriptType::OSIS, 5).await?;

// Scale down SAL actors to 1 instance
lifecycle_manager.scale_actors(&ScriptType::SAL, 1).await?;

// Check current running count
let count = lifecycle_manager.get_running_actor_count(&ScriptType::V).await;
println!("Running V actors: {}", count);
```

### 4. Service Dependencies

Actors can depend on other services:

```rust
let config = ActorConfig::new(name, binary, script_type)
    .with_dependencies(vec![
        "redis".to_string(),
        "database".to_string(),
@@ -157,25 +157,25 @@ let config = ActorConfig::new(name, binary, script_type)
    ]);
```

Zinit ensures dependencies start before the actor.

## Integration with Supervisor

The lifecycle manager integrates seamlessly with the supervisor:

```rust
use hero_supervisor::{Supervisor, ActorLifecycleManager};

// Create supervisor and lifecycle manager
let supervisor = SupervisorBuilder::new().build()?;
let mut lifecycle_manager = ActorLifecycleManagerBuilder::new(zinit_socket)
    .with_supervisor(supervisor.clone())
    .build();

// Start actors
lifecycle_manager.start_all_actors().await?;

// Create and execute jobs (supervisor automatically routes to actors)
let job = supervisor
    .new_job()
    .script_type(ScriptType::OSIS)
@@ -191,15 +191,15 @@ println!("Job result: {}", result);

The lifecycle manager automatically creates Zinit service configurations:

```yaml
# Generated service config for osis_actor_0
exec: "/usr/local/bin/osis_actor --redis-url redis://localhost:6379 --actor-id osis_actor_0"
test: "/usr/local/bin/osis_actor --health-check"
oneshot: false  # Restart on exit
after:
  - redis
env:
  RUST_LOG: "info"
  ACTOR_TYPE: "osis"
```

## Error Handling

@@ -209,10 +209,10 @@ The system provides comprehensive error handling:

```rust
use hero_supervisor::SupervisorError;

match lifecycle_manager.start_actor(&config).await {
    Ok(_) => println!("Actor started successfully"),
    Err(SupervisorError::ActorStartFailed(actor, reason)) => {
        eprintln!("Failed to start {}: {}", actor, reason);
    }
    Err(e) => eprintln!("Other error: {}", e),
}
@@ -243,11 +243,11 @@ REDIS_URL=redis://localhost:6379 cargo run --example lifecycle_demo

redis-server
```

3. **Actor Binaries**: Compiled actor binaries for each script type
   - `/usr/local/bin/osis_actor`
   - `/usr/local/bin/sal_actor`
   - `/usr/local/bin/v_actor`
   - `/usr/local/bin/python_actor`

## Configuration Best Practices

@@ -267,15 +267,15 @@ REDIS_URL=redis://localhost:6379 cargo run --example lifecycle_demo
   - Check socket permissions: `ls -la /var/run/zinit.sock`
   - Verify socket path in configuration

2. **Actor Start Failed**
   - Check binary exists and is executable
   - Verify dependencies are running
   - Review Zinit logs: `zinit logs <service-name>`

3. **Health Check Failures**
   - Implement proper health check endpoint in actors
   - Verify health check command syntax
   - Check actor responsiveness

4. **Redis Connection Issues**
   - Ensure Redis is running and accessible
@@ -289,10 +289,10 @@ REDIS_URL=redis://localhost:6379 cargo run --example lifecycle_demo
zinit list

# View service logs
zinit logs osis_actor_0

# Check service status
zinit status osis_actor_0

# Monitor Redis queues
redis-cli keys "hero:job:*"
@@ -300,20 +300,20 @@ redis-cli keys "hero:job:*"

## Performance Considerations

- **Scaling**: Start with minimal actors and scale based on queue depth (see the sketch below)
- **Health Monitoring**: Adjust ping intervals based on workload patterns
- **Resource Usage**: Monitor CPU/memory usage of actor processes
- **Queue Depth**: Monitor Redis queue lengths for scaling decisions
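
To make the scaling guidance concrete, the rough sketch below polls one work queue's depth and adjusts the OSIS actor count with `scale_actors()`. It is only an illustration: the `redis` crate usage, the queue key, and the one-actor-per-ten-jobs policy are assumptions, not part of the supervisor.

```rust
use hero_supervisor::ScriptType;
use redis::AsyncCommands;

// Rough sketch only: scale OSIS actors from the observed queue depth.
// The queue key layout and the scaling policy are assumptions.
async fn scale_on_queue_depth(
    lifecycle_manager: &mut hero_supervisor::ActorLifecycleManager,
    redis_conn: &mut redis::aio::MultiplexedConnection,
) -> Result<(), Box<dyn std::error::Error>> {
    let depth: usize = redis_conn.llen("hero:work_queue:osis_actor_0").await?;
    let running = lifecycle_manager.get_running_actor_count(&ScriptType::OSIS).await;

    // Naive policy: one actor per ~10 queued jobs, between 1 and 5 actors.
    let target = (depth / 10 + 1).clamp(1, 5);
    if target != running {
        lifecycle_manager.scale_actors(&ScriptType::OSIS, target).await?;
    }
    Ok(())
}
```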

## Security

- **Process Isolation**: Zinit provides process isolation
- **User Permissions**: Run actors with appropriate user permissions
- **Network Security**: Secure Redis and Zinit socket access
- **Binary Validation**: Verify actor binary integrity before deployment

## Future

- **Load Balancing**: Dynamic scaling of actors based on demand
- **Service Dependencies**: Proper startup ordering with dependency management
@@ -1,60 +1,60 @@
# Hero Supervisor

The **Hero Supervisor** is responsible for supervising the lifecycle of actors and dispatching jobs to them via Redis queues.

## Overview

The system involves four primary actors:

1. **OSIS**: An actor that executes Rhai and HeroScript.
2. **SAL**: An actor that performs system abstraction layer functionalities using Rhai.
3. **V**: An actor that executes HeroScript in the V programming language.
4. **Python**: An actor that executes HeroScript in Python.

The Supervisor utilizes **zinit** to start and monitor these actors, ensuring they are running correctly.

### Key Features

- **Actor Lifecycle Supervision**: Oversee the lifecycle of actors, including starting, stopping, restarting, and load balancing based on job demand.
- **Job Supervision**: API for efficiently managing jobs dispatched to actors over Redis queues.

## Actor Lifecycle Supervision

The Supervisor oversees the lifecycle of the actors, ensuring they are operational and efficiently allocated. Load balancing is implemented to dynamically adjust the number of active actors based on job demand.

Additionally, the Supervisor implements health monitoring for actor engines: if an actor engine does not receive a job within 10 minutes, the Supervisor sends a ping job. The engine must respond immediately; if it fails to do so, the Supervisor restarts that engine.
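
The policy boils down to two small rules, sketched below as self-contained helpers. This is only an illustration of the thresholds described in this documentation, not the supervisor's internal implementation; the three-failure restart rule comes from the lifecycle documentation above.

```rust
use std::time::{Duration, Instant};

/// Sketch of the policy above: ping an actor idle for 10+ minutes,
/// and restart it after repeated ping failures.
fn needs_ping(last_job: Instant) -> bool {
    last_job.elapsed() >= Duration::from_secs(10 * 60)
}

fn should_restart(ping_failures: u32) -> bool {
    // The lifecycle docs restart an actor after 3 failed ping checks.
    ping_failures >= 3
}

fn main() {
    // In the supervisor, `last_job` is updated every time the actor receives work.
    let last_job = Instant::now();
    if needs_ping(last_job) {
        println!("actor idle for 10+ minutes: dispatch a ping job");
    } else {
        println!("actor is within its idle window; no ping needed");
    }
    assert!(should_restart(3));
}
```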

### Prerequisites

**Important**: Before running any lifecycle examples or using actor management features, you must start the Zinit daemon:

```bash
# Start Zinit daemon (required for actor lifecycle management)
sudo zinit init

# Or start Zinit with a custom socket path
sudo zinit --socket /var/run/zinit.sock init
```

**Note**: The Supervisor uses Zinit as the process manager for actor lifecycle operations. The default socket path is `/var/run/zinit.sock`, but you can configure a custom path using the `SupervisorBuilder::zinit_socket_path()` method.

**Troubleshooting**: If you get connection errors when running examples, ensure:

1. Zinit daemon is running (`zinit list` should work)
2. The socket path matches between Zinit and your Supervisor configuration
3. You have appropriate permissions to access the Zinit socket

### Supervisor API for Actor Lifecycle

The Supervisor provides the following methods for supervising the actor lifecycle (a usage sketch follows the list):

- **`start_actor()`**: Initializes and starts a specified actor.
- **`stop_actor()`**: Gracefully stops a specified actor.
- **`restart_actor()`**: Restarts a specified actor to ensure it operates correctly.
- **`get_actor_status()`**: Checks the status of a specific actor.
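
The sketch below strings these calls together. The exact signatures are assumed from this list and from the examples elsewhere in the repository, so treat it as illustrative rather than a stable API reference.

```rust
// Sketch only: assumes a built `Supervisor` and an actor service named
// "osis_actor_0"; method signatures are inferred from the list above.
async fn actor_lifecycle_roundtrip(
    supervisor: &hero_supervisor::Supervisor,
) -> Result<(), Box<dyn std::error::Error>> {
    supervisor.start_actor("osis_actor_0").await?;

    let status = supervisor.get_actor_status("osis_actor_0").await?;
    println!("state: {:?}, pid: {}", status.state, status.pid);

    supervisor.restart_actor("osis_actor_0").await?;
    supervisor.stop_actor("osis_actor_0").await?;
    Ok(())
}
```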

## Job Supervision

Jobs are dispatched to actors through their designated Redis queues, and the Supervisor provides an API for comprehensive job supervision.
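
For illustration, dispatching a single job end to end might look like the sketch below. The builder methods and `run_job_and_await_result` come from the examples and protocol description in this repository; the actor id and argument types are assumptions.

```rust
use hero_supervisor::ScriptType;
use std::time::Duration;

// Sketch: build one job and run it synchronously against a single actor queue.
async fn dispatch_example(
    supervisor: &hero_supervisor::Supervisor,
) -> Result<(), Box<dyn std::error::Error>> {
    let job = supervisor
        .new_job()
        .script_type(ScriptType::OSIS)
        .script_content("print(\"hello\");".to_string())
        .timeout(Duration::from_secs(30))
        .build()?;

    // Per the protocol: HSET hero:job:{job_id}, LPUSH hero:work_queue:{actor_id},
    // then BLPOP hero:reply:{job_id} until the actor pushes a result.
    let result = supervisor
        .run_job_and_await_result(job, "osis_actor_0".to_string())
        .await?;
    println!("Job result: {}", result);
    Ok(())
}
```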

### Supervisor API for Job Supervision

@@ -95,9 +95,9 @@ You can modify these in the example source code if your setup differs.
Jobs are managed within the `hero:` namespace in Redis:

- **`hero:job:{job_id}`**: Stores job parameters as a Redis hash.
- **`hero:work_queue:{actor_id}`**: Contains actor-specific job queues for dispatching jobs.
- **`hero:reply:{job_id}`**: Dedicated queues for job results.

## Prerequisites

- A Redis server must be accessible to both the Supervisor and the actors.
@@ -1,10 +1,10 @@
# Hero Supervisor Protocol

This document describes the Redis-based protocol used by the Hero Supervisor for job management and actor communication.

## Overview

The Hero Supervisor uses Redis as a message broker and data store for managing distributed job execution. Jobs are stored as Redis hashes, and communication with actors happens through Redis lists (queues).

## Redis Namespace

@@ -22,7 +22,7 @@ hero:job:{job_id}
**Job Hash Fields** (a creation sketch follows the list):

- `id`: Unique job identifier (UUID v4)
- `caller_id`: Identifier of the client that created the job
- `actor_id`: Target actor identifier
- `context_id`: Execution context identifier
- `script`: Script content to execute (Rhai or HeroScript)
- `timeout`: Execution timeout in seconds
@@ -35,8 +35,8 @@ hero:job:{job_id}
- `env_vars`: Environment variables as JSON object (optional)
- `prerequisites`: JSON array of job IDs that must complete before this job (optional)
- `dependents`: JSON array of job IDs that depend on this job completing (optional)
- `output`: Job execution result (set by actor)
- `error`: Error message if job failed (set by actor)
- `dependencies`: List of job IDs that this job depends on
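
As an illustration of these fields, a client could store and submit a minimal job like this (a sketch using the `redis` crate; only a subset of the fields above is set, and all values are placeholders):

```rust
use redis::Commands;

// Sketch: create a minimal job hash in the hero: namespace and submit it.
fn store_job_example() -> redis::RedisResult<()> {
    let client = redis::Client::open("redis://localhost:6379")?;
    let mut con = client.get_connection()?;

    let job_id = "550e8400-e29b-41d4-a716-446655440000"; // placeholder UUID v4
    let key = format!("hero:job:{}", job_id);

    con.hset_multiple::<_, _, _, ()>(&key, &[
        ("id", job_id),
        ("caller_id", "user123"),
        ("actor_id", "osis_actor_0"),
        ("context_id", "session456"),
        ("script", "print(\"hello\");"),
        ("timeout", "30"),
    ])?;

    // Submitting the job is a separate LPUSH onto the actor's work queue.
    con.lpush::<_, _, ()>(format!("hero:work_queue:{}", "osis_actor_0"), job_id)?;
    Ok(())
}
```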

### Job Dependencies

@@ -47,19 +47,19 @@ Jobs can have dependencies on other jobs, which are stored in the `dependencies`

Jobs are queued for execution using Redis lists:

```
hero:work_queue:{actor_id}
```

Actors listen on their specific queue using `BLPOP` for job IDs to process.
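
A minimal consumer-side sketch of that `BLPOP` loop, using the `redis` crate (the timeout value and error handling are arbitrary choices; a real actor would continue with the processing steps described under Job Lifecycle):

```rust
use redis::Commands;

// Sketch: block on the actor's work queue and pick up job IDs.
fn consume_work_queue(actor_id: &str) -> redis::RedisResult<()> {
    let client = redis::Client::open("redis://localhost:6379")?;
    let mut con = client.get_connection()?;
    let queue = format!("hero:work_queue:{}", actor_id);

    loop {
        // BLPOP returns (key, value), or None on timeout. Note: older versions
        // of the redis crate take the timeout as an integer number of seconds.
        let popped: Option<(String, String)> = con.blpop(&queue, 5.0)?;
        if let Some((_key, job_id)) = popped {
            println!("received job {}", job_id);
            // Fetch hero:job:{job_id} with HGETALL and execute it here.
        }
    }
}
```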

### Stop Queues

Job stop requests are sent through dedicated stop queues:

```
hero:stop_queue:{actor_id}
```

Actors monitor these queues to receive stop requests for running jobs.

### Reply Queues

@@ -68,7 +68,7 @@ For synchronous job execution, dedicated reply queues are used:
hero:reply:{job_id}
```

Actors send results to these queues when jobs complete.

## Job Lifecycle

@@ -79,20 +79,20 @@ Client -> Redis: HSET hero:job:{job_id} {job_fields}

### 2. Job Submission
```
Client -> Redis: LPUSH hero:work_queue:{actor_id} {job_id}
```

### 3. Job Processing
```
Actor -> Redis: BLPOP hero:work_queue:{actor_id}
Actor -> Redis: HSET hero:job:{job_id} status "started"
Actor: Execute script
Actor -> Redis: HSET hero:job:{job_id} status "finished" output "{result}"
```

### 4. Job Completion (Async)
```
Actor -> Redis: LPUSH hero:reply:{job_id} {result}
```

## API Operations

@@ -110,7 +110,7 @@ supervisor.list_jobs() -> Vec<String>
supervisor.stop_job(job_id) -> Result<(), SupervisorError>
```
**Redis Operations:**
- `LPUSH hero:stop_queue:{actor_id} {job_id}` - Send stop request

### Get Job Status
```rust
@@ -131,20 +131,20 @@ supervisor.get_job_logs(job_id) -> Result<Option<String>, SupervisorError>

### Run Job and Await Result
```rust
supervisor.run_job_and_await_result(job, actor_id) -> Result<String, SupervisorError>
```
**Redis Operations:**
1. `HSET hero:job:{job_id} {job_fields}` - Store job
2. `LPUSH hero:work_queue:{actor_id} {job_id}` - Submit job
3. `BLPOP hero:reply:{job_id} {timeout}` - Wait for result

## Actor Protocol

### Job Processing Loop
```rust
loop {
    // 1. Wait for job
    job_id = BLPOP hero:work_queue:{actor_id}

    // 2. Get job details
    job_data = HGETALL hero:job:{job_id}
@@ -153,8 +153,8 @@ loop {
    HSET hero:job:{job_id} status "started"

    // 4. Check for stop requests
    if LLEN hero:stop_queue:{actor_id} > 0 {
        stop_job_id = LPOP hero:stop_queue:{actor_id}
        if stop_job_id == job_id {
            HSET hero:job:{job_id} status "error" error "stopped"
            continue
@@ -175,15 +175,15 @@ loop {
```

### Stop Request Handling
Actors should periodically check the stop queue during long-running jobs:
```rust
if LLEN hero:stop_queue:{actor_id} > 0 {
    stop_requests = LRANGE hero:stop_queue:{actor_id} 0 -1
    if stop_requests.contains(current_job_id) {
        // Stop current job execution
        HSET hero:job:{current_job_id} status "error" error "stopped_by_request"
        // Remove stop request
        LREM hero:stop_queue:{actor_id} 1 current_job_id
        return
    }
}
@@ -193,17 +193,17 @@ if LLEN hero:stop_queue:{worker_id} > 0 {

### Job Timeouts
- Client sets timeout when creating job
- Actor should respect timeout and stop execution (illustrated below)
- If timeout exceeded: `HSET hero:job:{job_id} status "error" error "timeout"`
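
One way an actor could enforce the timeout is to wrap script execution in `tokio::time::timeout`, as sketched below. The script-execution body is a placeholder; only the mapping of expiry to the "error"/"timeout" status comes from the protocol above.

```rust
use std::time::Duration;
use tokio::time::timeout;

// Sketch: run a script under the job's timeout and map expiry to the
// protocol's "timeout" error status. The execution body is a placeholder.
async fn run_with_timeout(job_timeout_secs: u64, script: String) -> (String, Option<String>) {
    let run_script = async move {
        // ... execute the script and produce its output ...
        format!("executed: {}", script)
    };

    match timeout(Duration::from_secs(job_timeout_secs), run_script).await {
        Ok(output) => ("finished".to_string(), Some(output)),
        // Timeout exceeded: the actor would HSET status "error" error "timeout".
        Err(_elapsed) => ("error".to_string(), None),
    }
}
```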

### Actor Failures
- If an actor crashes, the job remains in "started" status
- Monitoring systems can detect stale jobs and retry
- Jobs can be requeued: `LPUSH hero:work_queue:{actor_id} {job_id}`

### Redis Connection Issues
- Clients should implement retry logic with exponential backoff (see the sketch after this list)
- Actors should reconnect and resume processing
- Use Redis persistence to survive Redis restarts
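
A minimal reconnect helper with exponential backoff might look like the sketch below (the base delay, cap, and attempt limit are arbitrary choices, not values prescribed by this protocol):

```rust
use std::time::Duration;

// Sketch: retry establishing a Redis connection with exponential backoff.
// Assumes `max_attempts >= 1`; delays are illustrative values.
fn connect_with_backoff(url: &str, max_attempts: u32) -> redis::RedisResult<redis::Connection> {
    let client = redis::Client::open(url)?;
    let mut delay = Duration::from_millis(100);

    for attempt in 1..=max_attempts {
        match client.get_connection() {
            Ok(con) => return Ok(con),
            Err(e) if attempt < max_attempts => {
                eprintln!("redis connect failed (attempt {}): {}", attempt, e);
                std::thread::sleep(delay);
                // Double the delay each time, capped at 10 seconds.
                delay = (delay * 2).min(Duration::from_secs(10));
            }
            Err(e) => return Err(e),
        }
    }
    unreachable!("the loop always returns on success or on the final error")
}
```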

## Monitoring and Observability

@@ -211,10 +211,10 @@ if LLEN hero:stop_queue:{worker_id} > 0 {
### Queue Monitoring
```bash
# Check work queue length
LLEN hero:work_queue:{actor_id}

# Check stop queue length
LLEN hero:stop_queue:{actor_id}

# List all jobs
KEYS hero:job:*
@@ -228,7 +228,7 @@ HGETALL hero:job:{job_id}
- Jobs completed per second
- Average job execution time
- Queue depths (sampled in the sketch below)
- Actor availability
- Error rates by job type
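
Queue depths, for instance, can be sampled straight from Redis. The sketch below assumes the actor ids are known from configuration; it is not part of the supervisor API.

```rust
use redis::Commands;

// Sketch: sample work-queue depths for a known set of actors.
fn sample_queue_depths(actor_ids: &[&str]) -> redis::RedisResult<Vec<(String, usize)>> {
    let client = redis::Client::open("redis://localhost:6379")?;
    let mut con = client.get_connection()?;

    let mut depths = Vec::new();
    for actor_id in actor_ids {
        let key = format!("hero:work_queue:{}", actor_id);
        let depth: usize = con.llen(&key)?;
        depths.push((actor_id.to_string(), depth));
    }
    Ok(depths)
}

// Example usage with actor names used elsewhere in these docs:
// let depths = sample_queue_depths(&["osis_actor_0", "sal_actor_0"])?;
```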

## Security Considerations

@@ -237,7 +237,7 @@ HGETALL hero:job:{job_id}
- Use Redis AUTH for authentication
- Enable TLS for Redis connections
- Restrict Redis network access
- Use Redis ACLs to limit actor permissions

### Job Security
- Validate script content before execution
@@ -265,8 +265,8 @@ HGETALL hero:job:{job_id}
- Batch similar jobs when possible
- Implement job prioritization if needed

### Actor Optimization
- Pool actor connections to Redis
- Use async I/O for Redis operations
- Implement graceful shutdown handling
- Monitor actor resource usage

@@ -1,6 +1,6 @@
# Hero Supervisor CLI Example

This example demonstrates how to use the `hive-supervisor` CLI tool for managing actors and jobs in the Hero ecosystem.

## Prerequisites

@@ -19,20 +19,20 @@ This example demonstrates how to use the `hive-supervisor` CLI tool for managing
# Follow Zinit installation instructions for your platform
```

3. **Actor Binaries**: The configuration references actor binaries that need to be available:
   - `/usr/local/bin/osis_actor`
   - `/usr/local/bin/sal_actor`
   - `/usr/local/bin/v_actor`
   - `/usr/local/bin/python_actor`

For testing purposes, you can create mock actor binaries or update the paths in `config.toml` to point to existing binaries.

## Configuration

The `config.toml` file contains the supervisor configuration:

- **Global settings**: Redis URL and Zinit socket path
- **Actor configurations**: Binary paths and environment variables for each actor type

## Usage Examples

@@ -43,29 +43,29 @@ The `config.toml` file contains the supervisor configuration:
cargo build --bin hive-supervisor --release
```

### 2. Actor Management

```bash
# Show help
./target/release/hive-supervisor --config examples/cli/config.toml --help

# List all configured actors
./target/release/hive-supervisor --config examples/cli/config.toml actors list

# Start all actors
./target/release/hive-supervisor --config examples/cli/config.toml actors start

# Start specific actors
./target/release/hive-supervisor --config examples/cli/config.toml actors start osis_actor sal_actor

# Check actor status
./target/release/hive-supervisor --config examples/cli/config.toml actors status

# Stop all actors
./target/release/hive-supervisor --config examples/cli/config.toml actors stop

# Restart specific actor
./target/release/hive-supervisor --config examples/cli/config.toml actors restart osis_actor
```

### 3. Job Management
@@ -73,7 +73,7 @@ cargo build --bin hive-supervisor --release
```bash
# Create a job with inline script
./target/release/hive-supervisor --config examples/cli/config.toml jobs create \
    --script 'print("Hello from OSIS actor!");' \
    --script-type osis \
    --caller-id "user123" \
    --context-id "session456"
@@ -118,18 +118,18 @@ cargo build --bin hive-supervisor --release

```bash
# Enable debug logging
./target/release/hive-supervisor --config examples/cli/config.toml -v actors status

# Enable trace logging
./target/release/hive-supervisor --config examples/cli/config.toml -vv actors status

# Disable timestamps
./target/release/hive-supervisor --config examples/cli/config.toml --no-timestamp actors status
```

## Sample Scripts

The `sample_scripts/` directory contains example scripts for different actor types:

- `hello_osis.rhai` - Simple OSIS/HeroScript example
- `system_sal.rhai` - SAL system operation example
@@ -148,9 +148,9 @@ The `sample_scripts/` directory contains example scripts for different worker ty
   - Verify Zinit is running and the socket path is correct
   - Check permissions on the socket file

3. **Actor Binary Not Found**
   - Update binary paths in `config.toml` to match your system
   - Ensure actor binaries are executable

4. **Permission Denied**
   - Check file permissions on configuration and binary files
@@ -161,7 +161,7 @@ The `sample_scripts/` directory contains example scripts for different worker ty
Run with verbose logging to see detailed operation information:

```bash
RUST_LOG=debug ./target/release/hive-supervisor --config examples/cli/config.toml -vv actors status
```

## Configuration Customization

@@ -170,15 +170,15 @@ You can customize the configuration for your environment:

1. **Update Redis URL**: Change `redis_url` in the `[global]` section
2. **Update Zinit Socket**: Change `zinit_socket_path` for your Zinit installation
3. **Actor Paths**: Update binary paths in actor sections to match your setup
4. **Environment Variables**: Add or modify environment variables for each actor type

## Integration with Hero Ecosystem

This CLI integrates with the broader Hero ecosystem:

- **Job Queue**: Uses Redis for job queuing and status tracking
- **Process Management**: Uses Zinit for actor lifecycle management
- **Script Execution**: Supports multiple script types (OSIS, SAL, V, Python)
- **Monitoring**: Provides real-time status and logging capabilities

@@ -1,19 +1,19 @@
# Hero Supervisor CLI Configuration Example
# This configuration demonstrates how to set up the hive-supervisor CLI
# with different actor types for script execution.

[global]
# Redis connection URL for job queuing
redis_url = "redis://localhost:6379"

# OSIS Actor Configuration
# Handles OSIS (HeroScript) execution
[osis_actor]
binary_path = "../../../target/debug/osis"
env_vars = { "RUST_LOG" = "info", "ACTOR_TYPE" = "osis", "MAX_CONCURRENT_JOBS" = "5" }

# SAL Actor Configuration
# Handles System Abstraction Layer scripts
[sal_actor]
binary_path = "../../../target/debug/sal"
env_vars = { "RUST_LOG" = "info", "ACTOR_TYPE" = "sal", "MAX_CONCURRENT_JOBS" = "3" }
@@ -58,25 +58,25 @@ fi
echo -e "${BLUE}=== CLI Help and Information ===${NC}"
run_cli "Show main help" --help

echo -e "${BLUE}=== Actor Management Examples ===${NC}"
run_cli "List configured actors" actors list
run_cli "Show actor management help" actors --help

# Note: These commands would require actual actor binaries and Zinit setup
echo -e "${YELLOW}Note: The following commands require actual actor binaries and Zinit setup${NC}"
echo -e "${YELLOW}They are shown for demonstration but may fail without proper setup${NC}"
echo

# Uncomment these if you have the proper setup
# run_cli "Check actor status" actors status
# run_cli "Start all actors" actors start
# run_cli "Check actor status after start" actors status

echo -e "${BLUE}=== Job Management Examples ===${NC}"
run_cli "Show job management help" jobs --help

# Create sample jobs (these will also require actors to be running)
echo -e "${YELLOW}Sample job creation commands (require running actors):${NC}"
echo

echo "# Create OSIS job with inline script:"
@@ -123,22 +123,22 @@ echo

echo -e "${BLUE}=== Verbose Logging Examples ===${NC}"
echo "# Debug logging:"
echo "$CLI_BINARY --config $CONFIG_FILE -v actors list"
echo
echo "# Trace logging:"
echo "$CLI_BINARY --config $CONFIG_FILE -vv actors list"
echo
echo "# No timestamps:"
echo "$CLI_BINARY --config $CONFIG_FILE --no-timestamp actors list"
echo

echo -e "${GREEN}=== Example Runner Complete ===${NC}"
echo -e "${YELLOW}To run actual commands, ensure you have:${NC}"
echo "1. Redis server running on localhost:6379"
echo "2. Zinit process manager installed and configured"
echo "3. Actor binaries available at the paths specified in config.toml"
echo
echo -e "${YELLOW}For testing without full setup, you can:${NC}"
echo "1. Update config.toml with paths to existing binaries"
echo "2. Use the CLI help commands and configuration validation"
echo "3. Test the REPL mode (requires actors to be running)"
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
"""
Sample Python script for demonstration
This script demonstrates Python actor functionality
"""

import json
@@ -9,7 +9,7 @@ import datetime
from typing import List, Dict

def main():
    print("=== Python Actor Demo ===")
    print("Python data processing operations")

    # Data structures

@@ -1,8 +1,8 @@
// Sample OSIS/HeroScript for demonstration
// This script demonstrates basic OSIS actor functionality

print("=== OSIS Actor Demo ===");
print("Hello from the OSIS actor!");

// Basic variable operations
let name = "Hero";

@@ -1,12 +1,12 @@
// Sample V language script for demonstration
// This script demonstrates V actor functionality

module main

import math

fn main() {
    println("=== V Actor Demo ===")
    println("V language mathematical operations")

    // Basic arithmetic

@@ -1,7 +1,7 @@
// Sample SAL (System Abstraction Layer) script for demonstration
// This script demonstrates system-level operations through SAL actor

print("=== SAL Actor Demo ===");
print("System Abstraction Layer operations");

// System information gathering
@@ -1,6 +1,6 @@
use hero_supervisor::{
    Supervisor, SupervisorBuilder, ActorConfig, ActorLifecycleManager,
    ActorLifecycleManagerBuilder, ScriptType
};
use log::{info, warn, error};
use std::collections::HashMap;
@@ -13,7 +13,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Initialize logging
    env_logger::init();

    info!("Starting Actor Lifecycle Management Demo");

    // Configuration
    let redis_url = "redis://localhost:6379";
@@ -25,154 +25,154 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
        .context_id("demo_context")
        .build()?;

    // Configure actors for different script types
    let mut actor_configs = Vec::new();

    // OSIS actors (Rhai/HeroScript)
    for i in 0..2 {
        let config = ActorConfig::new(
            format!("osis_actor_{}", i),
            PathBuf::from("/usr/local/bin/osis_actor"),
            ScriptType::OSIS,
        )
        .with_args(vec![
            "--redis-url".to_string(),
            redis_url.to_string(),
            "--actor-id".to_string(),
            format!("osis_actor_{}", i),
        ])
        .with_env({
            let mut env = HashMap::new();
            env.insert("RUST_LOG".to_string(), "info".to_string());
            env.insert("ACTOR_TYPE".to_string(), "osis".to_string());
            env
        })
        .with_health_check("/usr/local/bin/osis_actor --health-check".to_string())
        .with_dependencies(vec!["redis".to_string()]);

        actor_configs.push(config);
    }

    // SAL actors (System Abstraction Layer)
    for i in 0..3 {
        let config = ActorConfig::new(
            format!("sal_actor_{}", i),
            PathBuf::from("/usr/local/bin/sal_actor"),
            ScriptType::SAL,
        )
        .with_args(vec![
            "--redis-url".to_string(),
            redis_url.to_string(),
            "--actor-id".to_string(),
            format!("sal_actor_{}", i),
        ])
        .with_env({
            let mut env = HashMap::new();
            env.insert("RUST_LOG".to_string(), "info".to_string());
            env.insert("ACTOR_TYPE".to_string(), "sal".to_string());
            env
        })
        .with_health_check("/usr/local/bin/sal_actor --health-check".to_string())
        .with_dependencies(vec!["redis".to_string()]);

        actor_configs.push(config);
    }

    // V actors (HeroScript in V language)
    for i in 0..2 {
        let config = ActorConfig::new(
            format!("v_actor_{}", i),
            PathBuf::from("/usr/local/bin/v_actor"),
            ScriptType::V,
        )
        .with_args(vec![
            "--redis-url".to_string(),
            redis_url.to_string(),
            "--actor-id".to_string(),
            format!("v_actor_{}", i),
        ])
        .with_env({
            let mut env = HashMap::new();
            env.insert("RUST_LOG".to_string(), "info".to_string());
            env.insert("ACTOR_TYPE".to_string(), "v".to_string());
            env
        })
        .with_health_check("/usr/local/bin/v_actor --health-check".to_string())
        .with_dependencies(vec!["redis".to_string()]);

        actor_configs.push(config);
    }

    // Create lifecycle manager
    let mut lifecycle_manager = ActorLifecycleManagerBuilder::new(zinit_socket.to_string())
        .with_supervisor(supervisor.clone());

    // Add all actor configurations
    for config in actor_configs {
        lifecycle_manager = lifecycle_manager.add_actor(config);
    }

    let mut lifecycle_manager = lifecycle_manager.build();

    // Demonstrate lifecycle operations
    info!("=== Starting Actor Lifecycle Demo ===");

    // 1. Start all actors
    info!("1. Starting all actors...");
    match lifecycle_manager.start_all_actors().await {
        Ok(_) => info!("✅ All actors started successfully"),
        Err(e) => {
            error!("❌ Failed to start actors: {}", e);
            return Err(e.into());
        }
    }

    // Wait for actors to initialize
    sleep(Duration::from_secs(5)).await;

    // 2. Check actor status
    info!("2. Checking actor status...");
    match lifecycle_manager.get_all_actor_status().await {
        Ok(status_map) => {
            for (actor_name, status) in status_map {
                info!("   Actor '{}': State={:?}, PID={}", actor_name, status.state, status.pid);
            }
        }
        Err(e) => warn!("Failed to get actor status: {}", e),
    }

    // 3. Demonstrate scaling
    info!("3. Demonstrating actor scaling...");

    // Scale up OSIS actors
    info!("   Scaling up OSIS actors to 3...");
    if let Err(e) = lifecycle_manager.scale_actors(&ScriptType::OSIS, 3).await {
        warn!("Failed to scale OSIS actors: {}", e);
    }

    sleep(Duration::from_secs(3)).await;

    // Scale down SAL actors
    info!("   Scaling down SAL actors to 1...");
    if let Err(e) = lifecycle_manager.scale_actors(&ScriptType::SAL, 1).await {
        warn!("Failed to scale SAL actors: {}", e);
    }

    sleep(Duration::from_secs(3)).await;

    // 4. Check running actor counts
    info!("4. Checking running actor counts after scaling...");
    for script_type in [ScriptType::OSIS, ScriptType::SAL, ScriptType::V] {
        let count = lifecycle_manager.get_running_actor_count(&script_type).await;
        info!("   {:?}: {} actors running", script_type, count);
    }

    // 5. Demonstrate restart functionality
    info!("5. Demonstrating actor restart...");
    if let Err(e) = lifecycle_manager.restart_actor("osis_actor_0").await {
        warn!("Failed to restart actor: {}", e);
    } else {
        info!("   ✅ Successfully restarted osis_actor_0");
    }

    sleep(Duration::from_secs(3)).await;
@@ -180,12 +180,12 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // 6. Simulate job dispatch and health monitoring
    info!("6. Simulating job dispatch and health monitoring...");

    // Update job time for an actor (simulating job dispatch)
    lifecycle_manager.update_actor_job_time("sal_actor_0");
    info!("   Updated job time for sal_actor_0");

    // Perform health monitoring check
    if let Err(e) = lifecycle_manager.monitor_actor_health().await {
        warn!("Health monitoring failed: {}", e);
    } else {
        info!("   ✅ Health monitoring completed");
@@ -196,7 +196,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let test_job = supervisor
        .new_job()
        .script_type(ScriptType::OSIS)
        .script_content("println!(\"Hello from actor!\");".to_string())
        .timeout(Duration::from_secs(30))
        .build()?;

@@ -208,27 +208,27 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // 8. Demonstrate graceful shutdown
    info!("8. Demonstrating graceful shutdown...");

    // Stop specific actors
    info!("   Stopping specific actors...");
    for actor_name in ["osis_actor_1", "v_actor_0"] {
        if let Err(e) = lifecycle_manager.stop_actor(actor_name).await {
            warn!("Failed to stop actor {}: {}", actor_name, e);
        } else {
            info!("   ✅ Stopped actor: {}", actor_name);
        }
    }

    sleep(Duration::from_secs(2)).await;

    // Stop all remaining actors
    info!("   Stopping all remaining actors...");
    if let Err(e) = lifecycle_manager.stop_all_actors().await {
        error!("Failed to stop all actors: {}", e);
    } else {
        info!("   ✅ All actors stopped successfully");
    }

    info!("=== Actor Lifecycle Demo Completed ===");

    // Optional: Start health monitoring loop (commented out for demo)
    // info!("Starting health monitoring loop (Ctrl+C to stop)...");
@@ -8,44 +8,44 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
    info!("Starting Hero Supervisor Lifecycle Demo");

    // Build supervisor with simplified actor configuration
    // Actors are automatically launched during build
    let supervisor = SupervisorBuilder::new()
        .redis_url("redis://localhost:6379")
        .osis_actor("/usr/local/bin/osis_actor")
        .sal_actor("/usr/local/bin/sal_actor")
        .v_actor("/usr/local/bin/v_actor")
        .actor_env_var("REDIS_URL", "redis://localhost:6379")
        .actor_env_var("LOG_LEVEL", "info")
        .build().await?;

    info!("Supervisor created and actors launched successfully");

    // Wait a moment for actors to start
    sleep(Duration::from_secs(2)).await;

    // Check actor status using the simplified API
    info!("Checking actor status...");
    let actors = supervisor.get_actors(&[]).await;

    for actor in &actors {
        let status_info = if actor.is_running {
            format!("Running (PID: {})", actor.status.as_ref().map(|s| s.pid).unwrap_or(0))
        } else {
            "Stopped".to_string()
        };
        info!("   Actor '{}' ({:?}): {}", actor.config.name, actor.config.script_type, status_info);
    }

    // Demonstrate lifecycle operations with simplified API
    info!("=== Actor Lifecycle Operations ===");

    // 1. Demonstrate restart functionality
    info!("1. Demonstrating actor restart...");
    if let Err(e) = supervisor.restart_actor("osis_actor_1").await {
        error!("Failed to restart actor: {}", e);
    } else {
        info!("   ✅ Successfully restarted osis_actor_1");
    }

    sleep(Duration::from_secs(2)).await;
@@ -61,11 +61,11 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // 3. Demonstrate graceful shutdown
    info!("3. Demonstrating graceful shutdown...");

    // Stop specific actors
    if let Err(e) = supervisor.stop_actor("osis_actor_1").await {
        error!("Failed to stop actor: {}", e);
    } else {
        info!("   ✅ Actor stopped successfully");
    }

    info!("Demo completed successfully!");
@@ -1,18 +1,18 @@
|
||||
[global]
|
||||
redis_url = "redis://localhost:6379"
|
||||
|
||||
[osis_worker]
|
||||
binary_path = "/path/to/osis_worker"
|
||||
[osis_actor]
|
||||
binary_path = "/path/to/osis_actor"
|
||||
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }
|
||||
|
||||
[sal_worker]
|
||||
binary_path = "/path/to/sal_worker"
|
||||
[sal_actor]
|
||||
binary_path = "/path/to/sal_actor"
|
||||
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }
|
||||
|
||||
[v_worker]
|
||||
binary_path = "/path/to/v_worker"
|
||||
[v_actor]
|
||||
binary_path = "/path/to/v_actor"
|
||||
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }
|
||||
|
||||
[python_worker]
|
||||
binary_path = "/path/to/python_worker"
|
||||
[python_actor]
|
||||
binary_path = "/path/to/python_actor"
|
||||
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }
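For reference, a minimal sketch of deserializing this TOML into the typed config structs introduced later in this commit. Struct and field names mirror `SupervisorConfig`, `GlobalConfig`, and `ActorConfigToml` from the diff; the standalone `main()`, the `toml` and `serde` crate usage, and the omission of the `websocket_server` section are assumptions for illustration:

```rust
// Parse the supervisor TOML shown above into typed config structs.
use serde::Deserialize;
use std::collections::HashMap;

#[derive(Debug, Deserialize)]
struct GlobalConfig {
    redis_url: String,
}

#[derive(Debug, Deserialize)]
struct ActorConfigToml {
    binary_path: String,
    #[serde(default)]
    env_vars: HashMap<String, String>,
}

#[derive(Debug, Deserialize)]
struct SupervisorConfig {
    global: GlobalConfig,
    osis_actor: Option<ActorConfigToml>,
    sal_actor: Option<ActorConfigToml>,
    v_actor: Option<ActorConfigToml>,
    python_actor: Option<ActorConfigToml>,
    // websocket_server section omitted for brevity
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let raw = std::fs::read_to_string("supervisor.toml")?;
    let config: SupervisorConfig = toml::from_str(&raw)?;
    println!("redis: {}", config.global.redis_url);
    if let Some(osis) = &config.osis_actor {
        println!("osis actor binary: {}", osis.binary_path);
    }
    Ok(())
}
```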
|
||||
|
@@ -16,14 +16,14 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
info!("Supervisor created.");
|
||||
|
||||
let script_content = r#"
|
||||
// This script will never be executed by a worker because the recipient does not exist.
|
||||
// This script will never be executed by an actor because the recipient does not exist.
|
||||
let x = 10;
|
||||
let y = x + 32;
|
||||
y
|
||||
"#;
|
||||
|
||||
// The worker_id points to a worker queue that doesn't have a worker.
|
||||
let non_existent_recipient = "non_existent_worker_for_timeout_test";
|
||||
// The actor_id points to an actor queue that doesn't have an actor.
|
||||
let non_existent_recipient = "non_existent_actor_for_timeout_test";
|
||||
let very_short_timeout = Duration::from_secs(2);
|
||||
|
||||
info!(
|
||||
|
@@ -21,12 +21,12 @@ pub enum SupervisorError {
|
||||
InvalidInput(String),
|
||||
/// Job operation error
|
||||
JobError(hero_job::JobError),
|
||||
/// Worker lifecycle management errors
|
||||
WorkerStartFailed(String, String),
|
||||
WorkerStopFailed(String, String),
|
||||
WorkerRestartFailed(String, String),
|
||||
WorkerStatusFailed(String, String),
|
||||
WorkerNotFound(String),
|
||||
/// Actor lifecycle management errors
|
||||
ActorStartFailed(String, String),
|
||||
ActorStopFailed(String, String),
|
||||
ActorRestartFailed(String, String),
|
||||
ActorStatusFailed(String, String),
|
||||
ActorNotFound(String),
|
||||
PingJobFailed(String, String),
|
||||
/// Zinit client operation error
|
||||
ZinitError(String),
|
||||
@@ -73,23 +73,23 @@ impl std::fmt::Display for SupervisorError {
|
||||
SupervisorError::JobError(e) => {
|
||||
write!(f, "Job error: {}", e)
|
||||
}
|
||||
SupervisorError::WorkerStartFailed(worker, reason) => {
|
||||
write!(f, "Failed to start worker '{}': {}", worker, reason)
|
||||
SupervisorError::ActorStartFailed(actor, reason) => {
|
||||
write!(f, "Failed to start actor '{}': {}", actor, reason)
|
||||
}
|
||||
SupervisorError::WorkerStopFailed(worker, reason) => {
|
||||
write!(f, "Failed to stop worker '{}': {}", worker, reason)
|
||||
SupervisorError::ActorStopFailed(actor, reason) => {
|
||||
write!(f, "Failed to stop actor '{}': {}", actor, reason)
|
||||
}
|
||||
SupervisorError::WorkerRestartFailed(worker, reason) => {
|
||||
write!(f, "Failed to restart worker '{}': {}", worker, reason)
|
||||
SupervisorError::ActorRestartFailed(actor, reason) => {
|
||||
write!(f, "Failed to restart actor '{}': {}", actor, reason)
|
||||
}
|
||||
SupervisorError::WorkerStatusFailed(worker, reason) => {
|
||||
write!(f, "Failed to get status for worker '{}': {}", worker, reason)
|
||||
SupervisorError::ActorStatusFailed(actor, reason) => {
|
||||
write!(f, "Failed to get status for actor '{}': {}", actor, reason)
|
||||
}
|
||||
SupervisorError::WorkerNotFound(worker) => {
|
||||
write!(f, "Worker '{}' not found", worker)
|
||||
SupervisorError::ActorNotFound(actor) => {
|
||||
write!(f, "Actor '{}' not found", actor)
|
||||
}
|
||||
SupervisorError::PingJobFailed(worker, reason) => {
|
||||
write!(f, "Ping job failed for worker '{}': {}", worker, reason)
|
||||
SupervisorError::PingJobFailed(actor, reason) => {
|
||||
write!(f, "Ping job failed for actor '{}': {}", actor, reason)
|
||||
}
|
||||
SupervisorError::ZinitError(msg) => {
|
||||
write!(f, "Zinit error: {}", msg)
|
||||
|
@@ -16,7 +16,7 @@ mod lifecycle;
|
||||
|
||||
pub use crate::error::SupervisorError;
|
||||
pub use crate::job::JobBuilder;
|
||||
pub use crate::lifecycle::WorkerConfig;
|
||||
pub use crate::lifecycle::ActorConfig;
|
||||
// Re-export types from hero_job for public API
|
||||
pub use hero_job::{Job, JobStatus, ScriptType};
|
||||
|
||||
@@ -28,22 +28,22 @@ pub struct Supervisor {
|
||||
|
||||
pub struct SupervisorBuilder {
|
||||
redis_url: Option<String>,
|
||||
osis_worker: Option<String>,
|
||||
sal_worker: Option<String>,
|
||||
v_worker: Option<String>,
|
||||
python_worker: Option<String>,
|
||||
worker_env_vars: HashMap<String, String>,
|
||||
osis_actor: Option<String>,
|
||||
sal_actor: Option<String>,
|
||||
v_actor: Option<String>,
|
||||
python_actor: Option<String>,
|
||||
actor_env_vars: HashMap<String, String>,
|
||||
websocket_config: Option<WebSocketServerConfig>,
|
||||
}
|
||||
|
||||
/// Helper struct to pass builder data to worker launch method
|
||||
/// Helper struct to pass builder data to actor launch method
|
||||
#[derive(Clone)]
|
||||
struct SupervisorBuilderData {
|
||||
osis_worker: Option<String>,
|
||||
sal_worker: Option<String>,
|
||||
v_worker: Option<String>,
|
||||
python_worker: Option<String>,
|
||||
worker_env_vars: HashMap<String, String>,
|
||||
osis_actor: Option<String>,
|
||||
sal_actor: Option<String>,
|
||||
v_actor: Option<String>,
|
||||
python_actor: Option<String>,
|
||||
actor_env_vars: HashMap<String, String>,
|
||||
websocket_config: Option<WebSocketServerConfig>,
|
||||
}
|
||||
|
||||
@@ -52,10 +52,10 @@ struct SupervisorBuilderData {
|
||||
pub struct SupervisorConfig {
|
||||
pub global: GlobalConfig,
|
||||
pub websocket_server: Option<WebSocketServerConfig>,
|
||||
pub osis_worker: Option<WorkerConfigToml>,
|
||||
pub sal_worker: Option<WorkerConfigToml>,
|
||||
pub v_worker: Option<WorkerConfigToml>,
|
||||
pub python_worker: Option<WorkerConfigToml>,
|
||||
pub osis_actor: Option<ActorConfigToml>,
|
||||
pub sal_actor: Option<ActorConfigToml>,
|
||||
pub v_actor: Option<ActorConfigToml>,
|
||||
pub python_actor: Option<ActorConfigToml>,
|
||||
}
|
||||
|
||||
/// Global configuration section
|
||||
@@ -64,12 +64,10 @@ pub struct GlobalConfig {
|
||||
pub redis_url: String,
|
||||
}
|
||||
|
||||
/// Worker configuration section in TOML
|
||||
/// Actor configuration section in TOML
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
pub struct WorkerConfigToml {
|
||||
pub struct ActorConfigToml {
|
||||
pub binary_path: String,
|
||||
#[serde(default)]
|
||||
pub env_vars: HashMap<String, String>,
|
||||
}
|
||||
|
||||
/// WebSocket server configuration section in TOML
|
||||
@@ -127,11 +125,11 @@ impl SupervisorBuilder {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
redis_url: None,
|
||||
osis_worker: None,
|
||||
sal_worker: None,
|
||||
v_worker: None,
|
||||
python_worker: None,
|
||||
worker_env_vars: HashMap::new(),
|
||||
osis_actor: None,
|
||||
sal_actor: None,
|
||||
v_actor: None,
|
||||
python_actor: None,
|
||||
actor_env_vars: HashMap::new(),
|
||||
websocket_config: None,
|
||||
}
|
||||
}
|
||||
@@ -147,25 +145,21 @@ impl SupervisorBuilder {
|
||||
let mut builder = Self::new()
|
||||
.redis_url(&config.global.redis_url);
|
||||
|
||||
// Configure workers based on TOML config
|
||||
if let Some(osis_config) = config.osis_worker {
|
||||
builder = builder.osis_worker(&osis_config.binary_path)
|
||||
.worker_env_vars(osis_config.env_vars);
|
||||
// Configure actors based on TOML config
|
||||
if let Some(osis_config) = config.osis_actor {
|
||||
builder = builder.osis_actor(&osis_config.binary_path);
|
||||
}
|
||||
|
||||
if let Some(sal_config) = config.sal_worker {
|
||||
builder = builder.sal_worker(&sal_config.binary_path)
|
||||
.worker_env_vars(sal_config.env_vars);
|
||||
if let Some(sal_config) = config.sal_actor {
|
||||
builder = builder.sal_actor(&sal_config.binary_path);
|
||||
}
|
||||
|
||||
if let Some(v_config) = config.v_worker {
|
||||
builder = builder.v_worker(&v_config.binary_path)
|
||||
.worker_env_vars(v_config.env_vars);
|
||||
if let Some(v_config) = config.v_actor {
|
||||
builder = builder.v_actor(&v_config.binary_path);
|
||||
}
|
||||
|
||||
if let Some(python_config) = config.python_worker {
|
||||
builder = builder.python_worker(&python_config.binary_path)
|
||||
.worker_env_vars(python_config.env_vars);
|
||||
if let Some(python_config) = config.python_actor {
|
||||
builder = builder.python_actor(&python_config.binary_path);
|
||||
}
|
||||
|
||||
// Store WebSocket configuration for later use
|
||||
@@ -176,28 +170,28 @@ impl SupervisorBuilder {
|
||||
Ok(builder)
|
||||
}
|
||||
|
||||
/// Validate that all configured worker binaries exist and are executable
|
||||
fn validate_worker_binaries(&self) -> Result<(), SupervisorError> {
|
||||
let workers = [
|
||||
("OSIS", &self.osis_worker),
|
||||
("SAL", &self.sal_worker),
|
||||
("V", &self.v_worker),
|
||||
("Python", &self.python_worker),
|
||||
/// Validate that all configured actor binaries exist and are executable
|
||||
fn validate_actor_binaries(&self) -> Result<(), SupervisorError> {
|
||||
let actors = [
|
||||
("OSIS", &self.osis_actor),
|
||||
("SAL", &self.sal_actor),
|
||||
("V", &self.v_actor),
|
||||
("Python", &self.python_actor),
|
||||
];
|
||||
|
||||
for (worker_type, binary_path) in workers {
|
||||
for (actor_type, binary_path) in actors {
|
||||
if let Some(path) = binary_path {
|
||||
let path_obj = Path::new(path);
|
||||
|
||||
if !path_obj.exists() {
|
||||
return Err(SupervisorError::ConfigError(
|
||||
format!("{} worker binary does not exist: {}", worker_type, path)
|
||||
format!("{} actor binary does not exist: {}", actor_type, path)
|
||||
));
|
||||
}
|
||||
|
||||
if !path_obj.is_file() {
|
||||
return Err(SupervisorError::ConfigError(
|
||||
format!("{} worker path is not a file: {}", worker_type, path)
|
||||
format!("{} actor path is not a file: {}", actor_type, path)
|
||||
));
|
||||
}
|
||||
|
||||
@@ -207,19 +201,19 @@ impl SupervisorBuilder {
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
let metadata = path_obj.metadata().map_err(|e| {
|
||||
SupervisorError::ConfigError(
|
||||
format!("Failed to read metadata for {} worker binary {}: {}", worker_type, path, e)
|
||||
format!("Failed to read metadata for {} actor binary {}: {}", actor_type, path, e)
|
||||
)
|
||||
})?;
|
||||
|
||||
let permissions = metadata.permissions();
|
||||
if permissions.mode() & 0o111 == 0 {
|
||||
return Err(SupervisorError::ConfigError(
|
||||
format!("{} worker binary is not executable: {}", worker_type, path)
|
||||
format!("{} actor binary is not executable: {}", actor_type, path)
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
info!("Validated {} worker binary: {}", worker_type, path);
|
||||
info!("Validated {} actor binary: {}", actor_type, path);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -231,48 +225,48 @@ impl SupervisorBuilder {
|
||||
self
|
||||
}
|
||||
|
||||
pub fn osis_worker(mut self, binary_path: &str) -> Self {
|
||||
self.osis_worker = Some(binary_path.to_string());
|
||||
pub fn osis_actor(mut self, binary_path: &str) -> Self {
|
||||
self.osis_actor = Some(binary_path.to_string());
|
||||
self
|
||||
}
|
||||
|
||||
pub fn sal_worker(mut self, binary_path: &str) -> Self {
|
||||
self.sal_worker = Some(binary_path.to_string());
|
||||
pub fn sal_actor(mut self, binary_path: &str) -> Self {
|
||||
self.sal_actor = Some(binary_path.to_string());
|
||||
self
|
||||
}
|
||||
|
||||
pub fn v_worker(mut self, binary_path: &str) -> Self {
|
||||
self.v_worker = Some(binary_path.to_string());
|
||||
pub fn v_actor(mut self, binary_path: &str) -> Self {
|
||||
self.v_actor = Some(binary_path.to_string());
|
||||
self
|
||||
}
|
||||
|
||||
pub fn python_worker(mut self, binary_path: &str) -> Self {
|
||||
self.python_worker = Some(binary_path.to_string());
|
||||
pub fn python_actor(mut self, binary_path: &str) -> Self {
|
||||
self.python_actor = Some(binary_path.to_string());
|
||||
self
|
||||
}
|
||||
|
||||
pub fn worker_env_var(mut self, key: &str, value: &str) -> Self {
|
||||
self.worker_env_vars.insert(key.to_string(), value.to_string());
|
||||
pub fn actor_env_var(mut self, key: &str, value: &str) -> Self {
|
||||
self.actor_env_vars.insert(key.to_string(), value.to_string());
|
||||
self
|
||||
}
|
||||
|
||||
pub fn worker_env_vars(mut self, env_vars: HashMap<String, String>) -> Self {
|
||||
self.worker_env_vars.extend(env_vars);
|
||||
pub fn actor_env_vars(mut self, env_vars: HashMap<String, String>) -> Self {
|
||||
self.actor_env_vars.extend(env_vars);
|
||||
self
|
||||
}
|
||||
|
||||
/// Builds the final `Supervisor` instance synchronously.
|
||||
///
|
||||
/// This method validates the configuration, checks worker binary existence,
|
||||
/// and creates the Redis client. Worker launching is deferred to the `start_workers()` method.
|
||||
/// This method validates the configuration, checks actor binary existence,
|
||||
/// and creates the Redis client. Actor launching is deferred to the `start_actors()` method.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// * `Ok(Supervisor)` - Successfully configured client with valid binaries
|
||||
/// * `Err(SupervisorError)` - Configuration, binary validation, or connection error
|
||||
pub async fn build(self) -> Result<Supervisor, SupervisorError> {
|
||||
// Validate that all configured worker binaries exist first
|
||||
Self::validate_worker_binaries(&self)?;
|
||||
// Validate that all configured actor binaries exist first
|
||||
Self::validate_actor_binaries(&self)?;
|
||||
|
||||
let url = self.redis_url
|
||||
.unwrap_or_else(|| "redis://127.0.0.1/".to_string());
|
||||
@@ -281,13 +275,13 @@ impl SupervisorBuilder {
|
||||
let zinit_client = ZinitClient::unix_socket("/tmp/zinit.sock").await
|
||||
.map_err(|e| SupervisorError::ZinitError(format!("Failed to create Zinit client: {}", e)))?;
|
||||
|
||||
// Store builder data for later use in start_workers()
|
||||
// Store builder data for later use in start_actors()
|
||||
let builder_data = SupervisorBuilderData {
|
||||
osis_worker: self.osis_worker,
|
||||
sal_worker: self.sal_worker,
|
||||
v_worker: self.v_worker,
|
||||
python_worker: self.python_worker,
|
||||
worker_env_vars: self.worker_env_vars,
|
||||
osis_actor: self.osis_actor,
|
||||
sal_actor: self.sal_actor,
|
||||
v_actor: self.v_actor,
|
||||
python_actor: self.python_actor,
|
||||
actor_env_vars: self.actor_env_vars,
|
||||
websocket_config: self.websocket_config,
|
||||
};
|
||||
|
||||
@@ -302,10 +296,10 @@ impl SupervisorBuilder {
|
||||
}
|
||||
|
||||
impl Supervisor {
|
||||
/// Start all configured workers asynchronously.
|
||||
/// This method should be called after build() to launch the workers.
|
||||
pub async fn start_workers(&self) -> Result<(), SupervisorError> {
|
||||
info!("Starting Hero Supervisor workers...");
|
||||
/// Start all configured actors asynchronously.
|
||||
/// This method should be called after build() to launch the actors.
|
||||
pub async fn start_actors(&self) -> Result<(), SupervisorError> {
|
||||
info!("Starting Hero Supervisor actors...");
|
||||
|
||||
// Test Zinit connection first
|
||||
info!("Testing Zinit connection at /tmp/zinit.sock...");
|
||||
@@ -319,102 +313,102 @@ impl Supervisor {
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up any existing worker services first
|
||||
info!("Cleaning up existing worker services...");
|
||||
self.cleanup_existing_workers().await?;
|
||||
// Clean up any existing actor services first
|
||||
info!("Cleaning up existing actor services...");
|
||||
self.cleanup_existing_actors().await?;
|
||||
|
||||
// Launch configured workers if builder data is available
|
||||
// Launch configured actors if builder data is available
|
||||
if let Some(builder_data) = &self.builder_data {
|
||||
info!("Launching configured workers...");
|
||||
self.launch_configured_workers(builder_data).await?;
|
||||
info!("Launching configured actors...");
|
||||
self.launch_configured_actors(builder_data).await?;
|
||||
} else {
|
||||
warn!("No builder data available, no workers to start");
|
||||
warn!("No builder data available, no actors to start");
|
||||
}
|
||||
|
||||
info!("All workers started successfully!");
|
||||
info!("All actors started successfully!");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Clean up all worker services from zinit on program exit
|
||||
/// Clean up all actor services from zinit on program exit
|
||||
pub async fn cleanup_and_shutdown(&self) -> Result<(), SupervisorError> {
|
||||
info!("Cleaning up worker services before shutdown...");
|
||||
info!("Cleaning up actor services before shutdown...");
|
||||
|
||||
let worker_names = vec![
|
||||
"osis_worker_1",
|
||||
"sal_worker_1",
|
||||
"v_worker_1",
|
||||
"python_worker_1"
|
||||
let actor_names = vec![
|
||||
"osis_actor_1",
|
||||
"sal_actor_1",
|
||||
"v_actor_1",
|
||||
"python_actor_1"
|
||||
];
|
||||
|
||||
for worker_name in worker_names {
|
||||
if let Err(e) = self.stop_and_delete_worker(worker_name).await {
|
||||
warn!("Failed to cleanup worker {}: {}", worker_name, e);
|
||||
for actor_name in actor_names {
|
||||
if let Err(e) = self.stop_and_delete_actor(actor_name).await {
|
||||
warn!("Failed to cleanup actor {}: {}", actor_name, e);
|
||||
}
|
||||
}
|
||||
|
||||
info!("Worker cleanup completed");
|
||||
info!("Actor cleanup completed");
|
||||
Ok(())
|
||||
}
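Since `cleanup_and_shutdown` stops and deletes the hardcoded actor services, a typical call site is a shutdown hook. A minimal sketch, assuming a tokio runtime and Ctrl+C as the shutdown signal:

```rust
// Run until Ctrl+C, then remove the actor services from zinit before exiting.
// tokio::signal and the surrounding async context are assumptions;
// cleanup_and_shutdown is the method defined above.
tokio::signal::ctrl_c().await?;
info!("Shutdown signal received, cleaning up actors...");
supervisor.cleanup_and_shutdown().await?;
```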
|
||||
|
||||
/// Clean up any existing worker services on startup
|
||||
async fn cleanup_existing_workers(&self) -> Result<(), SupervisorError> {
|
||||
info!("Cleaning up any existing worker services...");
|
||||
/// Clean up any existing actor services on startup
|
||||
async fn cleanup_existing_actors(&self) -> Result<(), SupervisorError> {
|
||||
info!("Cleaning up any existing actor services...");
|
||||
|
||||
let worker_names = vec![
|
||||
"osis_worker_1",
|
||||
"sal_worker_1",
|
||||
"v_worker_1",
|
||||
"python_worker_1"
|
||||
let actor_names = vec![
|
||||
"osis_actor_1",
|
||||
"sal_actor_1",
|
||||
"v_actor_1",
|
||||
"python_actor_1"
|
||||
];
|
||||
|
||||
for worker_name in worker_names {
|
||||
for actor_name in actor_names {
|
||||
// Try to stop and delete, but don't fail if they don't exist
|
||||
info!("Attempting to cleanup worker: {}", worker_name);
|
||||
match self.stop_and_delete_worker(worker_name).await {
|
||||
Ok(_) => info!("Successfully cleaned up worker: {}", worker_name),
|
||||
Err(e) => debug!("Failed to cleanup worker {}: {}", worker_name, e),
|
||||
info!("Attempting to cleanup actor: {}", actor_name);
|
||||
match self.stop_and_delete_actor(actor_name).await {
|
||||
Ok(_) => info!("Successfully cleaned up actor: {}", actor_name),
|
||||
Err(e) => debug!("Failed to cleanup actor {}: {}", actor_name, e),
|
||||
}
|
||||
}
|
||||
|
||||
info!("Existing worker cleanup completed");
|
||||
info!("Existing actor cleanup completed");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Stop and delete a worker service from zinit
|
||||
async fn stop_and_delete_worker(&self, worker_name: &str) -> Result<(), SupervisorError> {
|
||||
info!("Starting cleanup for worker: {}", worker_name);
|
||||
/// Stop and delete an actor service from zinit
|
||||
async fn stop_and_delete_actor(&self, actor_name: &str) -> Result<(), SupervisorError> {
|
||||
info!("Starting cleanup for actor: {}", actor_name);
|
||||
|
||||
// First try to stop the worker
|
||||
info!("Attempting to stop worker: {}", worker_name);
|
||||
if let Err(e) = self.zinit_client.stop(worker_name).await {
|
||||
debug!("Worker {} was not running or failed to stop: {}", worker_name, e);
|
||||
// First try to stop the actor
|
||||
info!("Attempting to stop actor: {}", actor_name);
|
||||
if let Err(e) = self.zinit_client.stop(actor_name).await {
|
||||
debug!("Actor {} was not running or failed to stop: {}", actor_name, e);
|
||||
} else {
|
||||
info!("Successfully stopped worker: {}", worker_name);
|
||||
info!("Successfully stopped actor: {}", actor_name);
|
||||
}
|
||||
|
||||
// Then forget the service to stop monitoring it
|
||||
info!("Attempting to forget worker: {}", worker_name);
|
||||
if let Err(e) = self.zinit_client.forget(worker_name).await {
|
||||
info!("Worker {} was not being monitored or failed to forget: {}", worker_name, e);
|
||||
info!("Attempting to forget actor: {}", actor_name);
|
||||
if let Err(e) = self.zinit_client.forget(actor_name).await {
|
||||
info!("Actor {} was not being monitored or failed to forget: {}", actor_name, e);
|
||||
} else {
|
||||
info!("Successfully forgot worker service: {}", worker_name);
|
||||
info!("Successfully forgot actor service: {}", actor_name);
|
||||
}
|
||||
|
||||
// Finally, delete the service configuration
|
||||
info!("Attempting to delete service for worker: {}", worker_name);
|
||||
if let Err(e) = self.zinit_client.delete_service(worker_name).await {
|
||||
debug!("Worker {} service did not exist or failed to delete: {}", worker_name, e);
|
||||
info!("Attempting to delete service for actor: {}", actor_name);
|
||||
if let Err(e) = self.zinit_client.delete_service(actor_name).await {
|
||||
debug!("Actor {} service did not exist or failed to delete: {}", actor_name, e);
|
||||
} else {
|
||||
info!("Successfully deleted worker service: {}", worker_name);
|
||||
info!("Successfully deleted actor service: {}", actor_name);
|
||||
}
|
||||
|
||||
info!("Completed cleanup for worker: {}", worker_name);
|
||||
info!("Completed cleanup for actor: {}", actor_name);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get the hardcoded worker queue key for the script type
|
||||
fn get_worker_queue_key(&self, script_type: &ScriptType) -> String {
|
||||
format!("{}worker_queue:{}", NAMESPACE_PREFIX, script_type.worker_queue_suffix())
|
||||
/// Get the hardcoded actor queue key for the script type
|
||||
fn get_actor_queue_key(&self, script_type: &ScriptType) -> String {
|
||||
format!("{}actor_queue:{}", NAMESPACE_PREFIX, script_type.actor_queue_suffix())
|
||||
}
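Only the key shape `{prefix}actor_queue:{suffix}` is defined above; the consumer side lives in the actor binaries. A hedged sketch of that side, where the `hero:` prefix and `osis` suffix are illustrative guesses at `NAMESPACE_PREFIX` and `ScriptType::actor_queue_suffix()`:

```rust
// Consumer-side sketch: an actor blocking on its script-type queue for job IDs
// pushed by the supervisor.
async fn poll_actor_queue(redis_url: &str) -> redis::RedisResult<()> {
    let client = redis::Client::open(redis_url)?;
    let mut conn = client.get_multiplexed_async_connection().await?;
    let queue_key = "hero:actor_queue:osis"; // assumed concrete key

    loop {
        // BLPOP blocks up to 5 seconds and returns (key, job_id) when a job is queued.
        let popped: Option<(String, String)> = redis::cmd("BLPOP")
            .arg(&queue_key)
            .arg(5)
            .query_async(&mut conn)
            .await?;
        if let Some((_, job_id)) = popped {
            println!("picked up job {job_id}");
            // ...load the job hash, execute the script, push the result to the reply queue...
        }
    }
}
```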
|
||||
|
||||
pub fn new_job(&self) -> JobBuilder {
|
||||
@@ -432,63 +426,58 @@ impl Supervisor {
|
||||
})
|
||||
}
|
||||
|
||||
/// Extract worker configurations from the supervisor's builder data
|
||||
pub fn get_worker_configs(&self) -> Result<Vec<WorkerConfig>, SupervisorError> {
|
||||
/// Extract actor configurations from the supervisor's builder data
|
||||
pub fn get_actor_configs(&self) -> Result<Vec<ActorConfig>, SupervisorError> {
|
||||
let builder_data = self.builder_data.as_ref().ok_or_else(|| {
|
||||
SupervisorError::ConfigError("No builder data available for worker configs".to_string())
|
||||
SupervisorError::ConfigError("No builder data available for actor configs".to_string())
|
||||
})?;
|
||||
|
||||
let mut configs = Vec::new();
|
||||
let env_vars = builder_data.worker_env_vars.clone();
|
||||
|
||||
if let Some(osis_path) = &builder_data.osis_worker {
|
||||
if let Some(osis_path) = &builder_data.osis_actor {
|
||||
configs.push(
|
||||
WorkerConfig::new("osis_worker_1".to_string(), PathBuf::from(osis_path), ScriptType::OSIS)
|
||||
.with_env(env_vars.clone())
|
||||
ActorConfig::new("osis_actor_1".to_string(), PathBuf::from(osis_path), ScriptType::OSIS)
|
||||
);
|
||||
}
|
||||
|
||||
if let Some(sal_path) = &builder_data.sal_worker {
|
||||
if let Some(sal_path) = &builder_data.sal_actor {
|
||||
configs.push(
|
||||
WorkerConfig::new("sal_worker_1".to_string(), PathBuf::from(sal_path), ScriptType::SAL)
|
||||
.with_env(env_vars.clone())
|
||||
ActorConfig::new("sal_actor_1".to_string(), PathBuf::from(sal_path), ScriptType::SAL)
|
||||
);
|
||||
}
|
||||
|
||||
if let Some(v_path) = &builder_data.v_worker {
|
||||
if let Some(v_path) = &builder_data.v_actor {
|
||||
configs.push(
|
||||
WorkerConfig::new("v_worker_1".to_string(), PathBuf::from(v_path), ScriptType::V)
|
||||
.with_env(env_vars.clone())
|
||||
ActorConfig::new("v_actor_1".to_string(), PathBuf::from(v_path), ScriptType::V)
|
||||
);
|
||||
}
|
||||
|
||||
if let Some(python_path) = &builder_data.python_worker {
|
||||
if let Some(python_path) = &builder_data.python_actor {
|
||||
configs.push(
|
||||
WorkerConfig::new("python_worker_1".to_string(), PathBuf::from(python_path), ScriptType::Python)
|
||||
.with_env(env_vars.clone())
|
||||
ActorConfig::new("python_actor_1".to_string(), PathBuf::from(python_path), ScriptType::Python)
|
||||
);
|
||||
}
|
||||
|
||||
Ok(configs)
|
||||
}
|
||||
|
||||
/// Spawn a background lifecycle manager that continuously monitors and maintains worker health
|
||||
/// Spawn a background lifecycle manager that continuously monitors and maintains actor health
|
||||
/// Returns a JoinHandle that can be used to stop the lifecycle manager
|
||||
pub fn spawn_lifecycle_manager(
|
||||
self: Arc<Self>,
|
||||
worker_configs: Vec<WorkerConfig>,
|
||||
actor_configs: Vec<ActorConfig>,
|
||||
health_check_interval: Duration,
|
||||
) -> tokio::task::JoinHandle<Result<(), SupervisorError>> {
|
||||
let supervisor = self;
|
||||
|
||||
tokio::spawn(async move {
|
||||
info!("Starting background lifecycle manager with {} workers", worker_configs.len());
|
||||
info!("Starting background lifecycle manager with {} actors", actor_configs.len());
|
||||
info!("Health check interval: {:?}", health_check_interval);
|
||||
|
||||
// Initial worker startup
|
||||
info!("Performing initial worker startup...");
|
||||
if let Err(e) = supervisor.start_workers().await {
|
||||
error!("Failed to start workers during initialization: {}", e);
|
||||
// Initial actor startup
|
||||
info!("Performing initial actor startup...");
|
||||
if let Err(e) = supervisor.start_actors().await {
|
||||
error!("Failed to start actors during initialization: {}", e);
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
@@ -499,12 +488,12 @@ impl Supervisor {
|
||||
loop {
|
||||
interval.tick().await;
|
||||
|
||||
info!("Running periodic worker health check...");
|
||||
info!("Running periodic actor health check...");
|
||||
|
||||
// Check each worker's health and restart if needed
|
||||
for worker_config in &worker_configs {
|
||||
if let Err(e) = supervisor.check_and_restart_worker(worker_config).await {
|
||||
error!("Failed to check/restart worker {}: {}", worker_config.name, e);
|
||||
// Check each actor's health and restart if needed
|
||||
for actor_config in &actor_configs {
|
||||
if let Err(e) = supervisor.check_and_restart_actor(actor_config).await {
|
||||
error!("Failed to check/restart actor {}: {}", actor_config.name, e);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -513,59 +502,59 @@ impl Supervisor {
|
||||
})
|
||||
}
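A sketch of wiring the background lifecycle manager into an application, using only APIs shown in this diff (`SupervisorBuilder`, `get_actor_configs`, `spawn_lifecycle_manager`, `cleanup_and_shutdown`). The binary path and the 10-minute health-check interval are illustrative:

```rust
use hero_supervisor::SupervisorBuilder;
use std::sync::Arc;
use std::time::Duration;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let supervisor = Arc::new(
        SupervisorBuilder::new()
            .redis_url("redis://localhost:6379")
            .osis_actor("/usr/local/bin/osis_actor")
            .actor_env_var("RUST_LOG", "info")
            .build()
            .await?,
    );

    // Hand the configs derived from the builder to the background task; the returned
    // JoinHandle can be aborted (or awaited) when the application shuts down.
    let actor_configs = supervisor.get_actor_configs()?;
    let lifecycle = supervisor
        .clone()
        .spawn_lifecycle_manager(actor_configs, Duration::from_secs(600));

    // ...dispatch jobs with supervisor.new_job() in the meantime...

    lifecycle.abort();
    supervisor.cleanup_and_shutdown().await?;
    Ok(())
}
```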
|
||||
|
||||
/// Check a single worker's health and restart if needed
|
||||
async fn check_and_restart_worker(&self, worker_config: &WorkerConfig) -> Result<(), SupervisorError> {
|
||||
let worker_name = &worker_config.name;
|
||||
/// Check a single actor's health and restart if needed
|
||||
async fn check_and_restart_actor(&self, actor_config: &ActorConfig) -> Result<(), SupervisorError> {
|
||||
let actor_name = &actor_config.name;
|
||||
|
||||
// Get worker status
|
||||
match self.zinit_client.status(worker_name).await {
|
||||
// Get actor status
|
||||
match self.zinit_client.status(actor_name).await {
|
||||
Ok(status) => {
|
||||
let is_healthy = status.state == "running" && status.pid > 0;
|
||||
|
||||
if is_healthy {
|
||||
debug!("Worker {} is healthy (state: {}, pid: {})", worker_name, status.state, status.pid);
|
||||
debug!("Actor {} is healthy (state: {}, pid: {})", actor_name, status.state, status.pid);
|
||||
|
||||
// Optionally send a ping job for deeper health check
|
||||
if let Err(e) = self.send_ping_job(worker_config.script_type.clone()).await {
|
||||
warn!("Ping job failed for worker {}: {}", worker_name, e);
|
||||
if let Err(e) = self.send_ping_job(actor_config.script_type.clone()).await {
|
||||
warn!("Ping job failed for actor {}: {}", actor_name, e);
|
||||
// Note: We don't restart on ping failure as it might be temporary
|
||||
}
|
||||
} else {
|
||||
warn!("Worker {} is unhealthy (state: {}, pid: {}), restarting...",
|
||||
worker_name, status.state, status.pid);
|
||||
warn!("Actor {} is unhealthy (state: {}, pid: {}), restarting...",
|
||||
actor_name, status.state, status.pid);
|
||||
|
||||
// Attempt to restart the worker
|
||||
if let Err(e) = self.restart_worker(worker_name).await {
|
||||
error!("Failed to restart unhealthy worker {}: {}", worker_name, e);
|
||||
// Attempt to restart the actor
|
||||
if let Err(e) = self.restart_actor(actor_name).await {
|
||||
error!("Failed to restart unhealthy actor {}: {}", actor_name, e);
|
||||
|
||||
// If restart fails, try a full stop/start cycle
|
||||
warn!("Attempting full stop/start cycle for worker: {}", worker_name);
|
||||
if let Err(e) = self.stop_and_delete_worker(worker_name).await {
|
||||
error!("Failed to stop worker {} during recovery: {}", worker_name, e);
|
||||
warn!("Attempting full stop/start cycle for actor: {}", actor_name);
|
||||
if let Err(e) = self.stop_and_delete_actor(actor_name).await {
|
||||
error!("Failed to stop actor {} during recovery: {}", actor_name, e);
|
||||
}
|
||||
|
||||
if let Err(e) = self.start_worker(worker_config).await {
|
||||
error!("Failed to start worker {} during recovery: {}", worker_name, e);
|
||||
if let Err(e) = self.start_actor(actor_config).await {
|
||||
error!("Failed to start actor {} during recovery: {}", actor_name, e);
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
info!("Successfully recovered worker: {}", worker_name);
|
||||
info!("Successfully recovered actor: {}", actor_name);
|
||||
} else {
|
||||
info!("Successfully restarted worker: {}", worker_name);
|
||||
info!("Successfully restarted actor: {}", actor_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Could not get status for worker {} (may not exist): {}", worker_name, e);
|
||||
warn!("Could not get status for actor {} (may not exist): {}", actor_name, e);
|
||||
|
||||
// Worker doesn't exist, try to start it
|
||||
info!("Attempting to start missing worker: {}", worker_name);
|
||||
if let Err(e) = self.start_worker(worker_config).await {
|
||||
error!("Failed to start missing worker {}: {}", worker_name, e);
|
||||
// Actor doesn't exist, try to start it
|
||||
info!("Attempting to start missing actor: {}", actor_name);
|
||||
if let Err(e) = self.start_actor(actor_config).await {
|
||||
error!("Failed to start missing actor {}: {}", actor_name, e);
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
info!("Successfully started missing worker: {}", worker_name);
|
||||
info!("Successfully started missing actor: {}", actor_name);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -597,18 +586,18 @@ impl Supervisor {
|
||||
job_id: String,
|
||||
script_type: &ScriptType
|
||||
) -> Result<(), SupervisorError> {
|
||||
let worker_queue_key = self.get_worker_queue_key(script_type);
|
||||
let actor_queue_key = self.get_actor_queue_key(script_type);
|
||||
|
||||
// lpush also infers its types, RV is typically i64 (length of list) or () depending on exact command variant
|
||||
// For `redis::AsyncCommands::lpush`, it's `RedisResult<R>` where R: FromRedisValue
|
||||
// Often this is the length of the list. Let's allow inference or specify if needed.
|
||||
let _: redis::RedisResult<i64> =
|
||||
conn.lpush(&worker_queue_key, job_id.clone()).await;
|
||||
conn.lpush(&actor_queue_key, job_id.clone()).await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Internal helper to await response from worker
|
||||
// Internal helper to await response from actor
|
||||
async fn await_response_from_connection(
|
||||
&self,
|
||||
conn: &mut redis::aio::MultiplexedConnection,
|
||||
@@ -679,7 +668,7 @@ impl Supervisor {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// New method using dedicated reply queue with automatic worker selection
|
||||
// New method using dedicated reply queue with automatic actor selection
|
||||
pub async fn run_job_and_await_result(
|
||||
&self,
|
||||
job: &Job
|
||||
@@ -782,7 +771,7 @@ impl Supervisor {
|
||||
pub async fn stop_job(&self, job_id: &str) -> Result<(), SupervisorError> {
|
||||
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
|
||||
|
||||
// Get job details to determine script type and appropriate worker
|
||||
// Get job details to determine script type and appropriate actor
|
||||
let job_key = format!("{}job:{}", NAMESPACE_PREFIX, job_id);
|
||||
let job_data: std::collections::HashMap<String, String> = conn.hgetall(&job_key).await?;
|
||||
|
||||
@@ -798,7 +787,7 @@ impl Supervisor {
|
||||
.map_err(|e| SupervisorError::InvalidInput(format!("Invalid script type: {}", e)))?;
|
||||
|
||||
// Use hardcoded stop queue key for this script type
|
||||
let stop_queue_key = format!("{}stop_queue:{}", NAMESPACE_PREFIX, script_type.worker_queue_suffix());
|
||||
let stop_queue_key = format!("{}stop_queue:{}", NAMESPACE_PREFIX, script_type.actor_queue_suffix());
|
||||
|
||||
// Push job ID to the stop queue
|
||||
conn.lpush::<_, _, ()>(&stop_queue_key, job_id).await?;
|
||||
@@ -931,7 +920,7 @@ impl Supervisor {
|
||||
/// Dispatch jobs that are ready (have all prerequisites completed)
|
||||
pub async fn dispatch_ready_jobs(&self, ready_job_ids: Vec<String>) -> Result<(), SupervisorError> {
|
||||
for job_id in ready_job_ids {
|
||||
// Get job data to determine script type and select worker
|
||||
// Get job data to determine script type and select actor
|
||||
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
|
||||
let job_key = format!("{}job:{}", NAMESPACE_PREFIX, job_id);
|
||||
let job_data: std::collections::HashMap<String, String> = conn.hgetall(&job_key).await?;
|
||||
|
@@ -1,6 +1,6 @@
|
||||
//! Worker lifecycle management functionality for the Hero Supervisor
|
||||
//! Actor lifecycle management functionality for the Hero Supervisor
|
||||
//!
|
||||
//! This module provides worker process lifecycle management using Zinit as the process manager.
|
||||
//! This module provides actor process lifecycle management using Zinit as the process manager.
|
||||
//! All functionality is implemented as methods on the Supervisor struct for a clean API.
|
||||
|
||||
use log::{debug, error, info, warn};
|
||||
@@ -12,28 +12,28 @@ use zinit_client::{Client as ZinitClient, Status};
|
||||
use hero_job::ScriptType;
|
||||
use crate::{Supervisor, SupervisorError};
|
||||
|
||||
/// Information about a worker including its configuration and current status
|
||||
/// Information about an actor including its configuration and current status
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct WorkerInfo {
|
||||
pub config: WorkerConfig,
|
||||
pub struct ActorInfo {
|
||||
pub config: ActorConfig,
|
||||
pub status: Option<Status>,
|
||||
pub is_running: bool,
|
||||
}
|
||||
|
||||
/// Configuration for a worker binary
|
||||
/// Configuration for an actor binary
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct WorkerConfig {
|
||||
/// Name of the worker service
|
||||
pub struct ActorConfig {
|
||||
/// Name of the actor service
|
||||
pub name: String,
|
||||
/// Path to the worker binary
|
||||
/// Path to the actor binary
|
||||
pub binary_path: PathBuf,
|
||||
/// Script type this worker handles
|
||||
/// Script type this actor handles
|
||||
pub script_type: ScriptType,
|
||||
/// Command line arguments for the worker
|
||||
/// Command line arguments for the actor
|
||||
pub args: Vec<String>,
|
||||
/// Environment variables for the worker
|
||||
/// Environment variables for the actor
|
||||
pub env: HashMap<String, String>,
|
||||
/// Whether this worker should restart on exit
|
||||
/// Whether this actor should restart on exit
|
||||
pub restart_on_exit: bool,
|
||||
/// Health check command (optional)
|
||||
pub health_check: Option<String>,
|
||||
@@ -41,7 +41,7 @@ pub struct WorkerConfig {
|
||||
pub dependencies: Vec<String>,
|
||||
}
|
||||
|
||||
impl WorkerConfig {
|
||||
impl ActorConfig {
|
||||
pub fn new(name: String, binary_path: PathBuf, script_type: ScriptType) -> Self {
|
||||
Self {
|
||||
name,
|
||||
@@ -81,122 +81,122 @@ impl WorkerConfig {
|
||||
}
|
||||
}
|
||||
|
||||
/// Worker lifecycle management methods for Supervisor
|
||||
/// Actor lifecycle management methods for Supervisor
|
||||
impl Supervisor {
|
||||
/// Get all workers with their configuration and status - unified method
|
||||
pub async fn get_workers(&self, worker_configs: &[WorkerConfig]) -> Vec<WorkerInfo> {
|
||||
let mut workers = Vec::new();
|
||||
/// Get all actors with their configuration and status - unified method
|
||||
pub async fn get_actors(&self, actor_configs: &[ActorConfig]) -> Vec<ActorInfo> {
|
||||
let mut actors = Vec::new();
|
||||
|
||||
for config in worker_configs {
|
||||
for config in actor_configs {
|
||||
let status = self.zinit_client.status(&config.name).await.ok();
|
||||
let is_running = status.as_ref()
|
||||
.map(|s| s.state == "running" && s.pid > 0)
|
||||
.unwrap_or(false);
|
||||
|
||||
workers.push(WorkerInfo {
|
||||
actors.push(ActorInfo {
|
||||
config: config.clone(),
|
||||
status,
|
||||
is_running,
|
||||
});
|
||||
}
|
||||
|
||||
workers
|
||||
actors
|
||||
}
|
||||
|
||||
/// Start a worker using Zinit
|
||||
pub async fn start_worker(
|
||||
/// Start an actor using Zinit
|
||||
pub async fn start_actor(
|
||||
&self,
|
||||
worker_config: &WorkerConfig,
|
||||
actor_config: &ActorConfig,
|
||||
) -> Result<(), SupervisorError> {
|
||||
info!("Starting worker: {}", worker_config.name);
|
||||
info!("Starting actor: {}", actor_config.name);
|
||||
|
||||
// Create service configuration for Zinit
|
||||
let service_config = self.create_service_config(worker_config);
|
||||
let service_config = self.create_service_config(actor_config);
|
||||
|
||||
// Create the service in Zinit
|
||||
self.zinit_client.create_service(&worker_config.name, service_config).await
|
||||
self.zinit_client.create_service(&actor_config.name, service_config).await
|
||||
.map_err(|e| SupervisorError::ZinitError(format!("Failed to create service: {}", e)))?;
|
||||
|
||||
// Monitor the service so Zinit starts managing it
|
||||
self.zinit_client.monitor(&worker_config.name).await
|
||||
self.zinit_client.monitor(&actor_config.name).await
|
||||
.map_err(|e| SupervisorError::ZinitError(format!("Failed to monitor service: {}", e)))?;
|
||||
|
||||
// Start the service
|
||||
self.zinit_client.start(&worker_config.name).await
|
||||
.map_err(|e| SupervisorError::ZinitError(format!("Failed to start worker: {}", e)))?;
|
||||
self.zinit_client.start(&actor_config.name).await
|
||||
.map_err(|e| SupervisorError::ZinitError(format!("Failed to start actor: {}", e)))?;
|
||||
|
||||
info!("Successfully started worker: {}", worker_config.name);
|
||||
info!("Successfully started actor: {}", actor_config.name);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Stop a worker using Zinit
|
||||
pub async fn stop_worker(
|
||||
/// Stop an actor using Zinit
|
||||
pub async fn stop_actor(
|
||||
&self,
|
||||
worker_name: &str,
|
||||
actor_name: &str,
|
||||
) -> Result<(), SupervisorError> {
|
||||
info!("Stopping worker: {}", worker_name);
|
||||
info!("Stopping actor: {}", actor_name);
|
||||
|
||||
match self.zinit_client.stop(worker_name).await {
|
||||
match self.zinit_client.stop(actor_name).await {
|
||||
Ok(_) => {
|
||||
info!("Successfully stopped worker: {}", worker_name);
|
||||
info!("Successfully stopped actor: {}", actor_name);
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to stop worker {}: {}", worker_name, e);
|
||||
Err(SupervisorError::WorkerStopFailed(worker_name.to_string(), e.to_string()))
|
||||
error!("Failed to stop actor {}: {}", actor_name, e);
|
||||
Err(SupervisorError::ActorStopFailed(actor_name.to_string(), e.to_string()))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Restart a worker using Zinit
|
||||
pub async fn restart_worker(
|
||||
/// Restart an actor using Zinit
|
||||
pub async fn restart_actor(
|
||||
&self,
|
||||
worker_name: &str,
|
||||
actor_name: &str,
|
||||
) -> Result<(), SupervisorError> {
|
||||
info!("Restarting worker: {}", worker_name);
|
||||
info!("Restarting actor: {}", actor_name);
|
||||
|
||||
match self.zinit_client.restart(worker_name).await {
|
||||
match self.zinit_client.restart(actor_name).await {
|
||||
Ok(_) => {
|
||||
info!("Successfully restarted worker: {}", worker_name);
|
||||
info!("Successfully restarted actor: {}", actor_name);
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to restart worker {}: {}", worker_name, e);
|
||||
Err(SupervisorError::WorkerRestartFailed(worker_name.to_string(), e.to_string()))
|
||||
error!("Failed to restart actor {}: {}", actor_name, e);
|
||||
Err(SupervisorError::ActorRestartFailed(actor_name.to_string(), e.to_string()))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get status of a worker using Zinit
|
||||
pub async fn get_worker_status(
|
||||
/// Get status of an actor using Zinit
|
||||
pub async fn get_actor_status(
|
||||
&self,
|
||||
worker_name: &str,
|
||||
actor_name: &str,
|
||||
zinit_client: &ZinitClient,
|
||||
) -> Result<Status, SupervisorError> {
|
||||
match zinit_client.status(worker_name).await {
|
||||
match zinit_client.status(actor_name).await {
|
||||
Ok(status) => Ok(status),
|
||||
Err(e) => {
|
||||
error!("Failed to get status for worker {}: {}", worker_name, e);
|
||||
Err(SupervisorError::WorkerStatusFailed(worker_name.to_string(), e.to_string()))
|
||||
error!("Failed to get status for actor {}: {}", actor_name, e);
|
||||
Err(SupervisorError::ActorStatusFailed(actor_name.to_string(), e.to_string()))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get status of all workers
|
||||
pub async fn get_all_worker_status(
|
||||
/// Get status of all actors
|
||||
pub async fn get_all_actor_status(
|
||||
&self,
|
||||
worker_configs: &[WorkerConfig],
|
||||
actor_configs: &[ActorConfig],
|
||||
zinit_client: &ZinitClient,
|
||||
) -> Result<HashMap<String, Status>, SupervisorError> {
|
||||
let mut status_map = HashMap::new();
|
||||
|
||||
for worker in worker_configs {
|
||||
match zinit_client.status(&worker.name).await {
|
||||
for actor in actor_configs {
|
||||
match zinit_client.status(&actor.name).await {
|
||||
Ok(status) => {
|
||||
status_map.insert(worker.name.clone(), status);
|
||||
status_map.insert(actor.name.clone(), status);
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Failed to get status for worker {}: {}", worker.name, e);
|
||||
warn!("Failed to get status for actor {}: {}", actor.name, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -206,32 +206,32 @@ impl Supervisor {
|
||||
|
||||
|
||||
|
||||
/// Stop multiple workers
|
||||
pub async fn stop_workers(
|
||||
/// Stop multiple actors
|
||||
pub async fn stop_actors(
|
||||
&self,
|
||||
worker_names: &[String],
|
||||
actor_names: &[String],
|
||||
) -> Result<(), SupervisorError> {
|
||||
info!("Stopping {} workers", worker_names.len());
|
||||
info!("Stopping {} actors", actor_names.len());
|
||||
|
||||
for worker_name in worker_names {
|
||||
self.stop_worker(worker_name).await?;
|
||||
for actor_name in actor_names {
|
||||
self.stop_actor(actor_name).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get count of running workers for a script type
|
||||
pub async fn get_running_worker_count(
|
||||
/// Get count of running actors for a script type
|
||||
pub async fn get_running_actor_count(
|
||||
&self,
|
||||
worker_configs: &[WorkerConfig],
|
||||
actor_configs: &[ActorConfig],
|
||||
script_type: &ScriptType,
|
||||
zinit_client: &ZinitClient,
|
||||
) -> usize {
|
||||
let mut running_count = 0;
|
||||
|
||||
for worker in worker_configs {
|
||||
if worker.script_type == *script_type {
|
||||
if let Ok(status) = zinit_client.status(&worker.name).await {
|
||||
for actor in actor_configs {
|
||||
if actor.script_type == *script_type {
|
||||
if let Ok(status) = zinit_client.status(&actor.name).await {
|
||||
if status.state == "running" {
|
||||
running_count += 1;
|
||||
}
|
||||
@@ -242,7 +242,7 @@ impl Supervisor {
|
||||
running_count
|
||||
}
|
||||
|
||||
/// Send a ping job to a worker for health checking
|
||||
/// Send a ping job to an actor for health checking
|
||||
pub async fn send_ping_job(
|
||||
&self,
|
||||
script_type: ScriptType,
|
||||
@@ -268,8 +268,8 @@ impl Supervisor {
|
||||
}
|
||||
}
|
||||
|
||||
/// Create Zinit service configuration from worker config
|
||||
fn create_service_config(&self, worker: &WorkerConfig) -> serde_json::Map<String, serde_json::Value> {
|
||||
/// Create Zinit service configuration from actor config
|
||||
fn create_service_config(&self, actor: &ActorConfig) -> serde_json::Map<String, serde_json::Value> {
|
||||
use serde_json::{Map, Value};
|
||||
|
||||
let mut config = Map::new();
|
||||
@@ -277,117 +277,117 @@ impl Supervisor {
|
||||
config.insert(
|
||||
"exec".to_string(),
|
||||
Value::String(format!("{} {}",
|
||||
worker.binary_path.display(),
|
||||
worker.args.join(" ")
|
||||
actor.binary_path.display(),
|
||||
actor.args.join(" ")
|
||||
))
|
||||
);
|
||||
|
||||
config.insert(
|
||||
"oneshot".to_string(),
|
||||
Value::Bool(!worker.restart_on_exit)
|
||||
Value::Bool(!actor.restart_on_exit)
|
||||
);
|
||||
|
||||
if let Some(health_check) = &worker.health_check {
|
||||
if let Some(health_check) = &actor.health_check {
|
||||
config.insert("test".to_string(), Value::String(health_check.clone()));
|
||||
}
|
||||
|
||||
if !worker.dependencies.is_empty() {
|
||||
config.insert("after".to_string(), json!(worker.dependencies));
|
||||
if !actor.dependencies.is_empty() {
|
||||
config.insert("after".to_string(), json!(actor.dependencies));
|
||||
}
|
||||
|
||||
// Add environment variables if any
|
||||
if !worker.env.is_empty() {
|
||||
config.insert("env".to_string(), json!(worker.env));
|
||||
if !actor.env.is_empty() {
|
||||
config.insert("env".to_string(), json!(actor.env));
|
||||
}
|
||||
|
||||
config
|
||||
}
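To make the mapping concrete, here is an illustration of the zinit service definition that `create_service_config` would produce for an OSIS actor configured as in the demo earlier in this diff. The key names (`exec`, `oneshot`, `env`) match the code above; the concrete values and `restart_on_exit = true` are assumptions, and the optional `test`/`after` keys are omitted because no health check or dependencies were configured:

```rust
use serde_json::json;

fn main() {
    let service = json!({
        // exec = "<binary_path> <args joined by spaces>"
        "exec": "/usr/local/bin/osis_actor",
        // oneshot is the inverse of restart_on_exit, so zinit keeps the actor running
        "oneshot": false,
        "env": {
            "REDIS_URL": "redis://localhost:6379",
            "LOG_LEVEL": "info"
        }
    });
    println!("{}", serde_json::to_string_pretty(&service).unwrap());
}
```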
|
||||
|
||||
/// Launch workers based on SupervisorBuilder configuration
|
||||
pub(crate) async fn launch_configured_workers(&self, builder: &crate::SupervisorBuilderData) -> Result<(), SupervisorError> {
|
||||
/// Launch actors based on SupervisorBuilder configuration
|
||||
pub(crate) async fn launch_configured_actors(&self, builder: &crate::SupervisorBuilderData) -> Result<(), SupervisorError> {
|
||||
use hero_job::ScriptType;
|
||||
use std::path::PathBuf;
|
||||
|
||||
let mut errors = Vec::new();
|
||||
|
||||
// Launch OSIS worker if configured
|
||||
if let Some(binary_path) = &builder.osis_worker {
|
||||
let worker_id = "osis_worker_1";
|
||||
let mut config = WorkerConfig::new(
|
||||
worker_id.to_string(),
|
||||
// Launch OSIS actor if configured
|
||||
if let Some(binary_path) = &builder.osis_actor {
|
||||
let actor_id = "osis_actor_1";
|
||||
let mut config = ActorConfig::new(
|
||||
actor_id.to_string(),
|
||||
PathBuf::from(binary_path),
|
||||
ScriptType::OSIS
|
||||
);
|
||||
config.env.extend(builder.worker_env_vars.clone());
|
||||
config.env.extend(builder.actor_env_vars.clone());
|
||||
|
||||
info!("Launching OSIS worker: {}", worker_id);
|
||||
if let Err(e) = self.start_worker(&config).await {
|
||||
let error_msg = format!("Failed to start OSIS worker: {}", e);
|
||||
info!("Launching OSIS actor: {}", actor_id);
|
||||
if let Err(e) = self.start_actor(&config).await {
|
||||
let error_msg = format!("Failed to start OSIS actor: {}", e);
|
||||
warn!("{}", error_msg);
|
||||
errors.push(error_msg);
|
||||
}
|
||||
}
|
||||
|
||||
// Launch SAL worker if configured
|
||||
if let Some(binary_path) = &builder.sal_worker {
|
||||
let worker_id = "sal_worker_1";
|
||||
let mut config = WorkerConfig::new(
|
||||
worker_id.to_string(),
|
||||
// Launch SAL actor if configured
|
||||
if let Some(binary_path) = &builder.sal_actor {
|
||||
let actor_id = "sal_actor_1";
|
||||
let mut config = ActorConfig::new(
|
||||
actor_id.to_string(),
|
||||
PathBuf::from(binary_path),
|
||||
ScriptType::SAL
|
||||
);
|
||||
config.env.extend(builder.worker_env_vars.clone());
|
||||
config.env.extend(builder.actor_env_vars.clone());
|
||||
|
||||
info!("Launching SAL worker: {}", worker_id);
|
||||
if let Err(e) = self.start_worker(&config).await {
|
||||
let error_msg = format!("Failed to start SAL worker: {}", e);
|
||||
info!("Launching SAL actor: {}", actor_id);
|
||||
if let Err(e) = self.start_actor(&config).await {
|
||||
let error_msg = format!("Failed to start SAL actor: {}", e);
|
||||
warn!("{}", error_msg);
|
||||
errors.push(error_msg);
|
||||
}
|
||||
}
|
||||
|
||||
// Launch V worker if configured
|
||||
if let Some(binary_path) = &builder.v_worker {
|
||||
let worker_id = "v_worker_1";
|
||||
let mut config = WorkerConfig::new(
|
||||
worker_id.to_string(),
|
||||
// Launch V actor if configured
|
||||
if let Some(binary_path) = &builder.v_actor {
|
||||
let actor_id = "v_actor_1";
|
||||
let mut config = ActorConfig::new(
|
||||
actor_id.to_string(),
|
||||
PathBuf::from(binary_path),
|
||||
ScriptType::V
|
||||
);
|
||||
config.env.extend(builder.worker_env_vars.clone());
|
||||
config.env.extend(builder.actor_env_vars.clone());
|
||||
|
||||
info!("Launching V worker: {}", worker_id);
|
||||
if let Err(e) = self.start_worker(&config).await {
|
||||
let error_msg = format!("Failed to start V worker: {}", e);
|
||||
info!("Launching V actor: {}", actor_id);
|
||||
if let Err(e) = self.start_actor(&config).await {
|
||||
let error_msg = format!("Failed to start V actor: {}", e);
|
||||
warn!("{}", error_msg);
|
||||
errors.push(error_msg);
|
||||
}
|
||||
}
|
||||
|
||||
// Launch Python worker if configured
|
||||
if let Some(binary_path) = &builder.python_worker {
|
||||
let worker_id = "python_worker_1";
|
||||
let mut config = WorkerConfig::new(
|
||||
worker_id.to_string(),
|
||||
// Launch Python actor if configured
|
||||
if let Some(binary_path) = &builder.python_actor {
|
||||
let actor_id = "python_actor_1";
|
||||
let mut config = ActorConfig::new(
|
||||
actor_id.to_string(),
|
||||
PathBuf::from(binary_path),
|
||||
ScriptType::Python
|
||||
);
|
||||
config.env.extend(builder.worker_env_vars.clone());
|
||||
config.env.extend(builder.actor_env_vars.clone());
|
||||
|
||||
info!("Launching Python worker: {}", worker_id);
|
||||
if let Err(e) = self.start_worker(&config).await {
|
||||
let error_msg = format!("Failed to start Python worker: {}", e);
|
||||
info!("Launching Python actor: {}", actor_id);
|
||||
if let Err(e) = self.start_actor(&config).await {
|
||||
let error_msg = format!("Failed to start Python actor: {}", e);
|
||||
warn!("{}", error_msg);
|
||||
errors.push(error_msg);
|
||||
}
|
||||
}
|
||||
|
||||
// Return result based on whether any workers started successfully
|
||||
// Return result based on whether any actors started successfully
|
||||
if errors.is_empty() {
|
||||
info!("All configured workers started successfully");
|
||||
info!("All configured actors started successfully");
|
||||
Ok(())
|
||||
} else {
|
||||
let combined_error = format!("Some workers failed to start: {}", errors.join("; "));
|
||||
let combined_error = format!("Some actors failed to start: {}", errors.join("; "));
|
||||
warn!("{}", combined_error);
|
||||
Err(SupervisorError::ZinitError(combined_error))
|
||||
}
|
||||
|