rename worker to actor

This commit is contained in:
Timur Gordon
2025-08-05 15:44:33 +02:00
parent 5283f383b3
commit 89e953ca1d
67 changed files with 1629 additions and 1737 deletions

View File

@@ -1,20 +1,20 @@
# Worker Lifecycle Management
# Actor Lifecycle Management
The Hero Supervisor includes comprehensive worker lifecycle management functionality using [Zinit](https://github.com/threefoldtech/zinit) as the process manager. This enables the supervisor to manage worker processes, perform health monitoring, and implement load balancing.
The Hero Supervisor includes comprehensive actor lifecycle management functionality using [Zinit](https://github.com/threefoldtech/zinit) as the process manager. This enables the supervisor to manage actor processes, perform health monitoring, and implement load balancing.
## Overview
The lifecycle management system provides:
- **Worker Process Management**: Start, stop, restart, and monitor worker binaries
- **Health Monitoring**: Automatic ping jobs every 10 minutes for idle workers
- **Graceful Shutdown**: Clean termination of worker processes
- **Actor Process Management**: Start, stop, restart, and monitor actor binaries
- **Health Monitoring**: Automatic ping jobs every 10 minutes for idle actors
- **Graceful Shutdown**: Clean termination of actor processes
## Architecture
```
┌─────────────────┐ ┌──────────────────┐ ┌─────────────────┐
│ Supervisor │ │ WorkerLifecycle │ │ Zinit │
│ Supervisor │ │ ActorLifecycle │ │ Zinit │
│ │◄──►│ Manager │◄──►│ (Process │
│ (Job Dispatch) │ │ │ │ Manager) │
└─────────────────┘ └──────────────────┘ └─────────────────┘
@@ -22,49 +22,49 @@ The lifecycle management system provides:
│ │ │
▼ ▼ ▼
┌─────────────────┐ ┌──────────────────┐ ┌─────────────────┐
│ Redis │ │ Health Monitor │ │ Worker Binaries │
│ Redis │ │ Health Monitor │ │ Actor Binaries │
│ (Job Queue) │ │ (Ping Jobs) │ │ (OSIS/SAL/V) │
└─────────────────┘ └──────────────────┘ └─────────────────┘
```
## Components
### WorkerConfig
### ActorConfig
Defines configuration for a worker binary:
Defines configuration for an actor binary:
```rust
use hero_supervisor::{WorkerConfig, ScriptType};
use hero_supervisor::{ActorConfig, ScriptType};
use std::path::PathBuf;
use std::collections::HashMap;
let config = WorkerConfig::new(
"osis_worker_0".to_string(),
PathBuf::from("/usr/local/bin/osis_worker"),
let config = ActorConfig::new(
"osis_actor_0".to_string(),
PathBuf::from("/usr/local/bin/osis_actor"),
ScriptType::OSIS,
)
.with_args(vec![
"--redis-url".to_string(),
"redis://localhost:6379".to_string(),
"--worker-id".to_string(),
"osis_worker_0".to_string(),
"--actor-id".to_string(),
"osis_actor_0".to_string(),
])
.with_env({
let mut env = HashMap::new();
env.insert("RUST_LOG".to_string(), "info".to_string());
env.insert("WORKER_TYPE".to_string(), "osis".to_string());
env.insert("ACTOR_TYPE".to_string(), "osis".to_string());
env
})
.with_health_check("/usr/local/bin/osis_worker --health-check".to_string())
.with_health_check("/usr/local/bin/osis_actor --health-check".to_string())
.with_dependencies(vec!["redis".to_string()]);
```
### WorkerLifecycleManager
### ActorLifecycleManager
Main component for managing worker lifecycles:
Main component for managing actor lifecycles:
```rust
use hero_supervisor::{WorkerLifecycleManagerBuilder, Supervisor};
use hero_supervisor::{ActorLifecycleManagerBuilder, Supervisor};
let supervisor = SupervisorBuilder::new()
.redis_url("redis://localhost:6379")
@@ -72,11 +72,11 @@ let supervisor = SupervisorBuilder::new()
.context_id("production")
.build()?;
let mut lifecycle_manager = WorkerLifecycleManagerBuilder::new("/var/run/zinit.sock".to_string())
let mut lifecycle_manager = ActorLifecycleManagerBuilder::new("/var/run/zinit.sock".to_string())
.with_supervisor(supervisor.clone())
.add_worker(osis_worker_config)
.add_worker(sal_worker_config)
.add_worker(v_worker_config)
.add_actor(osis_actor_config)
.add_actor(sal_actor_config)
.add_actor(v_actor_config)
.build();
```
@@ -84,45 +84,45 @@ let mut lifecycle_manager = WorkerLifecycleManagerBuilder::new("/var/run/zinit.s
The lifecycle manager supports all Hero script types:
- **OSIS**: Rhai/HeroScript execution workers
- **SAL**: System Abstraction Layer workers
- **OSIS**: Rhai/HeroScript execution actors
- **SAL**: System Abstraction Layer actors
- **V**: HeroScript execution in V language
- **Python**: HeroScript execution in Python
## Key Features
### 1. Worker Management
### 1. Actor Management
```rust
// Start all configured workers
lifecycle_manager.start_all_workers().await?;
// Start all configured actors
lifecycle_manager.start_all_actors().await?;
// Stop all workers
lifecycle_manager.stop_all_workers().await?;
// Stop all actors
lifecycle_manager.stop_all_actors().await?;
// Restart specific worker
lifecycle_manager.restart_worker("osis_worker_0").await?;
// Restart specific actor
lifecycle_manager.restart_actor("osis_actor_0").await?;
// Get worker status
let status = lifecycle_manager.get_worker_status("osis_worker_0").await?;
println!("Worker state: {:?}, PID: {}", status.state, status.pid);
// Get actor status
let status = lifecycle_manager.get_actor_status("osis_actor_0").await?;
println!("Actor state: {:?}, PID: {}", status.state, status.pid);
```
### 2. Health Monitoring
The system automatically monitors worker health:
The system automatically monitors actor health:
- Tracks last job execution time for each worker
- Sends ping jobs to workers idle for 10+ minutes
- Restarts workers that fail ping checks 3 times
- Updates job times when workers receive tasks
- Tracks last job execution time for each actor
- Sends ping jobs to actors idle for 10+ minutes
- Restarts actors that fail ping checks 3 times
- Updates job times when actors receive tasks
```rust
// Manual health check
lifecycle_manager.monitor_worker_health().await?;
lifecycle_manager.monitor_actor_health().await?;
// Update job time (called automatically by supervisor)
lifecycle_manager.update_worker_job_time("osis_worker_0");
lifecycle_manager.update_actor_job_time("osis_actor_0");
// Start continuous health monitoring
lifecycle_manager.start_health_monitoring().await; // Runs forever
@@ -130,26 +130,26 @@ lifecycle_manager.start_health_monitoring().await; // Runs forever
### 3. Dynamic Scaling
Scale workers up or down based on demand:
Scale actors up or down based on demand:
```rust
// Scale OSIS workers to 5 instances
lifecycle_manager.scale_workers(&ScriptType::OSIS, 5).await?;
// Scale OSIS actors to 5 instances
lifecycle_manager.scale_actors(&ScriptType::OSIS, 5).await?;
// Scale down SAL workers to 1 instance
lifecycle_manager.scale_workers(&ScriptType::SAL, 1).await?;
// Scale down SAL actors to 1 instance
lifecycle_manager.scale_actors(&ScriptType::SAL, 1).await?;
// Check current running count
let count = lifecycle_manager.get_running_worker_count(&ScriptType::V).await;
println!("Running V workers: {}", count);
let count = lifecycle_manager.get_running_actor_count(&ScriptType::V).await;
println!("Running V actors: {}", count);
```
### 4. Service Dependencies
Workers can depend on other services:
Actors can depend on other services:
```rust
let config = WorkerConfig::new(name, binary, script_type)
let config = ActorConfig::new(name, binary, script_type)
.with_dependencies(vec![
"redis".to_string(),
"database".to_string(),
@@ -157,25 +157,25 @@ let config = WorkerConfig::new(name, binary, script_type)
]);
```
Zinit ensures dependencies start before the worker.
Zinit ensures dependencies start before the actor.
## Integration with Supervisor
The lifecycle manager integrates seamlessly with the supervisor:
```rust
use hero_supervisor::{Supervisor, WorkerLifecycleManager};
use hero_supervisor::{Supervisor, ActorLifecycleManager};
// Create supervisor and lifecycle manager
let supervisor = SupervisorBuilder::new().build()?;
let mut lifecycle_manager = WorkerLifecycleManagerBuilder::new(zinit_socket)
let mut lifecycle_manager = ActorLifecycleManagerBuilder::new(zinit_socket)
.with_supervisor(supervisor.clone())
.build();
// Start workers
lifecycle_manager.start_all_workers().await?;
// Start actors
lifecycle_manager.start_all_actors().await?;
// Create and execute jobs (supervisor automatically routes to workers)
// Create and execute jobs (supervisor automatically routes to actors)
let job = supervisor
.new_job()
.script_type(ScriptType::OSIS)
@@ -191,15 +191,15 @@ println!("Job result: {}", result);
The lifecycle manager automatically creates Zinit service configurations:
```yaml
# Generated service config for osis_worker_0
exec: "/usr/local/bin/osis_worker --redis-url redis://localhost:6379 --worker-id osis_worker_0"
test: "/usr/local/bin/osis_worker --health-check"
# Generated service config for osis_actor_0
exec: "/usr/local/bin/osis_actor --redis-url redis://localhost:6379 --actor-id osis_actor_0"
test: "/usr/local/bin/osis_actor --health-check"
oneshot: false # Restart on exit
after:
- redis
env:
RUST_LOG: "info"
WORKER_TYPE: "osis"
ACTOR_TYPE: "osis"
```
## Error Handling
@@ -209,10 +209,10 @@ The system provides comprehensive error handling:
```rust
use hero_supervisor::SupervisorError;
match lifecycle_manager.start_worker(&config).await {
Ok(_) => println!("Worker started successfully"),
Err(SupervisorError::WorkerStartFailed(worker, reason)) => {
eprintln!("Failed to start {}: {}", worker, reason);
match lifecycle_manager.start_actor(&config).await {
Ok(_) => println!("Actor started successfully"),
Err(SupervisorError::ActorStartFailed(actor, reason)) => {
eprintln!("Failed to start {}: {}", actor, reason);
}
Err(e) => eprintln!("Other error: {}", e),
}
@@ -243,11 +243,11 @@ REDIS_URL=redis://localhost:6379 cargo run --example lifecycle_demo
redis-server
```
3. **Worker Binaries**: Compiled worker binaries for each script type
- `/usr/local/bin/osis_worker`
- `/usr/local/bin/sal_worker`
- `/usr/local/bin/v_worker`
- `/usr/local/bin/python_worker`
3. **Actor Binaries**: Compiled actor binaries for each script type
- `/usr/local/bin/osis_actor`
- `/usr/local/bin/sal_actor`
- `/usr/local/bin/v_actor`
- `/usr/local/bin/python_actor`
## Configuration Best Practices
@@ -267,15 +267,15 @@ REDIS_URL=redis://localhost:6379 cargo run --example lifecycle_demo
- Check socket permissions: `ls -la /var/run/zinit.sock`
- Verify socket path in configuration
2. **Worker Start Failed**
2. **Actor Start Failed**
- Check binary exists and is executable
- Verify dependencies are running
- Review Zinit logs: `zinit logs <service-name>`
3. **Health Check Failures**
- Implement proper health check endpoint in workers
- Implement proper health check endpoint in actors
- Verify health check command syntax
- Check worker responsiveness
- Check actor responsiveness
4. **Redis Connection Issues**
- Ensure Redis is running and accessible
@@ -289,10 +289,10 @@ REDIS_URL=redis://localhost:6379 cargo run --example lifecycle_demo
zinit list
# View service logs
zinit logs osis_worker_0
zinit logs osis_actor_0
# Check service status
zinit status osis_worker_0
zinit status osis_actor_0
# Monitor Redis queues
redis-cli keys "hero:job:*"
@@ -300,20 +300,20 @@ redis-cli keys "hero:job:*"
## Performance Considerations
- **Scaling**: Start with minimal workers and scale based on queue depth
- **Scaling**: Start with minimal actors and scale based on queue depth
- **Health Monitoring**: Adjust ping intervals based on workload patterns
- **Resource Usage**: Monitor CPU/memory usage of worker processes
- **Resource Usage**: Monitor CPU/memory usage of actor processes
- **Queue Depth**: Monitor Redis queue lengths for scaling decisions
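As a concrete illustration of the queue-depth point above, here is a hedged sketch of a scaling check. The threshold, queue name, and target counts are illustrative and not part of the supervisor API; it assumes the `redis` crate alongside the lifecycle manager shown earlier.

```rust
use hero_supervisor::{ActorLifecycleManager, ScriptType};
use redis::AsyncCommands;

// Hypothetical policy: scale OSIS actors based on the depth of one work queue.
async fn scale_on_queue_depth(
    lifecycle_manager: &mut ActorLifecycleManager,
    redis_client: &redis::Client,
) -> redis::RedisResult<()> {
    let mut conn = redis_client.get_multiplexed_async_connection().await?;
    let depth: usize = conn.llen("hero:work_queue:osis_actor_0").await?;

    // Illustrative thresholds: more than 50 queued jobs -> 5 actors, otherwise 2.
    let target = if depth > 50 { 5 } else { 2 };
    if let Err(e) = lifecycle_manager.scale_actors(&ScriptType::OSIS, target).await {
        eprintln!("scaling failed: {e}");
    }
    Ok(())
}
```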
## Security
- **Process Isolation**: Zinit provides process isolation
- **User Permissions**: Run workers with appropriate user permissions
- **User Permissions**: Run actors with appropriate user permissions
- **Network Security**: Secure Redis and Zinit socket access
- **Binary Validation**: Verify worker binary integrity before deployment
- **Binary Validation**: Verify actor binary integrity before deployment
## Future
- **Load Balancing**: Dynamic scaling of workers based on demand
- **Load Balancing**: Dynamic scaling of actors based on demand
- **Service Dependencies**: Proper startup ordering with dependency management

View File

@@ -1,60 +1,60 @@
# Hero Supervisor
The **Hero Supervisor** is responsible for supervising the lifecycle of workers and dispatching jobs to them via Redis queues.
The **Hero Supervisor** is responsible for supervising the lifecycle of actors and dispatching jobs to them via Redis queues.
## Overview
The system involves four primary actors:
1. **OSIS**: A worker that executes Rhai and HeroScript.
2. **SAL**: A worker that performs system abstraction layer functionalities using Rhai.
3. **V**: A worker that executes HeroScript in the V programming language.
4. **Python**: A worker that executes HeroScript in Python.
1. **OSIS**: An actor that executes Rhai and HeroScript.
2. **SAL**: An actor that performs system abstraction layer functionalities using Rhai.
3. **V**: An actor that executes HeroScript in the V programming language.
4. **Python**: An actor that executes HeroScript in Python.
The Supervisor utilizes **zinit** to start and monitor these workers, ensuring they are running correctly.
The Supervisor utilizes **zinit** to start and monitor these actors, ensuring they are running correctly.
### Key Features
- **Worker Lifecycle Supervision**: Oversee the lifecycle of workers, including starting, stopping, restarting, and load balancing based on job demand.
- **Job Supervision**: API for efficiently managing jobs dispatched to workers over Redis queues.
- **Actor Lifecycle Supervision**: Oversee the lifecycle of actors, including starting, stopping, restarting, and load balancing based on job demand.
- **Job Supervision**: API for efficiently managing jobs dispatched to actors over Redis queues.
## Worker Lifecycle Supervision
## Actor Lifecycle Supervision
The Supervisor oversees the lifecycle of the workers, ensuring they are operational and efficiently allocated. Load balancing is implemented to dynamically adjust the number of active workers based on job demand.
The Supervisor oversees the lifecycle of the actors, ensuring they are operational and efficiently allocated. Load balancing is implemented to dynamically adjust the number of active actors based on job demand.
Additionally, the Supervisor implements health monitoring for worker engines: if a worker engine does not receive a job within 10 minutes, the Supervisor sends a ping job. The engine must respond immediately; if it fails to do so, the Supervisor restarts the requested job engine.
Additionally, the Supervisor implements health monitoring for actor engines: if an actor engine does not receive a job within 10 minutes, the Supervisor sends a ping job. The engine must respond immediately; if it fails to do so, the Supervisor restarts that actor engine.
### Prerequisites
**Important**: Before running any lifecycle examples or using worker management features, you must start the Zinit daemon:
**Important**: Before running any lifecycle examples or using actor management features, you must start the Zinit daemon:
```bash
# Start Zinit daemon (required for worker lifecycle management)
# Start Zinit daemon (required for actor lifecycle management)
sudo zinit init
# Or start Zinit with a custom socket path
sudo zinit --socket /var/run/zinit.sock init
```
**Note**: The Supervisor uses Zinit as the process manager for worker lifecycle operations. The default socket path is `/var/run/zinit.sock`, but you can configure a custom path using the `SupervisorBuilder::zinit_socket_path()` method.
**Note**: The Supervisor uses Zinit as the process manager for actor lifecycle operations. The default socket path is `/var/run/zinit.sock`, but you can configure a custom path using the `SupervisorBuilder::zinit_socket_path()` method.
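A minimal sketch of wiring a custom socket path into the builder, assuming the fluent builder style used in the examples below (the exact `build()` signature may differ between examples):

```rust
use hero_supervisor::SupervisorBuilder;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Point the supervisor at a non-default Zinit socket; the path is
    // illustrative and should match your `zinit --socket ...` invocation.
    let _supervisor = SupervisorBuilder::new()
        .redis_url("redis://localhost:6379")
        .zinit_socket_path("/var/run/zinit.sock")
        .build()
        .await?;
    Ok(())
}
```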
**Troubleshooting**: If you get connection errors when running examples, ensure:
1. Zinit daemon is running (`zinit list` should work)
2. The socket path matches between Zinit and your Supervisor configuration
3. You have appropriate permissions to access the Zinit socket
### Supervisor API for Worker Lifecycle
### Supervisor API for Actor Lifecycle
The Supervisor provides the following methods for supervising the worker lifecycle:
The Supervisor provides the following methods for supervising the actor lifecycle:
- **`start_worker()`**: Initializes and starts a specified worker.
- **`stop_worker()`**: Gracefully stops a specified worker.
- **`restart_worker()`**: Restarts a specified worker to ensure it operates correctly.
- **`get_worker_status()`**: Checks the status of a specific worker.
- **`start_actor()`**: Initializes and starts a specified actor.
- **`stop_actor()`**: Gracefully stops a specified actor.
- **`restart_actor()`**: Restarts a specified actor to ensure it operates correctly.
- **`get_actor_status()`**: Checks the status of a specific actor.
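A hedged sketch of calling this API from an async context. The builder chain and the shape of the returned status (`state`, `pid`) follow the other examples in these docs; exact signatures may differ, and your builder may require additional fields such as `caller_id` and `context_id`.

```rust
use hero_supervisor::SupervisorBuilder;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let supervisor = SupervisorBuilder::new()
        .redis_url("redis://localhost:6379")
        .build()
        .await?;

    // Restart a named actor and report the outcome ("osis_actor_0" is an example name).
    if let Err(e) = supervisor.restart_actor("osis_actor_0").await {
        eprintln!("restart failed: {e}");
    }

    // Query its status; the returned fields are assumed to match the
    // lifecycle examples elsewhere in these docs (state, pid).
    match supervisor.get_actor_status("osis_actor_0").await {
        Ok(status) => println!("state: {:?}, pid: {}", status.state, status.pid),
        Err(e) => eprintln!("status check failed: {e}"),
    }

    // Gracefully stop it when done.
    if let Err(e) = supervisor.stop_actor("osis_actor_0").await {
        eprintln!("stop failed: {e}");
    }
    Ok(())
}
```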
## Job Supervision
Jobs are dispatched to workers through their designated Redis queues, and the Supervisor provides an API for comprehensive job supervision.
Jobs are dispatched to actors through their designated Redis queues, and the Supervisor provides an API for comprehensive job supervision.
### Supervisor API for Job Supervision
@@ -95,9 +95,9 @@ You can modify these in the example source code if your setup differs.
Jobs are managed within the `hero:` namespace in Redis:
- **`hero:job:{job_id}`**: Stores job parameters as a Redis hash.
- **`hero:work_queue:{worker_id}`**: Contains worker-specific job queues for dispatching jobs.
- **`hero:work_queue:{actor_id}`**: Contains actor-specific job queues for dispatching jobs.
- **`hero:reply:{job_id}`**: Dedicated queues for job results.
## Prerequisites
- A Redis server must be accessible to both the Supervisor and the workers.
- A Redis server must be accessible to both the Supervisor and the actors.

View File

@@ -1,10 +1,10 @@
# Hero Supervisor Protocol
This document describes the Redis-based protocol used by the Hero Supervisor for job management and worker communication.
This document describes the Redis-based protocol used by the Hero Supervisor for job management and actor communication.
## Overview
The Hero Supervisor uses Redis as a message broker and data store for managing distributed job execution. Jobs are stored as Redis hashes, and communication with workers happens through Redis lists (queues).
The Hero Supervisor uses Redis as a message broker and data store for managing distributed job execution. Jobs are stored as Redis hashes, and communication with actors happens through Redis lists (queues).
## Redis Namespace
@@ -22,7 +22,7 @@ hero:job:{job_id}
**Job Hash Fields:**
- `id`: Unique job identifier (UUID v4)
- `caller_id`: Identifier of the client that created the job
- `worker_id`: Target worker identifier
- `actor_id`: Target actor identifier
- `context_id`: Execution context identifier
- `script`: Script content to execute (Rhai or HeroScript)
- `timeout`: Execution timeout in seconds
@@ -35,8 +35,8 @@ hero:job:{job_id}
- `env_vars`: Environment variables as JSON object (optional)
- `prerequisites`: JSON array of job IDs that must complete before this job (optional)
- `dependents`: JSON array of job IDs that depend on this job completing (optional)
- `output`: Job execution result (set by worker)
- `error`: Error message if job failed (set by worker)
- `output`: Job execution result (set by actor)
- `error`: Error message if job failed (set by actor)
- `dependencies`: List of job IDs that this job depends on
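As an illustration, a stored job hash can be inspected directly with the `redis` crate. This is a hedged sketch: the job id is a placeholder and only a few of the fields listed above are printed.

```rust
use redis::AsyncCommands;
use std::collections::HashMap;

// Read a job hash and print a handful of its fields.
async fn inspect_job(job_id: &str) -> redis::RedisResult<()> {
    let client = redis::Client::open("redis://127.0.0.1:6379/")?;
    let mut conn = client.get_multiplexed_async_connection().await?;

    let fields: HashMap<String, String> = conn.hgetall(format!("hero:job:{job_id}")).await?;
    println!(
        "status={:?} actor={:?} output={:?}",
        fields.get("status"),
        fields.get("actor_id"),
        fields.get("output"),
    );
    Ok(())
}
```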
### Job Dependencies
@@ -47,19 +47,19 @@ Jobs can have dependencies on other jobs, which are stored in the `dependencies`
Jobs are queued for execution using Redis lists:
```
hero:work_queue:{worker_id}
hero:work_queue:{actor_id}
```
Workers listen on their specific queue using `BLPOP` for job IDs to process.
Actors listen on their specific queue using `BLPOP` for job IDs to process.
### Stop Queues
Job stop requests are sent through dedicated stop queues:
```
hero:stop_queue:{worker_id}
hero:stop_queue:{actor_id}
```
Workers monitor these queues to receive stop requests for running jobs.
Actors monitor these queues to receive stop requests for running jobs.
### Reply Queues
@@ -68,7 +68,7 @@ For synchronous job execution, dedicated reply queues are used:
hero:reply:{job_id}
```
Workers send results to these queues when jobs complete.
Actors send results to these queues when jobs complete.
## Job Lifecycle
@@ -79,20 +79,20 @@ Client -> Redis: HSET hero:job:{job_id} {job_fields}
### 2. Job Submission
```
Client -> Redis: LPUSH hero:work_queue:{worker_id} {job_id}
Client -> Redis: LPUSH hero:work_queue:{actor_id} {job_id}
```
### 3. Job Processing
```
Worker -> Redis: BLPOP hero:work_queue:{worker_id}
Worker -> Redis: HSET hero:job:{job_id} status "started"
Worker: Execute script
Worker -> Redis: HSET hero:job:{job_id} status "finished" output "{result}"
Actor -> Redis: BLPOP hero:work_queue:{actor_id}
Actor -> Redis: HSET hero:job:{job_id} status "started"
Actor: Execute script
Actor -> Redis: HSET hero:job:{job_id} status "finished" output "{result}"
```
### 4. Job Completion (Async)
```
Worker -> Redis: LPUSH hero:reply:{job_id} {result}
Actor -> Redis: LPUSH hero:reply:{job_id} {result}
```
## API Operations
@@ -110,7 +110,7 @@ supervisor.list_jobs() -> Vec<String>
supervisor.stop_job(job_id) -> Result<(), SupervisorError>
```
**Redis Operations:**
- `LPUSH hero:stop_queue:{worker_id} {job_id}` - Send stop request
- `LPUSH hero:stop_queue:{actor_id} {job_id}` - Send stop request
### Get Job Status
```rust
@@ -131,20 +131,20 @@ supervisor.get_job_logs(job_id) -> Result<Option<String>, SupervisorError>
### Run Job and Await Result
```rust
supervisor.run_job_and_await_result(job, worker_id) -> Result<String, SupervisorError>
supervisor.run_job_and_await_result(job, actor_id) -> Result<String, SupervisorError>
```
**Redis Operations:**
1. `HSET hero:job:{job_id} {job_fields}` - Store job
2. `LPUSH hero:work_queue:{worker_id} {job_id}` - Submit job
2. `LPUSH hero:work_queue:{actor_id} {job_id}` - Submit job
3. `BLPOP hero:reply:{job_id} {timeout}` - Wait for result
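For reference, a hedged sketch of performing those three operations directly with the `redis` crate rather than through the supervisor API. The job fields written here are illustrative and not the full field set listed earlier.

```rust
use redis::AsyncCommands;

// Store a job hash, queue it for an actor, and block on the reply queue.
async fn run_job_and_await_result_raw(
    conn: &mut redis::aio::MultiplexedConnection,
    job_id: &str,
    actor_id: &str,
    script: &str,
) -> redis::RedisResult<Option<String>> {
    // 1. Store the job hash (only a subset of fields for illustration).
    let _: () = conn
        .hset_multiple(
            format!("hero:job:{job_id}"),
            &[("id", job_id), ("script", script), ("status", "dispatched")],
        )
        .await?;

    // 2. Submit the job id to the actor's work queue.
    let _: () = conn.lpush(format!("hero:work_queue:{actor_id}"), job_id).await?;

    // 3. Block on the reply queue with a 30 second timeout (None on timeout).
    let reply: Option<(String, String)> = redis::cmd("BLPOP")
        .arg(format!("hero:reply:{job_id}"))
        .arg(30)
        .query_async(conn)
        .await?;
    Ok(reply.map(|(_queue, result)| result))
}
```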
## Worker Protocol
## Actor Protocol
### Job Processing Loop
```rust
loop {
// 1. Wait for job
job_id = BLPOP hero:work_queue:{worker_id}
job_id = BLPOP hero:work_queue:{actor_id}
// 2. Get job details
job_data = HGETALL hero:job:{job_id}
@@ -153,8 +153,8 @@ loop {
HSET hero:job:{job_id} status "started"
// 4. Check for stop requests
if LLEN hero:stop_queue:{worker_id} > 0 {
stop_job_id = LPOP hero:stop_queue:{worker_id}
if LLEN hero:stop_queue:{actor_id} > 0 {
stop_job_id = LPOP hero:stop_queue:{actor_id}
if stop_job_id == job_id {
HSET hero:job:{job_id} status "error" error "stopped"
continue
@@ -175,15 +175,15 @@ loop {
```
### Stop Request Handling
Workers should periodically check the stop queue during long-running jobs:
Actors should periodically check the stop queue during long-running jobs:
```rust
if LLEN hero:stop_queue:{worker_id} > 0 {
stop_requests = LRANGE hero:stop_queue:{worker_id} 0 -1
if LLEN hero:stop_queue:{actor_id} > 0 {
stop_requests = LRANGE hero:stop_queue:{actor_id} 0 -1
if stop_requests.contains(current_job_id) {
// Stop current job execution
HSET hero:job:{current_job_id} status "error" error "stopped_by_request"
// Remove stop request
LREM hero:stop_queue:{worker_id} 1 current_job_id
LREM hero:stop_queue:{actor_id} 1 current_job_id
return
}
}
@@ -193,17 +193,17 @@ if LLEN hero:stop_queue:{worker_id} > 0 {
### Job Timeouts
- Client sets timeout when creating job
- Worker should respect timeout and stop execution
- Actor should respect timeout and stop execution
- If timeout exceeded: `HSET hero:job:{job_id} status "error" error "timeout"`
### Worker Failures
- If worker crashes, job remains in "started" status
### Actor Failures
- If actor crashes, job remains in "started" status
- Monitoring systems can detect stale jobs and retry
- Jobs can be requeued: `LPUSH hero:work_queue:{worker_id} {job_id}`
- Jobs can be requeued: `LPUSH hero:work_queue:{actor_id} {job_id}`
### Redis Connection Issues
- Clients should implement retry logic with exponential backoff
- Workers should reconnect and resume processing
- Actors should reconnect and resume processing
- Use Redis persistence to survive Redis restarts
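A minimal sketch of the retry-with-exponential-backoff idea for clients, assuming the `redis` and `tokio` crates; the bounds and delays are illustrative.

```rust
use std::time::Duration;
use tokio::time::sleep;

// Keep trying to connect, doubling the delay up to a 30 second cap.
async fn connect_with_backoff(url: &str) -> redis::RedisResult<redis::aio::MultiplexedConnection> {
    let client = redis::Client::open(url)?;
    let mut delay = Duration::from_millis(250);
    loop {
        match client.get_multiplexed_async_connection().await {
            Ok(conn) => return Ok(conn),
            Err(e) => {
                eprintln!("redis connect failed: {e}, retrying in {delay:?}");
                sleep(delay).await;
                delay = (delay * 2).min(Duration::from_secs(30));
            }
        }
    }
}
```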
## Monitoring and Observability
@@ -211,10 +211,10 @@ if LLEN hero:stop_queue:{worker_id} > 0 {
### Queue Monitoring
```bash
# Check work queue length
LLEN hero:work_queue:{worker_id}
LLEN hero:work_queue:{actor_id}
# Check stop queue length
LLEN hero:stop_queue:{worker_id}
LLEN hero:stop_queue:{actor_id}
# List all jobs
KEYS hero:job:*
@@ -228,7 +228,7 @@ HGETALL hero:job:{job_id}
- Jobs completed per second
- Average job execution time
- Queue depths
- Worker availability
- Actor availability
- Error rates by job type
## Security Considerations
@@ -237,7 +237,7 @@ HGETALL hero:job:{job_id}
- Use Redis AUTH for authentication
- Enable TLS for Redis connections
- Restrict Redis network access
- Use Redis ACLs to limit worker permissions
- Use Redis ACLs to limit actor permissions
### Job Security
- Validate script content before execution
@@ -265,8 +265,8 @@ HGETALL hero:job:{job_id}
- Batch similar jobs when possible
- Implement job prioritization if needed
### Worker Optimization
- Pool worker connections to Redis
### Actor Optimization
- Pool actor connections to Redis
- Use async I/O for Redis operations
- Implement graceful shutdown handling
- Monitor worker resource usage
- Monitor actor resource usage

View File

@@ -1,6 +1,6 @@
# Hero Supervisor CLI Example
This example demonstrates how to use the `hive-supervisor` CLI tool for managing workers and jobs in the Hero ecosystem.
This example demonstrates how to use the `hive-supervisor` CLI tool for managing actors and jobs in the Hero ecosystem.
## Prerequisites
@@ -19,20 +19,20 @@ This example demonstrates how to use the `hive-supervisor` CLI tool for managing
# Follow Zinit installation instructions for your platform
```
3. **Worker Binaries**: The configuration references worker binaries that need to be available:
- `/usr/local/bin/osis_worker`
- `/usr/local/bin/sal_worker`
- `/usr/local/bin/v_worker`
- `/usr/local/bin/python_worker`
3. **Actor Binaries**: The configuration references actor binaries that need to be available:
- `/usr/local/bin/osis_actor`
- `/usr/local/bin/sal_actor`
- `/usr/local/bin/v_actor`
- `/usr/local/bin/python_actor`
For testing purposes, you can create mock worker binaries or update the paths in `config.toml` to point to existing binaries.
For testing purposes, you can create mock actor binaries or update the paths in `config.toml` to point to existing binaries.
## Configuration
The `config.toml` file contains the supervisor configuration:
- **Global settings**: Redis URL and Zinit socket path
- **Worker configurations**: Binary paths and environment variables for each worker type
- **Actor configurations**: Binary paths and environment variables for each actor type
## Usage Examples
@@ -43,29 +43,29 @@ The `config.toml` file contains the supervisor configuration:
cargo build --bin hive-supervisor --release
```
### 2. Worker Management
### 2. Actor Management
```bash
# Show help
./target/release/hive-supervisor --config examples/cli/config.toml --help
# List all configured workers
./target/release/hive-supervisor --config examples/cli/config.toml workers list
# List all configured actors
./target/release/hive-supervisor --config examples/cli/config.toml actors list
# Start all workers
./target/release/hive-supervisor --config examples/cli/config.toml workers start
# Start all actors
./target/release/hive-supervisor --config examples/cli/config.toml actors start
# Start specific workers
./target/release/hive-supervisor --config examples/cli/config.toml workers start osis_worker sal_worker
# Start specific actors
./target/release/hive-supervisor --config examples/cli/config.toml actors start osis_actor sal_actor
# Check worker status
./target/release/hive-supervisor --config examples/cli/config.toml workers status
# Check actor status
./target/release/hive-supervisor --config examples/cli/config.toml actors status
# Stop all workers
./target/release/hive-supervisor --config examples/cli/config.toml workers stop
# Stop all actors
./target/release/hive-supervisor --config examples/cli/config.toml actors stop
# Restart specific worker
./target/release/hive-supervisor --config examples/cli/config.toml workers restart osis_worker
# Restart specific actor
./target/release/hive-supervisor --config examples/cli/config.toml actors restart osis_actor
```
### 3. Job Management
@@ -73,7 +73,7 @@ cargo build --bin hive-supervisor --release
```bash
# Create a job with inline script
./target/release/hive-supervisor --config examples/cli/config.toml jobs create \
--script 'print("Hello from OSIS worker!");' \
--script 'print("Hello from OSIS actor!");' \
--script-type osis \
--caller-id "user123" \
--context-id "session456"
@@ -118,18 +118,18 @@ cargo build --bin hive-supervisor --release
```bash
# Enable debug logging
./target/release/hive-supervisor --config examples/cli/config.toml -v workers status
./target/release/hive-supervisor --config examples/cli/config.toml -v actors status
# Enable trace logging
./target/release/hive-supervisor --config examples/cli/config.toml -vv workers status
./target/release/hive-supervisor --config examples/cli/config.toml -vv actors status
# Disable timestamps
./target/release/hive-supervisor --config examples/cli/config.toml --no-timestamp workers status
./target/release/hive-supervisor --config examples/cli/config.toml --no-timestamp actors status
```
## Sample Scripts
The `sample_scripts/` directory contains example scripts for different worker types:
The `sample_scripts/` directory contains example scripts for different actor types:
- `hello_osis.rhai` - Simple OSIS/HeroScript example
- `system_sal.rhai` - SAL system operation example
@@ -148,9 +148,9 @@ The `sample_scripts/` directory contains example scripts for different worker ty
- Verify Zinit is running and the socket path is correct
- Check permissions on the socket file
3. **Worker Binary Not Found**
3. **Actor Binary Not Found**
- Update binary paths in `config.toml` to match your system
- Ensure worker binaries are executable
- Ensure actor binaries are executable
4. **Permission Denied**
- Check file permissions on configuration and binary files
@@ -161,7 +161,7 @@ The `sample_scripts/` directory contains example scripts for different worker ty
Run with verbose logging to see detailed operation information:
```bash
RUST_LOG=debug ./target/release/hive-supervisor --config examples/cli/config.toml -vv workers status
RUST_LOG=debug ./target/release/hive-supervisor --config examples/cli/config.toml -vv actors status
```
## Configuration Customization
@@ -170,15 +170,15 @@ You can customize the configuration for your environment:
1. **Update Redis URL**: Change `redis_url` in the `[global]` section
2. **Update Zinit Socket**: Change `zinit_socket_path` for your Zinit installation
3. **Worker Paths**: Update binary paths in worker sections to match your setup
4. **Environment Variables**: Add or modify environment variables for each worker type
3. **Actor Paths**: Update binary paths in actor sections to match your setup
4. **Environment Variables**: Add or modify environment variables for each actor type
## Integration with Hero Ecosystem
This CLI integrates with the broader Hero ecosystem:
- **Job Queue**: Uses Redis for job queuing and status tracking
- **Process Management**: Uses Zinit for worker lifecycle management
- **Process Management**: Uses Zinit for actor lifecycle management
- **Script Execution**: Supports multiple script types (OSIS, SAL, V, Python)
- **Monitoring**: Provides real-time status and logging capabilities

View File

@@ -1,19 +1,19 @@
# Hero Supervisor CLI Configuration Example
# This configuration demonstrates how to set up the hive-supervisor CLI
# with different worker types for script execution.
# with different actor types for script execution.
[global]
# Redis connection URL for job queuing
redis_url = "redis://localhost:6379"
# OSIS Worker Configuration
# OSIS Actor Configuration
# Handles OSIS (HeroScript) execution
[osis_worker]
[osis_actor]
binary_path = "../../../target/debug/osis"
env_vars = { "RUST_LOG" = "info", "WORKER_TYPE" = "osis", "MAX_CONCURRENT_JOBS" = "5" }
env_vars = { "RUST_LOG" = "info", "ACTOR_TYPE" = "osis", "MAX_CONCURRENT_JOBS" = "5" }
# SAL Worker Configuration
# SAL Actor Configuration
# Handles System Abstraction Layer scripts
[sal_worker]
[sal_actor]
binary_path = "../../../target/debug/sal"
env_vars = { "RUST_LOG" = "info", "WORKER_TYPE" = "sal", "MAX_CONCURRENT_JOBS" = "3" }
env_vars = { "RUST_LOG" = "info", "ACTOR_TYPE" = "sal", "MAX_CONCURRENT_JOBS" = "3" }

View File

@@ -58,25 +58,25 @@ fi
echo -e "${BLUE}=== CLI Help and Information ===${NC}"
run_cli "Show main help" --help
echo -e "${BLUE}=== Worker Management Examples ===${NC}"
run_cli "List configured workers" workers list
run_cli "Show worker management help" workers --help
echo -e "${BLUE}=== Actor Management Examples ===${NC}"
run_cli "List configured actors" actors list
run_cli "Show actor management help" actors --help
# Note: These commands would require actual worker binaries and Zinit setup
echo -e "${YELLOW}Note: The following commands require actual worker binaries and Zinit setup${NC}"
# Note: These commands would require actual actor binaries and Zinit setup
echo -e "${YELLOW}Note: The following commands require actual actor binaries and Zinit setup${NC}"
echo -e "${YELLOW}They are shown for demonstration but may fail without proper setup${NC}"
echo
# Uncomment these if you have the proper setup
# run_cli "Check worker status" workers status
# run_cli "Start all workers" workers start
# run_cli "Check worker status after start" workers status
# run_cli "Check actor status" actors status
# run_cli "Start all actors" actors start
# run_cli "Check actor status after start" actors status
echo -e "${BLUE}=== Job Management Examples ===${NC}"
run_cli "Show job management help" jobs --help
# Create sample jobs (these will also require workers to be running)
echo -e "${YELLOW}Sample job creation commands (require running workers):${NC}"
# Create sample jobs (these will also require actors to be running)
echo -e "${YELLOW}Sample job creation commands (require running actors):${NC}"
echo
echo "# Create OSIS job with inline script:"
@@ -123,22 +123,22 @@ echo
echo -e "${BLUE}=== Verbose Logging Examples ===${NC}"
echo "# Debug logging:"
echo "$CLI_BINARY --config $CONFIG_FILE -v workers list"
echo "$CLI_BINARY --config $CONFIG_FILE -v actors list"
echo
echo "# Trace logging:"
echo "$CLI_BINARY --config $CONFIG_FILE -vv workers list"
echo "$CLI_BINARY --config $CONFIG_FILE -vv actors list"
echo
echo "# No timestamps:"
echo "$CLI_BINARY --config $CONFIG_FILE --no-timestamp workers list"
echo "$CLI_BINARY --config $CONFIG_FILE --no-timestamp actors list"
echo
echo -e "${GREEN}=== Example Runner Complete ===${NC}"
echo -e "${YELLOW}To run actual commands, ensure you have:${NC}"
echo "1. Redis server running on localhost:6379"
echo "2. Zinit process manager installed and configured"
echo "3. Worker binaries available at the paths specified in config.toml"
echo "3. Actor binaries available at the paths specified in config.toml"
echo
echo -e "${YELLOW}For testing without full setup, you can:${NC}"
echo "1. Update config.toml with paths to existing binaries"
echo "2. Use the CLI help commands and configuration validation"
echo "3. Test the REPL mode (requires workers to be running)"
echo "3. Test the REPL mode (requires actors to be running)"

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3
"""
Sample Python script for demonstration
This script demonstrates Python worker functionality
This script demonstrates Python actor functionality
"""
import json
@@ -9,7 +9,7 @@ import datetime
from typing import List, Dict
def main():
print("=== Python Worker Demo ===")
print("=== Python Actor Demo ===")
print("Python data processing operations")
# Data structures

View File

@@ -1,8 +1,8 @@
// Sample OSIS/HeroScript for demonstration
// This script demonstrates basic OSIS worker functionality
// This script demonstrates basic OSIS actor functionality
print("=== OSIS Worker Demo ===");
print("Hello from the OSIS worker!");
print("=== OSIS Actor Demo ===");
print("Hello from the OSIS actor!");
// Basic variable operations
let name = "Hero";

View File

@@ -1,12 +1,12 @@
// Sample V language script for demonstration
// This script demonstrates V worker functionality
// This script demonstrates V actor functionality
module main
import math
fn main() {
println("=== V Worker Demo ===")
println("=== V Actor Demo ===")
println("V language mathematical operations")
// Basic arithmetic

View File

@@ -1,7 +1,7 @@
// Sample SAL (System Abstraction Layer) script for demonstration
// This script demonstrates system-level operations through SAL worker
// This script demonstrates system-level operations through SAL actor
print("=== SAL Worker Demo ===");
print("=== SAL Actor Demo ===");
print("System Abstraction Layer operations");
// System information gathering

View File

@@ -1,6 +1,6 @@
use hero_supervisor::{
Supervisor, SupervisorBuilder, WorkerConfig, WorkerLifecycleManager,
WorkerLifecycleManagerBuilder, ScriptType
Supervisor, SupervisorBuilder, ActorConfig, ActorLifecycleManager,
ActorLifecycleManagerBuilder, ScriptType
};
use log::{info, warn, error};
use std::collections::HashMap;
@@ -13,7 +13,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
// Initialize logging
env_logger::init();
info!("Starting Worker Lifecycle Management Demo");
info!("Starting Actor Lifecycle Management Demo");
// Configuration
let redis_url = "redis://localhost:6379";
@@ -25,154 +25,154 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
.context_id("demo_context")
.build()?;
// Configure workers for different script types
let mut worker_configs = Vec::new();
// Configure actors for different script types
let mut actor_configs = Vec::new();
// OSIS workers (Rhai/HeroScript)
// OSIS actors (Rhai/HeroScript)
for i in 0..2 {
let config = WorkerConfig::new(
format!("osis_worker_{}", i),
PathBuf::from("/usr/local/bin/osis_worker"),
let config = ActorConfig::new(
format!("osis_actor_{}", i),
PathBuf::from("/usr/local/bin/osis_actor"),
ScriptType::OSIS,
)
.with_args(vec![
"--redis-url".to_string(),
redis_url.to_string(),
"--worker-id".to_string(),
format!("osis_worker_{}", i),
"--actor-id".to_string(),
format!("osis_actor_{}", i),
])
.with_env({
let mut env = HashMap::new();
env.insert("RUST_LOG".to_string(), "info".to_string());
env.insert("WORKER_TYPE".to_string(), "osis".to_string());
env.insert("ACTOR_TYPE".to_string(), "osis".to_string());
env
})
.with_health_check("/usr/local/bin/osis_worker --health-check".to_string())
.with_health_check("/usr/local/bin/osis_actor --health-check".to_string())
.with_dependencies(vec!["redis".to_string()]);
worker_configs.push(config);
actor_configs.push(config);
}
// SAL workers (System Abstraction Layer)
// SAL actors (System Abstraction Layer)
for i in 0..3 {
let config = WorkerConfig::new(
format!("sal_worker_{}", i),
PathBuf::from("/usr/local/bin/sal_worker"),
let config = ActorConfig::new(
format!("sal_actor_{}", i),
PathBuf::from("/usr/local/bin/sal_actor"),
ScriptType::SAL,
)
.with_args(vec![
"--redis-url".to_string(),
redis_url.to_string(),
"--worker-id".to_string(),
format!("sal_worker_{}", i),
"--actor-id".to_string(),
format!("sal_actor_{}", i),
])
.with_env({
let mut env = HashMap::new();
env.insert("RUST_LOG".to_string(), "info".to_string());
env.insert("WORKER_TYPE".to_string(), "sal".to_string());
env.insert("ACTOR_TYPE".to_string(), "sal".to_string());
env
})
.with_health_check("/usr/local/bin/sal_worker --health-check".to_string())
.with_health_check("/usr/local/bin/sal_actor --health-check".to_string())
.with_dependencies(vec!["redis".to_string()]);
worker_configs.push(config);
actor_configs.push(config);
}
// V workers (HeroScript in V language)
// V actors (HeroScript in V language)
for i in 0..2 {
let config = WorkerConfig::new(
format!("v_worker_{}", i),
PathBuf::from("/usr/local/bin/v_worker"),
let config = ActorConfig::new(
format!("v_actor_{}", i),
PathBuf::from("/usr/local/bin/v_actor"),
ScriptType::V,
)
.with_args(vec![
"--redis-url".to_string(),
redis_url.to_string(),
"--worker-id".to_string(),
format!("v_worker_{}", i),
"--actor-id".to_string(),
format!("v_actor_{}", i),
])
.with_env({
let mut env = HashMap::new();
env.insert("RUST_LOG".to_string(), "info".to_string());
env.insert("WORKER_TYPE".to_string(), "v".to_string());
env.insert("ACTOR_TYPE".to_string(), "v".to_string());
env
})
.with_health_check("/usr/local/bin/v_worker --health-check".to_string())
.with_health_check("/usr/local/bin/v_actor --health-check".to_string())
.with_dependencies(vec!["redis".to_string()]);
worker_configs.push(config);
actor_configs.push(config);
}
// Create lifecycle manager
let mut lifecycle_manager = WorkerLifecycleManagerBuilder::new(zinit_socket.to_string())
let mut lifecycle_manager = ActorLifecycleManagerBuilder::new(zinit_socket.to_string())
.with_supervisor(supervisor.clone());
// Add all worker configurations
for config in worker_configs {
lifecycle_manager = lifecycle_manager.add_worker(config);
// Add all actor configurations
for config in actor_configs {
lifecycle_manager = lifecycle_manager.add_actor(config);
}
let mut lifecycle_manager = lifecycle_manager.build();
// Demonstrate lifecycle operations
info!("=== Starting Worker Lifecycle Demo ===");
info!("=== Starting Actor Lifecycle Demo ===");
// 1. Start all workers
info!("1. Starting all workers...");
match lifecycle_manager.start_all_workers().await {
Ok(_) => info!("✅ All workers started successfully"),
// 1. Start all actors
info!("1. Starting all actors...");
match lifecycle_manager.start_all_actors().await {
Ok(_) => info!("✅ All actors started successfully"),
Err(e) => {
error!("❌ Failed to start workers: {}", e);
error!("❌ Failed to start actors: {}", e);
return Err(e.into());
}
}
// Wait for workers to initialize
// Wait for actors to initialize
sleep(Duration::from_secs(5)).await;
// 2. Check worker status
info!("2. Checking worker status...");
match lifecycle_manager.get_all_worker_status().await {
// 2. Check actor status
info!("2. Checking actor status...");
match lifecycle_manager.get_all_actor_status().await {
Ok(status_map) => {
for (worker_name, status) in status_map {
info!(" Worker '{}': State={:?}, PID={}", worker_name, status.state, status.pid);
for (actor_name, status) in status_map {
info!(" Actor '{}': State={:?}, PID={}", actor_name, status.state, status.pid);
}
}
Err(e) => warn!("Failed to get worker status: {}", e),
Err(e) => warn!("Failed to get actor status: {}", e),
}
// 3. Demonstrate scaling
info!("3. Demonstrating worker scaling...");
info!("3. Demonstrating actor scaling...");
// Scale up OSIS workers
info!(" Scaling up OSIS workers to 3...");
if let Err(e) = lifecycle_manager.scale_workers(&ScriptType::OSIS, 3).await {
warn!("Failed to scale OSIS workers: {}", e);
// Scale up OSIS actors
info!(" Scaling up OSIS actors to 3...");
if let Err(e) = lifecycle_manager.scale_actors(&ScriptType::OSIS, 3).await {
warn!("Failed to scale OSIS actors: {}", e);
}
sleep(Duration::from_secs(3)).await;
// Scale down SAL workers
info!(" Scaling down SAL workers to 1...");
if let Err(e) = lifecycle_manager.scale_workers(&ScriptType::SAL, 1).await {
warn!("Failed to scale SAL workers: {}", e);
// Scale down SAL actors
info!(" Scaling down SAL actors to 1...");
if let Err(e) = lifecycle_manager.scale_actors(&ScriptType::SAL, 1).await {
warn!("Failed to scale SAL actors: {}", e);
}
sleep(Duration::from_secs(3)).await;
// 4. Check running worker counts
info!("4. Checking running worker counts after scaling...");
// 4. Check running actor counts
info!("4. Checking running actor counts after scaling...");
for script_type in [ScriptType::OSIS, ScriptType::SAL, ScriptType::V] {
let count = lifecycle_manager.get_running_worker_count(&script_type).await;
info!(" {:?}: {} workers running", script_type, count);
let count = lifecycle_manager.get_running_actor_count(&script_type).await;
info!(" {:?}: {} actors running", script_type, count);
}
// 5. Demonstrate restart functionality
info!("5. Demonstrating worker restart...");
if let Err(e) = lifecycle_manager.restart_worker("osis_worker_0").await {
warn!("Failed to restart worker: {}", e);
info!("5. Demonstrating actor restart...");
if let Err(e) = lifecycle_manager.restart_actor("osis_actor_0").await {
warn!("Failed to restart actor: {}", e);
} else {
info!(" ✅ Successfully restarted osis_worker_0");
info!(" ✅ Successfully restarted osis_actor_0");
}
sleep(Duration::from_secs(3)).await;
@@ -180,12 +180,12 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
// 6. Simulate job dispatch and health monitoring
info!("6. Simulating job dispatch and health monitoring...");
// Update job time for a worker (simulating job dispatch)
lifecycle_manager.update_worker_job_time("sal_worker_0");
info!(" Updated job time for sal_worker_0");
// Update job time for an actor (simulating job dispatch)
lifecycle_manager.update_actor_job_time("sal_actor_0");
info!(" Updated job time for sal_actor_0");
// Perform health monitoring check
if let Err(e) = lifecycle_manager.monitor_worker_health().await {
if let Err(e) = lifecycle_manager.monitor_actor_health().await {
warn!("Health monitoring failed: {}", e);
} else {
info!(" ✅ Health monitoring completed");
@@ -196,7 +196,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
let test_job = supervisor
.new_job()
.script_type(ScriptType::OSIS)
.script_content("println!(\"Hello from worker!\");".to_string())
.script_content("println!(\"Hello from actor!\");".to_string())
.timeout(Duration::from_secs(30))
.build()?;
@@ -208,27 +208,27 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
// 8. Demonstrate graceful shutdown
info!("8. Demonstrating graceful shutdown...");
// Stop specific workers
info!(" Stopping specific workers...");
for worker_name in ["osis_worker_1", "v_worker_0"] {
if let Err(e) = lifecycle_manager.stop_worker(worker_name).await {
warn!("Failed to stop worker {}: {}", worker_name, e);
// Stop specific actors
info!(" Stopping specific actors...");
for actor_name in ["osis_actor_1", "v_actor_0"] {
if let Err(e) = lifecycle_manager.stop_actor(actor_name).await {
warn!("Failed to stop actor {}: {}", actor_name, e);
} else {
info!(" ✅ Stopped worker: {}", worker_name);
info!(" ✅ Stopped actor: {}", actor_name);
}
}
sleep(Duration::from_secs(2)).await;
// Stop all remaining workers
info!(" Stopping all remaining workers...");
if let Err(e) = lifecycle_manager.stop_all_workers().await {
error!("Failed to stop all workers: {}", e);
// Stop all remaining actors
info!(" Stopping all remaining actors...");
if let Err(e) = lifecycle_manager.stop_all_actors().await {
error!("Failed to stop all actors: {}", e);
} else {
info!(" ✅ All workers stopped successfully");
info!(" ✅ All actors stopped successfully");
}
info!("=== Worker Lifecycle Demo Completed ===");
info!("=== Actor Lifecycle Demo Completed ===");
// Optional: Start health monitoring loop (commented out for demo)
// info!("Starting health monitoring loop (Ctrl+C to stop)...");

View File

@@ -8,44 +8,44 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
info!("Starting Hero Supervisor Lifecycle Demo");
// Build supervisor with simplified worker configuration
// Workers are automatically launched during build
// Build supervisor with simplified actor configuration
// Actors are automatically launched during build
let supervisor = SupervisorBuilder::new()
.redis_url("redis://localhost:6379")
.osis_worker("/usr/local/bin/osis_worker")
.sal_worker("/usr/local/bin/sal_worker")
.v_worker("/usr/local/bin/v_worker")
.worker_env_var("REDIS_URL", "redis://localhost:6379")
.worker_env_var("LOG_LEVEL", "info")
.osis_actor("/usr/local/bin/osis_actor")
.sal_actor("/usr/local/bin/sal_actor")
.v_actor("/usr/local/bin/v_actor")
.actor_env_var("REDIS_URL", "redis://localhost:6379")
.actor_env_var("LOG_LEVEL", "info")
.build().await?;
info!("Supervisor created and workers launched successfully");
info!("Supervisor created and actors launched successfully");
// Wait a moment for workers to start
// Wait a moment for actors to start
sleep(Duration::from_secs(2)).await;
// Check worker status using the simplified API
info!("Checking worker status...");
let workers = supervisor.get_workers(&[]).await;
// Check actor status using the simplified API
info!("Checking actor status...");
let actors = supervisor.get_actors(&[]).await;
for worker in &workers {
let status_info = if worker.is_running {
format!("Running (PID: {})", worker.status.as_ref().map(|s| s.pid).unwrap_or(0))
for actor in &actors {
let status_info = if actor.is_running {
format!("Running (PID: {})", actor.status.as_ref().map(|s| s.pid).unwrap_or(0))
} else {
"Stopped".to_string()
};
info!(" Worker '{}' ({:?}): {}", worker.config.name, worker.config.script_type, status_info);
info!(" Actor '{}' ({:?}): {}", actor.config.name, actor.config.script_type, status_info);
}
// Demonstrate lifecycle operations with simplified API
info!("=== Worker Lifecycle Operations ===");
info!("=== Actor Lifecycle Operations ===");
// 1. Demonstrate restart functionality
info!("1. Demonstrating worker restart...");
if let Err(e) = supervisor.restart_worker("osis_worker_1").await {
error!("Failed to restart worker: {}", e);
info!("1. Demonstrating actor restart...");
if let Err(e) = supervisor.restart_actor("osis_actor_1").await {
error!("Failed to restart actor: {}", e);
} else {
info!(" ✅ Successfully restarted osis_worker_1");
info!(" ✅ Successfully restarted osis_actor_1");
}
sleep(Duration::from_secs(2)).await;
@@ -61,11 +61,11 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
// 3. Demonstrate graceful shutdown
info!("3. Demonstrating graceful shutdown...");
// Stop specific workers
if let Err(e) = supervisor.stop_worker("osis_worker_1").await {
error!("Failed to stop worker: {}", e);
// Stop specific actors
if let Err(e) = supervisor.stop_actor("osis_actor_1").await {
error!("Failed to stop actor: {}", e);
} else {
info!("Worker stopped successfully");
info!("Actor stopped successfully");
}
info!("Demo completed successfully!");

View File

@@ -1,18 +1,18 @@
[global]
redis_url = "redis://localhost:6379"
[osis_worker]
binary_path = "/path/to/osis_worker"
[osis_actor]
binary_path = "/path/to/osis_actor"
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }
[sal_worker]
binary_path = "/path/to/sal_worker"
[sal_actor]
binary_path = "/path/to/sal_actor"
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }
[v_worker]
binary_path = "/path/to/v_worker"
[v_actor]
binary_path = "/path/to/v_actor"
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }
[python_worker]
binary_path = "/path/to/python_worker"
[python_actor]
binary_path = "/path/to/python_actor"
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }

View File

@@ -16,14 +16,14 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
info!("Supervisor created.");
let script_content = r#"
// This script will never be executed by a worker because the recipient does not exist.
// This script will never be executed by an actor because the recipient does not exist.
let x = 10;
let y = x + 32;
y
"#;
// The worker_id points to a worker queue that doesn't have a worker.
let non_existent_recipient = "non_existent_worker_for_timeout_test";
// The actor_id points to an actor queue that doesn't have an actor.
let non_existent_recipient = "non_existent_actor_for_timeout_test";
let very_short_timeout = Duration::from_secs(2);
info!(

View File

@@ -21,12 +21,12 @@ pub enum SupervisorError {
InvalidInput(String),
/// Job operation error
JobError(hero_job::JobError),
/// Worker lifecycle management errors
WorkerStartFailed(String, String),
WorkerStopFailed(String, String),
WorkerRestartFailed(String, String),
WorkerStatusFailed(String, String),
WorkerNotFound(String),
/// Actor lifecycle management errors
ActorStartFailed(String, String),
ActorStopFailed(String, String),
ActorRestartFailed(String, String),
ActorStatusFailed(String, String),
ActorNotFound(String),
PingJobFailed(String, String),
/// Zinit client operation error
ZinitError(String),
@@ -73,23 +73,23 @@ impl std::fmt::Display for SupervisorError {
SupervisorError::JobError(e) => {
write!(f, "Job error: {}", e)
}
SupervisorError::WorkerStartFailed(worker, reason) => {
write!(f, "Failed to start worker '{}': {}", worker, reason)
SupervisorError::ActorStartFailed(actor, reason) => {
write!(f, "Failed to start actor '{}': {}", actor, reason)
}
SupervisorError::WorkerStopFailed(worker, reason) => {
write!(f, "Failed to stop worker '{}': {}", worker, reason)
SupervisorError::ActorStopFailed(actor, reason) => {
write!(f, "Failed to stop actor '{}': {}", actor, reason)
}
SupervisorError::WorkerRestartFailed(worker, reason) => {
write!(f, "Failed to restart worker '{}': {}", worker, reason)
SupervisorError::ActorRestartFailed(actor, reason) => {
write!(f, "Failed to restart actor '{}': {}", actor, reason)
}
SupervisorError::WorkerStatusFailed(worker, reason) => {
write!(f, "Failed to get status for worker '{}': {}", worker, reason)
SupervisorError::ActorStatusFailed(actor, reason) => {
write!(f, "Failed to get status for actor '{}': {}", actor, reason)
}
SupervisorError::WorkerNotFound(worker) => {
write!(f, "Worker '{}' not found", worker)
SupervisorError::ActorNotFound(actor) => {
write!(f, "Actor '{}' not found", actor)
}
SupervisorError::PingJobFailed(worker, reason) => {
write!(f, "Ping job failed for worker '{}': {}", worker, reason)
SupervisorError::PingJobFailed(actor, reason) => {
write!(f, "Ping job failed for actor '{}': {}", actor, reason)
}
SupervisorError::ZinitError(msg) => {
write!(f, "Zinit error: {}", msg)

View File

@@ -16,7 +16,7 @@ mod lifecycle;
pub use crate::error::SupervisorError;
pub use crate::job::JobBuilder;
pub use crate::lifecycle::WorkerConfig;
pub use crate::lifecycle::ActorConfig;
// Re-export types from hero_job for public API
pub use hero_job::{Job, JobStatus, ScriptType};
@@ -28,22 +28,22 @@ pub struct Supervisor {
pub struct SupervisorBuilder {
redis_url: Option<String>,
osis_worker: Option<String>,
sal_worker: Option<String>,
v_worker: Option<String>,
python_worker: Option<String>,
worker_env_vars: HashMap<String, String>,
osis_actor: Option<String>,
sal_actor: Option<String>,
v_actor: Option<String>,
python_actor: Option<String>,
actor_env_vars: HashMap<String, String>,
websocket_config: Option<WebSocketServerConfig>,
}
/// Helper struct to pass builder data to worker launch method
/// Helper struct to pass builder data to actor launch method
#[derive(Clone)]
struct SupervisorBuilderData {
osis_worker: Option<String>,
sal_worker: Option<String>,
v_worker: Option<String>,
python_worker: Option<String>,
worker_env_vars: HashMap<String, String>,
osis_actor: Option<String>,
sal_actor: Option<String>,
v_actor: Option<String>,
python_actor: Option<String>,
actor_env_vars: HashMap<String, String>,
websocket_config: Option<WebSocketServerConfig>,
}
@@ -52,10 +52,10 @@ struct SupervisorBuilderData {
pub struct SupervisorConfig {
pub global: GlobalConfig,
pub websocket_server: Option<WebSocketServerConfig>,
pub osis_worker: Option<WorkerConfigToml>,
pub sal_worker: Option<WorkerConfigToml>,
pub v_worker: Option<WorkerConfigToml>,
pub python_worker: Option<WorkerConfigToml>,
pub osis_actor: Option<ActorConfigToml>,
pub sal_actor: Option<ActorConfigToml>,
pub v_actor: Option<ActorConfigToml>,
pub python_actor: Option<ActorConfigToml>,
}
/// Global configuration section
@@ -64,12 +64,10 @@ pub struct GlobalConfig {
pub redis_url: String,
}
/// Worker configuration section in TOML
/// Actor configuration section in TOML
#[derive(Debug, Deserialize, Serialize)]
pub struct WorkerConfigToml {
pub struct ActorConfigToml {
pub binary_path: String,
#[serde(default)]
pub env_vars: HashMap<String, String>,
}
/// WebSocket server configuration section in TOML
@@ -127,11 +125,11 @@ impl SupervisorBuilder {
pub fn new() -> Self {
Self {
redis_url: None,
osis_worker: None,
sal_worker: None,
v_worker: None,
python_worker: None,
worker_env_vars: HashMap::new(),
osis_actor: None,
sal_actor: None,
v_actor: None,
python_actor: None,
actor_env_vars: HashMap::new(),
websocket_config: None,
}
}
@@ -147,25 +145,21 @@ impl SupervisorBuilder {
let mut builder = Self::new()
.redis_url(&config.global.redis_url);
// Configure workers based on TOML config
if let Some(osis_config) = config.osis_worker {
builder = builder.osis_worker(&osis_config.binary_path)
.worker_env_vars(osis_config.env_vars);
// Configure actors based on TOML config
if let Some(osis_config) = config.osis_actor {
builder = builder.osis_actor(&osis_config.binary_path);
}
if let Some(sal_config) = config.sal_worker {
builder = builder.sal_worker(&sal_config.binary_path)
.worker_env_vars(sal_config.env_vars);
if let Some(sal_config) = config.sal_actor {
builder = builder.sal_actor(&sal_config.binary_path);
}
if let Some(v_config) = config.v_worker {
builder = builder.v_worker(&v_config.binary_path)
.worker_env_vars(v_config.env_vars);
if let Some(v_config) = config.v_actor {
builder = builder.v_actor(&v_config.binary_path);
}
if let Some(python_config) = config.python_worker {
builder = builder.python_worker(&python_config.binary_path)
.worker_env_vars(python_config.env_vars);
if let Some(python_config) = config.python_actor {
builder = builder.python_actor(&python_config.binary_path);
}
// Store WebSocket configuration for later use
@@ -176,28 +170,28 @@ impl SupervisorBuilder {
Ok(builder)
}
/// Validate that all configured worker binaries exist and are executable
fn validate_worker_binaries(&self) -> Result<(), SupervisorError> {
let workers = [
("OSIS", &self.osis_worker),
("SAL", &self.sal_worker),
("V", &self.v_worker),
("Python", &self.python_worker),
/// Validate that all configured actor binaries exist and are executable
fn validate_actor_binaries(&self) -> Result<(), SupervisorError> {
let actors = [
("OSIS", &self.osis_actor),
("SAL", &self.sal_actor),
("V", &self.v_actor),
("Python", &self.python_actor),
];
for (worker_type, binary_path) in workers {
for (actor_type, binary_path) in actors {
if let Some(path) = binary_path {
let path_obj = Path::new(path);
if !path_obj.exists() {
return Err(SupervisorError::ConfigError(
format!("{} worker binary does not exist: {}", worker_type, path)
format!("{} actor binary does not exist: {}", actor_type, path)
));
}
if !path_obj.is_file() {
return Err(SupervisorError::ConfigError(
format!("{} worker path is not a file: {}", worker_type, path)
format!("{} actor path is not a file: {}", actor_type, path)
));
}
@@ -207,19 +201,19 @@ impl SupervisorBuilder {
use std::os::unix::fs::PermissionsExt;
let metadata = path_obj.metadata().map_err(|e| {
SupervisorError::ConfigError(
format!("Failed to read metadata for {} worker binary {}: {}", worker_type, path, e)
format!("Failed to read metadata for {} actor binary {}: {}", actor_type, path, e)
)
})?;
let permissions = metadata.permissions();
if permissions.mode() & 0o111 == 0 {
return Err(SupervisorError::ConfigError(
format!("{} worker binary is not executable: {}", worker_type, path)
format!("{} actor binary is not executable: {}", actor_type, path)
));
}
}
info!("Validated {} worker binary: {}", worker_type, path);
info!("Validated {} actor binary: {}", actor_type, path);
}
}
@@ -231,48 +225,48 @@ impl SupervisorBuilder {
self
}
pub fn osis_worker(mut self, binary_path: &str) -> Self {
self.osis_worker = Some(binary_path.to_string());
pub fn osis_actor(mut self, binary_path: &str) -> Self {
self.osis_actor = Some(binary_path.to_string());
self
}
pub fn sal_worker(mut self, binary_path: &str) -> Self {
self.sal_worker = Some(binary_path.to_string());
pub fn sal_actor(mut self, binary_path: &str) -> Self {
self.sal_actor = Some(binary_path.to_string());
self
}
pub fn v_worker(mut self, binary_path: &str) -> Self {
self.v_worker = Some(binary_path.to_string());
pub fn v_actor(mut self, binary_path: &str) -> Self {
self.v_actor = Some(binary_path.to_string());
self
}
pub fn python_worker(mut self, binary_path: &str) -> Self {
self.python_worker = Some(binary_path.to_string());
pub fn python_actor(mut self, binary_path: &str) -> Self {
self.python_actor = Some(binary_path.to_string());
self
}
pub fn worker_env_var(mut self, key: &str, value: &str) -> Self {
self.worker_env_vars.insert(key.to_string(), value.to_string());
pub fn actor_env_var(mut self, key: &str, value: &str) -> Self {
self.actor_env_vars.insert(key.to_string(), value.to_string());
self
}
pub fn worker_env_vars(mut self, env_vars: HashMap<String, String>) -> Self {
self.worker_env_vars.extend(env_vars);
pub fn actor_env_vars(mut self, env_vars: HashMap<String, String>) -> Self {
self.actor_env_vars.extend(env_vars);
self
}
/// Builds the final `Supervisor` instance synchronously.
///
/// This method validates the configuration, checks worker binary existence,
/// and creates the Redis client. Worker launching is deferred to the `start_workers()` method.
/// This method validates the configuration, checks actor binary existence,
/// and creates the Redis client. Actor launching is deferred to the `start_actors()` method.
///
/// # Returns
///
/// * `Ok(Supervisor)` - Successfully configured client with valid binaries
/// * `Err(SupervisorError)` - Configuration, binary validation, or connection error
pub async fn build(self) -> Result<Supervisor, SupervisorError> {
// Validate that all configured worker binaries exist first
Self::validate_worker_binaries(&self)?;
// Validate that all configured actor binaries exist first
Self::validate_actor_binaries(&self)?;
let url = self.redis_url
.unwrap_or_else(|| "redis://127.0.0.1/".to_string());
@@ -281,13 +275,13 @@ impl SupervisorBuilder {
let zinit_client = ZinitClient::unix_socket("/tmp/zinit.sock").await
.map_err(|e| SupervisorError::ZinitError(format!("Failed to create Zinit client: {}", e)))?;
// Store builder data for later use in start_workers()
// Store builder data for later use in start_actors()
let builder_data = SupervisorBuilderData {
osis_worker: self.osis_worker,
sal_worker: self.sal_worker,
v_worker: self.v_worker,
python_worker: self.python_worker,
worker_env_vars: self.worker_env_vars,
osis_actor: self.osis_actor,
sal_actor: self.sal_actor,
v_actor: self.v_actor,
python_actor: self.python_actor,
actor_env_vars: self.actor_env_vars,
websocket_config: self.websocket_config,
};
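A minimal usage sketch of the renamed builder API, showing that actor launching is deferred from `build()` to `start_actors()`. The binary path and environment value are placeholders, and error handling beyond `?` is omitted.

```rust
// Sketch: construct a Supervisor with the actor-named builder methods,
// then launch the configured actors explicitly.
use hero_supervisor::{Supervisor, SupervisorBuilder, SupervisorError};

async fn bring_up() -> Result<Supervisor, SupervisorError> {
    let supervisor = SupervisorBuilder::new()
        .redis_url("redis://127.0.0.1:6379")
        .osis_actor("/usr/local/bin/example_osis_actor")
        .actor_env_var("RUST_LOG", "info")
        .build()
        .await?;

    // Actors are only started when requested.
    supervisor.start_actors().await?;
    Ok(supervisor)
}
```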
@@ -302,10 +296,10 @@ impl SupervisorBuilder {
}
impl Supervisor {
/// Start all configured workers asynchronously.
/// This method should be called after build() to launch the workers.
pub async fn start_workers(&self) -> Result<(), SupervisorError> {
info!("Starting Hero Supervisor workers...");
/// Start all configured actors asynchronously.
/// This method should be called after build() to launch the actors.
pub async fn start_actors(&self) -> Result<(), SupervisorError> {
info!("Starting Hero Supervisor actors...");
// Test Zinit connection first
info!("Testing Zinit connection at /tmp/zinit.sock...");
@@ -319,102 +313,102 @@ impl Supervisor {
}
}
// Clean up any existing worker services first
info!("Cleaning up existing worker services...");
self.cleanup_existing_workers().await?;
// Clean up any existing actor services first
info!("Cleaning up existing actor services...");
self.cleanup_existing_actors().await?;
// Launch configured workers if builder data is available
// Launch configured actors if builder data is available
if let Some(builder_data) = &self.builder_data {
info!("Launching configured workers...");
self.launch_configured_workers(builder_data).await?;
info!("Launching configured actors...");
self.launch_configured_actors(builder_data).await?;
} else {
warn!("No builder data available, no workers to start");
warn!("No builder data available, no actors to start");
}
info!("All workers started successfully!");
info!("All actors started successfully!");
Ok(())
}
/// Clean up all worker services from zinit on program exit
/// Clean up all actor services from zinit on program exit
pub async fn cleanup_and_shutdown(&self) -> Result<(), SupervisorError> {
info!("Cleaning up worker services before shutdown...");
info!("Cleaning up actor services before shutdown...");
let worker_names = vec![
"osis_worker_1",
"sal_worker_1",
"v_worker_1",
"python_worker_1"
let actor_names = vec![
"osis_actor_1",
"sal_actor_1",
"v_actor_1",
"python_actor_1"
];
for worker_name in worker_names {
if let Err(e) = self.stop_and_delete_worker(worker_name).await {
warn!("Failed to cleanup worker {}: {}", worker_name, e);
for actor_name in actor_names {
if let Err(e) = self.stop_and_delete_actor(actor_name).await {
warn!("Failed to cleanup actor {}: {}", actor_name, e);
}
}
info!("Worker cleanup completed");
info!("Actor cleanup completed");
Ok(())
}
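A hedged sketch of wiring `cleanup_and_shutdown()` to a Ctrl-C signal. It assumes the consuming application runs on Tokio with the `signal` feature enabled; the supervisor itself is not shown installing any handlers.

```rust
// Illustrative only: tear down actor services cleanly on Ctrl-C.
use hero_supervisor::{Supervisor, SupervisorError};

async fn shutdown_on_ctrl_c(supervisor: &Supervisor) -> Result<(), SupervisorError> {
    let _ = tokio::signal::ctrl_c().await;
    supervisor.cleanup_and_shutdown().await
}
```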
/// Clean up any existing worker services on startup
async fn cleanup_existing_workers(&self) -> Result<(), SupervisorError> {
info!("Cleaning up any existing worker services...");
/// Clean up any existing actor services on startup
async fn cleanup_existing_actors(&self) -> Result<(), SupervisorError> {
info!("Cleaning up any existing actor services...");
let worker_names = vec![
"osis_worker_1",
"sal_worker_1",
"v_worker_1",
"python_worker_1"
let actor_names = vec![
"osis_actor_1",
"sal_actor_1",
"v_actor_1",
"python_actor_1"
];
for worker_name in worker_names {
for actor_name in actor_names {
// Try to stop and delete, but don't fail if they don't exist
info!("Attempting to cleanup worker: {}", worker_name);
match self.stop_and_delete_worker(worker_name).await {
Ok(_) => info!("Successfully cleaned up worker: {}", worker_name),
Err(e) => debug!("Failed to cleanup worker {}: {}", worker_name, e),
info!("Attempting to cleanup actor: {}", actor_name);
match self.stop_and_delete_actor(actor_name).await {
Ok(_) => info!("Successfully cleaned up actor: {}", actor_name),
Err(e) => debug!("Failed to cleanup actor {}: {}", actor_name, e),
}
}
info!("Existing worker cleanup completed");
info!("Existing actor cleanup completed");
Ok(())
}
/// Stop and delete a worker service from zinit
async fn stop_and_delete_worker(&self, worker_name: &str) -> Result<(), SupervisorError> {
info!("Starting cleanup for worker: {}", worker_name);
/// Stop and delete an actor service from zinit
async fn stop_and_delete_actor(&self, actor_name: &str) -> Result<(), SupervisorError> {
info!("Starting cleanup for actor: {}", actor_name);
// First try to stop the worker
info!("Attempting to stop worker: {}", worker_name);
if let Err(e) = self.zinit_client.stop(worker_name).await {
debug!("Worker {} was not running or failed to stop: {}", worker_name, e);
// First try to stop the actor
info!("Attempting to stop actor: {}", actor_name);
if let Err(e) = self.zinit_client.stop(actor_name).await {
debug!("Actor {} was not running or failed to stop: {}", actor_name, e);
} else {
info!("Successfully stopped worker: {}", worker_name);
info!("Successfully stopped actor: {}", actor_name);
}
// Then forget the service to stop monitoring it
info!("Attempting to forget worker: {}", worker_name);
if let Err(e) = self.zinit_client.forget(worker_name).await {
info!("Worker {} was not being monitored or failed to forget: {}", worker_name, e);
info!("Attempting to forget actor: {}", actor_name);
if let Err(e) = self.zinit_client.forget(actor_name).await {
info!("Actor {} was not being monitored or failed to forget: {}", actor_name, e);
} else {
info!("Successfully forgot worker service: {}", worker_name);
info!("Successfully forgot actor service: {}", actor_name);
}
// Finally, delete the service configuration
info!("Attempting to delete service for worker: {}", worker_name);
if let Err(e) = self.zinit_client.delete_service(worker_name).await {
debug!("Worker {} service did not exist or failed to delete: {}", worker_name, e);
info!("Attempting to delete service for actor: {}", actor_name);
if let Err(e) = self.zinit_client.delete_service(actor_name).await {
debug!("Actor {} service did not exist or failed to delete: {}", actor_name, e);
} else {
info!("Successfully deleted worker service: {}", worker_name);
info!("Successfully deleted actor service: {}", actor_name);
}
info!("Completed cleanup for worker: {}", worker_name);
info!("Completed cleanup for actor: {}", actor_name);
Ok(())
}
/// Get the hardcoded worker queue key for the script type
fn get_worker_queue_key(&self, script_type: &ScriptType) -> String {
format!("{}worker_queue:{}", NAMESPACE_PREFIX, script_type.worker_queue_suffix())
/// Get the hardcoded actor queue key for the script type
fn get_actor_queue_key(&self, script_type: &ScriptType) -> String {
format!("{}actor_queue:{}", NAMESPACE_PREFIX, script_type.actor_queue_suffix())
}
pub fn new_job(&self) -> JobBuilder {
@@ -432,63 +426,58 @@ impl Supervisor {
})
}
/// Extract worker configurations from the supervisor's builder data
pub fn get_worker_configs(&self) -> Result<Vec<WorkerConfig>, SupervisorError> {
/// Extract actor configurations from the supervisor's builder data
pub fn get_actor_configs(&self) -> Result<Vec<ActorConfig>, SupervisorError> {
let builder_data = self.builder_data.as_ref().ok_or_else(|| {
SupervisorError::ConfigError("No builder data available for worker configs".to_string())
SupervisorError::ConfigError("No builder data available for actor configs".to_string())
})?;
let mut configs = Vec::new();
let env_vars = builder_data.worker_env_vars.clone();
if let Some(osis_path) = &builder_data.osis_worker {
if let Some(osis_path) = &builder_data.osis_actor {
configs.push(
WorkerConfig::new("osis_worker_1".to_string(), PathBuf::from(osis_path), ScriptType::OSIS)
.with_env(env_vars.clone())
ActorConfig::new("osis_actor_1".to_string(), PathBuf::from(osis_path), ScriptType::OSIS)
);
}
if let Some(sal_path) = &builder_data.sal_worker {
if let Some(sal_path) = &builder_data.sal_actor {
configs.push(
WorkerConfig::new("sal_worker_1".to_string(), PathBuf::from(sal_path), ScriptType::SAL)
.with_env(env_vars.clone())
ActorConfig::new("sal_actor_1".to_string(), PathBuf::from(sal_path), ScriptType::SAL)
);
}
if let Some(v_path) = &builder_data.v_worker {
if let Some(v_path) = &builder_data.v_actor {
configs.push(
WorkerConfig::new("v_worker_1".to_string(), PathBuf::from(v_path), ScriptType::V)
.with_env(env_vars.clone())
ActorConfig::new("v_actor_1".to_string(), PathBuf::from(v_path), ScriptType::V)
);
}
if let Some(python_path) = &builder_data.python_worker {
if let Some(python_path) = &builder_data.python_actor {
configs.push(
WorkerConfig::new("python_worker_1".to_string(), PathBuf::from(python_path), ScriptType::Python)
.with_env(env_vars.clone())
ActorConfig::new("python_actor_1".to_string(), PathBuf::from(python_path), ScriptType::Python)
);
}
Ok(configs)
}
/// Spawn a background lifecycle manager that continuously monitors and maintains worker health
/// Spawn a background lifecycle manager that continuously monitors and maintains actor health
/// Returns a JoinHandle that can be used to stop the lifecycle manager
pub fn spawn_lifecycle_manager(
self: Arc<Self>,
worker_configs: Vec<WorkerConfig>,
actor_configs: Vec<ActorConfig>,
health_check_interval: Duration,
) -> tokio::task::JoinHandle<Result<(), SupervisorError>> {
let supervisor = self;
tokio::spawn(async move {
info!("Starting background lifecycle manager with {} workers", worker_configs.len());
info!("Starting background lifecycle manager with {} actors", actor_configs.len());
info!("Health check interval: {:?}", health_check_interval);
// Initial worker startup
info!("Performing initial worker startup...");
if let Err(e) = supervisor.start_workers().await {
error!("Failed to start workers during initialization: {}", e);
// Initial actor startup
info!("Performing initial actor startup...");
if let Err(e) = supervisor.start_actors().await {
error!("Failed to start actors during initialization: {}", e);
return Err(e);
}
@@ -499,12 +488,12 @@ impl Supervisor {
loop {
interval.tick().await;
info!("Running periodic worker health check...");
info!("Running periodic actor health check...");
// Check each worker's health and restart if needed
for worker_config in &worker_configs {
if let Err(e) = supervisor.check_and_restart_worker(worker_config).await {
error!("Failed to check/restart worker {}: {}", worker_config.name, e);
// Check each actor's health and restart if needed
for actor_config in &actor_configs {
if let Err(e) = supervisor.check_and_restart_actor(actor_config).await {
error!("Failed to check/restart actor {}: {}", actor_config.name, e);
}
}
@@ -513,59 +502,59 @@ impl Supervisor {
})
}
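A sketch of wiring up the background lifecycle manager. It must be called from within a Tokio runtime, since `spawn_lifecycle_manager()` uses `tokio::spawn` internally; the 10-minute interval is an arbitrary illustration value.

```rust
// Sketch: obtain the actor configs and hand them to the lifecycle manager.
use std::{sync::Arc, time::Duration};
use hero_supervisor::{Supervisor, SupervisorError};

fn start_monitoring(
    supervisor: Arc<Supervisor>,
) -> Result<tokio::task::JoinHandle<Result<(), SupervisorError>>, SupervisorError> {
    let configs = supervisor.get_actor_configs()?;
    Ok(Arc::clone(&supervisor).spawn_lifecycle_manager(configs, Duration::from_secs(600)))
}
```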
/// Check a single worker's health and restart if needed
async fn check_and_restart_worker(&self, worker_config: &WorkerConfig) -> Result<(), SupervisorError> {
let worker_name = &worker_config.name;
/// Check a single actor's health and restart if needed
async fn check_and_restart_actor(&self, actor_config: &ActorConfig) -> Result<(), SupervisorError> {
let actor_name = &actor_config.name;
// Get worker status
match self.zinit_client.status(worker_name).await {
// Get actor status
match self.zinit_client.status(actor_name).await {
Ok(status) => {
let is_healthy = status.state == "running" && status.pid > 0;
if is_healthy {
debug!("Worker {} is healthy (state: {}, pid: {})", worker_name, status.state, status.pid);
debug!("Actor {} is healthy (state: {}, pid: {})", actor_name, status.state, status.pid);
// Optionally send a ping job for deeper health check
if let Err(e) = self.send_ping_job(worker_config.script_type.clone()).await {
warn!("Ping job failed for worker {}: {}", worker_name, e);
if let Err(e) = self.send_ping_job(actor_config.script_type.clone()).await {
warn!("Ping job failed for actor {}: {}", actor_name, e);
// Note: We don't restart on ping failure as it might be temporary
}
} else {
warn!("Worker {} is unhealthy (state: {}, pid: {}), restarting...",
worker_name, status.state, status.pid);
warn!("Actor {} is unhealthy (state: {}, pid: {}), restarting...",
actor_name, status.state, status.pid);
// Attempt to restart the worker
if let Err(e) = self.restart_worker(worker_name).await {
error!("Failed to restart unhealthy worker {}: {}", worker_name, e);
// Attempt to restart the actor
if let Err(e) = self.restart_actor(actor_name).await {
error!("Failed to restart unhealthy actor {}: {}", actor_name, e);
// If restart fails, try a full stop/start cycle
warn!("Attempting full stop/start cycle for worker: {}", worker_name);
if let Err(e) = self.stop_and_delete_worker(worker_name).await {
error!("Failed to stop worker {} during recovery: {}", worker_name, e);
warn!("Attempting full stop/start cycle for actor: {}", actor_name);
if let Err(e) = self.stop_and_delete_actor(actor_name).await {
error!("Failed to stop actor {} during recovery: {}", actor_name, e);
}
if let Err(e) = self.start_worker(worker_config).await {
error!("Failed to start worker {} during recovery: {}", worker_name, e);
if let Err(e) = self.start_actor(actor_config).await {
error!("Failed to start actor {} during recovery: {}", actor_name, e);
return Err(e);
}
info!("Successfully recovered worker: {}", worker_name);
info!("Successfully recovered actor: {}", actor_name);
} else {
info!("Successfully restarted worker: {}", worker_name);
info!("Successfully restarted actor: {}", actor_name);
}
}
}
Err(e) => {
warn!("Could not get status for worker {} (may not exist): {}", worker_name, e);
warn!("Could not get status for actor {} (may not exist): {}", actor_name, e);
// Worker doesn't exist, try to start it
info!("Attempting to start missing worker: {}", worker_name);
if let Err(e) = self.start_worker(worker_config).await {
error!("Failed to start missing worker {}: {}", worker_name, e);
// Actor doesn't exist, try to start it
info!("Attempting to start missing actor: {}", actor_name);
if let Err(e) = self.start_actor(actor_config).await {
error!("Failed to start missing actor {}: {}", actor_name, e);
return Err(e);
}
info!("Successfully started missing worker: {}", worker_name);
info!("Successfully started missing actor: {}", actor_name);
}
}
@@ -597,18 +586,18 @@ impl Supervisor {
job_id: String,
script_type: &ScriptType
) -> Result<(), SupervisorError> {
let worker_queue_key = self.get_worker_queue_key(script_type);
let actor_queue_key = self.get_actor_queue_key(script_type);
// lpush also infers its types, RV is typically i64 (length of list) or () depending on exact command variant
// For `redis::AsyncCommands::lpush`, it's `RedisResult<R>` where R: FromRedisValue
// Often this is the length of the list. Let's allow inference or specify if needed.
let _: redis::RedisResult<i64> =
conn.lpush(&worker_queue_key, job_id.clone()).await;
conn.lpush(&actor_queue_key, job_id.clone()).await;
Ok(())
}
// Internal helper to await response from worker
// Internal helper to await response from actor
async fn await_response_from_connection(
&self,
conn: &mut redis::aio::MultiplexedConnection,
@@ -679,7 +668,7 @@ impl Supervisor {
Ok(())
}
// New method using dedicated reply queue with automatic worker selection
// New method using dedicated reply queue with automatic actor selection
pub async fn run_job_and_await_result(
&self,
job: &Job
@@ -782,7 +771,7 @@ impl Supervisor {
pub async fn stop_job(&self, job_id: &str) -> Result<(), SupervisorError> {
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
// Get job details to determine script type and appropriate worker
// Get job details to determine script type and appropriate actor
let job_key = format!("{}job:{}", NAMESPACE_PREFIX, job_id);
let job_data: std::collections::HashMap<String, String> = conn.hgetall(&job_key).await?;
@@ -798,7 +787,7 @@ impl Supervisor {
.map_err(|e| SupervisorError::InvalidInput(format!("Invalid script type: {}", e)))?;
// Use hardcoded stop queue key for this script type
let stop_queue_key = format!("{}stop_queue:{}", NAMESPACE_PREFIX, script_type.worker_queue_suffix());
let stop_queue_key = format!("{}stop_queue:{}", NAMESPACE_PREFIX, script_type.actor_queue_suffix());
// Push job ID to the stop queue
conn.lpush::<_, _, ()>(&stop_queue_key, job_id).await?;
@@ -931,7 +920,7 @@ impl Supervisor {
/// Dispatch jobs that are ready (have all prerequisites completed)
pub async fn dispatch_ready_jobs(&self, ready_job_ids: Vec<String>) -> Result<(), SupervisorError> {
for job_id in ready_job_ids {
// Get job data to determine script type and select worker
// Get job data to determine script type and select actor
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
let job_key = format!("{}job:{}", NAMESPACE_PREFIX, job_id);
let job_data: std::collections::HashMap<String, String> = conn.hgetall(&job_key).await?;

View File

@@ -1,6 +1,6 @@
//! Worker lifecycle management functionality for the Hero Supervisor
//! Actor lifecycle management functionality for the Hero Supervisor
//!
//! This module provides worker process lifecycle management using Zinit as the process manager.
//! This module provides actor process lifecycle management using Zinit as the process manager.
//! All functionality is implemented as methods on the Supervisor struct for a clean API.
use log::{debug, error, info, warn};
@@ -12,28 +12,28 @@ use zinit_client::{Client as ZinitClient, Status};
use hero_job::ScriptType;
use crate::{Supervisor, SupervisorError};
/// Information about a worker including its configuration and current status
/// Information about an actor including its configuration and current status
#[derive(Debug, Clone)]
pub struct WorkerInfo {
pub config: WorkerConfig,
pub struct ActorInfo {
pub config: ActorConfig,
pub status: Option<Status>,
pub is_running: bool,
}
/// Configuration for a worker binary
/// Configuration for an actor binary
#[derive(Debug, Clone)]
pub struct WorkerConfig {
/// Name of the worker service
pub struct ActorConfig {
/// Name of the actor service
pub name: String,
/// Path to the worker binary
/// Path to the actor binary
pub binary_path: PathBuf,
/// Script type this worker handles
/// Script type this actor handles
pub script_type: ScriptType,
/// Command line arguments for the worker
/// Command line arguments for the actor
pub args: Vec<String>,
/// Environment variables for the worker
/// Environment variables for the actor
pub env: HashMap<String, String>,
/// Whether this worker should restart on exit
/// Whether this actor should restart on exit
pub restart_on_exit: bool,
/// Health check command (optional)
pub health_check: Option<String>,
@@ -41,7 +41,7 @@ pub struct WorkerConfig {
pub dependencies: Vec<String>,
}
impl WorkerConfig {
impl ActorConfig {
pub fn new(name: String, binary_path: PathBuf, script_type: ScriptType) -> Self {
Self {
name,
@@ -81,122 +81,122 @@ impl WorkerConfig {
}
}
/// Worker lifecycle management methods for Supervisor
/// Actor lifecycle management methods for Supervisor
impl Supervisor {
/// Get all workers with their configuration and status - unified method
pub async fn get_workers(&self, worker_configs: &[WorkerConfig]) -> Vec<WorkerInfo> {
let mut workers = Vec::new();
/// Get all actors with their configuration and status - unified method
pub async fn get_actors(&self, actor_configs: &[ActorConfig]) -> Vec<ActorInfo> {
let mut actors = Vec::new();
for config in worker_configs {
for config in actor_configs {
let status = self.zinit_client.status(&config.name).await.ok();
let is_running = status.as_ref()
.map(|s| s.state == "running" && s.pid > 0)
.unwrap_or(false);
workers.push(WorkerInfo {
actors.push(ActorInfo {
config: config.clone(),
status,
is_running,
});
}
workers
actors
}
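A small usage sketch of the unified `get_actors()` accessor. The configs are assumed to come from `get_actor_configs()`, which requires the supervisor to hold builder data.

```rust
// Sketch: print a liveness overview for every configured actor.
use hero_supervisor::{Supervisor, SupervisorError};

async fn print_actor_overview(supervisor: &Supervisor) -> Result<(), SupervisorError> {
    let configs = supervisor.get_actor_configs()?;
    for actor in supervisor.get_actors(&configs).await {
        println!("{} running={}", actor.config.name, actor.is_running);
    }
    Ok(())
}
```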
/// Start a worker using Zinit

pub async fn start_worker(
/// Start an actor using Zinit
pub async fn start_actor(
&self,
worker_config: &WorkerConfig,
actor_config: &ActorConfig,
) -> Result<(), SupervisorError> {
info!("Starting worker: {}", worker_config.name);
info!("Starting actor: {}", actor_config.name);
// Create service configuration for Zinit
let service_config = self.create_service_config(worker_config);
let service_config = self.create_service_config(actor_config);
// Create the service in Zinit
self.zinit_client.create_service(&worker_config.name, service_config).await
self.zinit_client.create_service(&actor_config.name, service_config).await
.map_err(|e| SupervisorError::ZinitError(format!("Failed to create service: {}", e)))?;
// Monitor the service so Zinit starts managing it
self.zinit_client.monitor(&worker_config.name).await
self.zinit_client.monitor(&actor_config.name).await
.map_err(|e| SupervisorError::ZinitError(format!("Failed to monitor service: {}", e)))?;
// Start the service
self.zinit_client.start(&worker_config.name).await
.map_err(|e| SupervisorError::ZinitError(format!("Failed to start worker: {}", e)))?;
self.zinit_client.start(&actor_config.name).await
.map_err(|e| SupervisorError::ZinitError(format!("Failed to start actor: {}", e)))?;
info!("Successfully started worker: {}", worker_config.name);
info!("Successfully started actor: {}", actor_config.name);
Ok(())
}
/// Stop a worker using Zinit
pub async fn stop_worker(
/// Stop an actor using Zinit
pub async fn stop_actor(
&self,
worker_name: &str,
actor_name: &str,
) -> Result<(), SupervisorError> {
info!("Stopping worker: {}", worker_name);
info!("Stopping actor: {}", actor_name);
match self.zinit_client.stop(worker_name).await {
match self.zinit_client.stop(actor_name).await {
Ok(_) => {
info!("Successfully stopped worker: {}", worker_name);
info!("Successfully stopped actor: {}", actor_name);
Ok(())
}
Err(e) => {
error!("Failed to stop worker {}: {}", worker_name, e);
Err(SupervisorError::WorkerStopFailed(worker_name.to_string(), e.to_string()))
error!("Failed to stop actor {}: {}", actor_name, e);
Err(SupervisorError::ActorStopFailed(actor_name.to_string(), e.to_string()))
}
}
}
/// Restart a worker using Zinit
pub async fn restart_worker(
/// Restart an actor using Zinit
pub async fn restart_actor(
&self,
worker_name: &str,
actor_name: &str,
) -> Result<(), SupervisorError> {
info!("Restarting worker: {}", worker_name);
info!("Restarting actor: {}", actor_name);
match self.zinit_client.restart(worker_name).await {
match self.zinit_client.restart(actor_name).await {
Ok(_) => {
info!("Successfully restarted worker: {}", worker_name);
info!("Successfully restarted actor: {}", actor_name);
Ok(())
}
Err(e) => {
error!("Failed to restart worker {}: {}", worker_name, e);
Err(SupervisorError::WorkerRestartFailed(worker_name.to_string(), e.to_string()))
error!("Failed to restart actor {}: {}", actor_name, e);
Err(SupervisorError::ActorRestartFailed(actor_name.to_string(), e.to_string()))
}
}
}
/// Get status of a worker using Zinit
pub async fn get_worker_status(
/// Get status of an actor using Zinit
pub async fn get_actor_status(
&self,
worker_name: &str,
actor_name: &str,
zinit_client: &ZinitClient,
) -> Result<Status, SupervisorError> {
match zinit_client.status(worker_name).await {
match zinit_client.status(actor_name).await {
Ok(status) => Ok(status),
Err(e) => {
error!("Failed to get status for worker {}: {}", worker_name, e);
Err(SupervisorError::WorkerStatusFailed(worker_name.to_string(), e.to_string()))
error!("Failed to get status for actor {}: {}", actor_name, e);
Err(SupervisorError::ActorStatusFailed(actor_name.to_string(), e.to_string()))
}
}
}
/// Get status of all workers
pub async fn get_all_worker_status(
/// Get status of all actors
pub async fn get_all_actor_status(
&self,
worker_configs: &[WorkerConfig],
actor_configs: &[ActorConfig],
zinit_client: &ZinitClient,
) -> Result<HashMap<String, Status>, SupervisorError> {
let mut status_map = HashMap::new();
for worker in worker_configs {
match zinit_client.status(&worker.name).await {
for actor in actor_configs {
match zinit_client.status(&actor.name).await {
Ok(status) => {
status_map.insert(worker.name.clone(), status);
status_map.insert(actor.name.clone(), status);
}
Err(e) => {
warn!("Failed to get status for worker {}: {}", worker.name, e);
warn!("Failed to get status for actor {}: {}", actor.name, e);
}
}
}
@@ -206,32 +206,32 @@ impl Supervisor {
/// Stop multiple workers
pub async fn stop_workers(
/// Stop multiple actors
pub async fn stop_actors(
&self,
worker_names: &[String],
actor_names: &[String],
) -> Result<(), SupervisorError> {
info!("Stopping {} workers", worker_names.len());
info!("Stopping {} actors", actor_names.len());
for worker_name in worker_names {
self.stop_worker(worker_name).await?;
for actor_name in actor_names {
self.stop_actor(actor_name).await?;
}
Ok(())
}
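A sketch of stopping the four default actor services in one call. The service names mirror the hardcoded identifiers used by the cleanup paths in this module.

```rust
// Sketch: stop the default actor services by name via stop_actors().
use hero_supervisor::{Supervisor, SupervisorError};

async fn stop_default_actors(supervisor: &Supervisor) -> Result<(), SupervisorError> {
    let names: Vec<String> = ["osis_actor_1", "sal_actor_1", "v_actor_1", "python_actor_1"]
        .into_iter()
        .map(String::from)
        .collect();
    supervisor.stop_actors(&names).await
}
```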
/// Get count of running workers for a script type
pub async fn get_running_worker_count(
/// Get count of running actors for a script type
pub async fn get_running_actor_count(
&self,
worker_configs: &[WorkerConfig],
actor_configs: &[ActorConfig],
script_type: &ScriptType,
zinit_client: &ZinitClient,
) -> usize {
let mut running_count = 0;
for worker in worker_configs {
if worker.script_type == *script_type {
if let Ok(status) = zinit_client.status(&worker.name).await {
for actor in actor_configs {
if actor.script_type == *script_type {
if let Ok(status) = zinit_client.status(&actor.name).await {
if status.state == "running" {
running_count += 1;
}
@@ -242,7 +242,7 @@ impl Supervisor {
running_count
}
/// Send a ping job to a worker for health checking
/// Send a ping job to an actor for health checking
pub async fn send_ping_job(
&self,
script_type: ScriptType,
@@ -268,8 +268,8 @@ impl Supervisor {
}
}
/// Create Zinit service configuration from worker config
fn create_service_config(&self, worker: &WorkerConfig) -> serde_json::Map<String, serde_json::Value> {
/// Create Zinit service configuration from actor config
fn create_service_config(&self, actor: &ActorConfig) -> serde_json::Map<String, serde_json::Value> {
use serde_json::{Map, Value};
let mut config = Map::new();
@@ -277,117 +277,117 @@ impl Supervisor {
config.insert(
"exec".to_string(),
Value::String(format!("{} {}",
worker.binary_path.display(),
worker.args.join(" ")
actor.binary_path.display(),
actor.args.join(" ")
))
);
config.insert(
"oneshot".to_string(),
Value::Bool(!worker.restart_on_exit)
Value::Bool(!actor.restart_on_exit)
);
if let Some(health_check) = &worker.health_check {
if let Some(health_check) = &actor.health_check {
config.insert("test".to_string(), Value::String(health_check.clone()));
}
if !worker.dependencies.is_empty() {
config.insert("after".to_string(), json!(worker.dependencies));
if !actor.dependencies.is_empty() {
config.insert("after".to_string(), json!(actor.dependencies));
}
// Add environment variables if any
if !worker.env.is_empty() {
config.insert("env".to_string(), json!(worker.env));
if !actor.env.is_empty() {
config.insert("env".to_string(), json!(actor.env));
}
config
}
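For reference, the map built above corresponds to a Zinit service definition of roughly the following shape, written here with serde_json's `json!` macro. All values are placeholders; the optional keys appear only when the corresponding `ActorConfig` fields are set.

```rust
// Approximate shape of the service definition produced by create_service_config.
fn example_service_definition() -> serde_json::Value {
    serde_json::json!({
        "exec": "/usr/local/bin/example_actor --some-flag value",
        "oneshot": false,                                 // inverse of restart_on_exit
        "test": "/usr/local/bin/example_actor --health",  // only with a health check
        "after": ["some_dependency"],                     // only with dependencies
        "env": { "EXAMPLE_VAR": "1" }                     // only with env vars
    })
}
```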
/// Launch workers based on SupervisorBuilder configuration
pub(crate) async fn launch_configured_workers(&self, builder: &crate::SupervisorBuilderData) -> Result<(), SupervisorError> {
/// Launch actors based on SupervisorBuilder configuration
pub(crate) async fn launch_configured_actors(&self, builder: &crate::SupervisorBuilderData) -> Result<(), SupervisorError> {
use hero_job::ScriptType;
use std::path::PathBuf;
let mut errors = Vec::new();
// Launch OSIS worker if configured
if let Some(binary_path) = &builder.osis_worker {
let worker_id = "osis_worker_1";
let mut config = WorkerConfig::new(
worker_id.to_string(),
// Launch OSIS actor if configured
if let Some(binary_path) = &builder.osis_actor {
let actor_id = "osis_actor_1";
let mut config = ActorConfig::new(
actor_id.to_string(),
PathBuf::from(binary_path),
ScriptType::OSIS
);
config.env.extend(builder.worker_env_vars.clone());
config.env.extend(builder.actor_env_vars.clone());
info!("Launching OSIS worker: {}", worker_id);
if let Err(e) = self.start_worker(&config).await {
let error_msg = format!("Failed to start OSIS worker: {}", e);
info!("Launching OSIS actor: {}", actor_id);
if let Err(e) = self.start_actor(&config).await {
let error_msg = format!("Failed to start OSIS actor: {}", e);
warn!("{}", error_msg);
errors.push(error_msg);
}
}
// Launch SAL worker if configured
if let Some(binary_path) = &builder.sal_worker {
let worker_id = "sal_worker_1";
let mut config = WorkerConfig::new(
worker_id.to_string(),
// Launch SAL actor if configured
if let Some(binary_path) = &builder.sal_actor {
let actor_id = "sal_actor_1";
let mut config = ActorConfig::new(
actor_id.to_string(),
PathBuf::from(binary_path),
ScriptType::SAL
);
config.env.extend(builder.worker_env_vars.clone());
config.env.extend(builder.actor_env_vars.clone());
info!("Launching SAL worker: {}", worker_id);
if let Err(e) = self.start_worker(&config).await {
let error_msg = format!("Failed to start SAL worker: {}", e);
info!("Launching SAL actor: {}", actor_id);
if let Err(e) = self.start_actor(&config).await {
let error_msg = format!("Failed to start SAL actor: {}", e);
warn!("{}", error_msg);
errors.push(error_msg);
}
}
// Launch V worker if configured
if let Some(binary_path) = &builder.v_worker {
let worker_id = "v_worker_1";
let mut config = WorkerConfig::new(
worker_id.to_string(),
// Launch V actor if configured
if let Some(binary_path) = &builder.v_actor {
let actor_id = "v_actor_1";
let mut config = ActorConfig::new(
actor_id.to_string(),
PathBuf::from(binary_path),
ScriptType::V
);
config.env.extend(builder.worker_env_vars.clone());
config.env.extend(builder.actor_env_vars.clone());
info!("Launching V worker: {}", worker_id);
if let Err(e) = self.start_worker(&config).await {
let error_msg = format!("Failed to start V worker: {}", e);
info!("Launching V actor: {}", actor_id);
if let Err(e) = self.start_actor(&config).await {
let error_msg = format!("Failed to start V actor: {}", e);
warn!("{}", error_msg);
errors.push(error_msg);
}
}
// Launch Python worker if configured
if let Some(binary_path) = &builder.python_worker {
let worker_id = "python_worker_1";
let mut config = WorkerConfig::new(
worker_id.to_string(),
// Launch Python actor if configured
if let Some(binary_path) = &builder.python_actor {
let actor_id = "python_actor_1";
let mut config = ActorConfig::new(
actor_id.to_string(),
PathBuf::from(binary_path),
ScriptType::Python
);
config.env.extend(builder.worker_env_vars.clone());
config.env.extend(builder.actor_env_vars.clone());
info!("Launching Python worker: {}", worker_id);
if let Err(e) = self.start_worker(&config).await {
let error_msg = format!("Failed to start Python worker: {}", e);
info!("Launching Python actor: {}", actor_id);
if let Err(e) = self.start_actor(&config).await {
let error_msg = format!("Failed to start Python actor: {}", e);
warn!("{}", error_msg);
errors.push(error_msg);
}
}
// Return result based on whether any workers started successfully
// Return result based on whether any actors started successfully
if errors.is_empty() {
info!("All configured workers started successfully");
info!("All configured actors started successfully");
Ok(())
} else {
let combined_error = format!("Some workers failed to start: {}", errors.join("; "));
let combined_error = format!("Some actors failed to start: {}", errors.join("; "));
warn!("{}", combined_error);
Err(SupervisorError::ZinitError(combined_error))
}