refactor wip
@@ -7,19 +7,35 @@ edition = "2021"
name = "supervisor"
path = "cmd/supervisor.rs"

[[bin]]
name = "hive-supervisor"
path = "cmd/hive_supervisor.rs"

[[bin]]
name = "hive-supervisor-tui"
path = "cmd/hive_supervisor_tui.rs"

[[bin]]
name = "hive-supervisor-tui-safe"
path = "cmd/hive_supervisor_tui_safe.rs"

[dependencies]
clap = { version = "4.4", features = ["derive"] }
env_logger = "0.10"
redis = { version = "0.25.0", features = ["tokio-comp"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
toml = "0.8"
uuid = { version = "1.6", features = ["v4", "serde"] }
chrono = { version = "0.4", features = ["serde"] }
log = "0.4"
tokio = { version = "1", features = ["macros", "rt-multi-thread"] } # For async main in examples, and general async
colored = "2.0"
hero_job = { path = "../job" }
zinit-client = "0.4.0"
zinit-client = { path = "/Users/timurgordon/code/github/threefoldtech/zinit/zinit-client" }
ratatui = "0.28"
crossterm = "0.28"
anyhow = "1.0"

[dev-dependencies] # For examples later
env_logger = "0.10"
@@ -8,8 +8,6 @@ The lifecycle management system provides:

- **Worker Process Management**: Start, stop, restart, and monitor worker binaries
- **Health Monitoring**: Automatic ping jobs every 10 minutes for idle workers
- **Load Balancing**: Dynamic scaling of workers based on demand
- **Service Dependencies**: Proper startup ordering with dependency management
- **Graceful Shutdown**: Clean termination of worker processes

## Architecture

@@ -313,3 +311,9 @@ redis-cli keys "hero:job:*"

- **User Permissions**: Run workers with appropriate user permissions
- **Network Security**: Secure Redis and Zinit socket access
- **Binary Validation**: Verify worker binary integrity before deployment

## Future

- **Load Balancing**: Dynamic scaling of workers based on demand
- **Service Dependencies**: Proper startup ordering with dependency management
@@ -1,157 +1,66 @@
|
||||
# Rhai Client Binary
|
||||
# Supervisor CLI
|
||||
|
||||
A command-line client for executing Rhai scripts on remote workers via Redis.
|
||||
A command-line interface for the Hero Supervisor.
|
||||
|
||||
## Binary: `client`
|
||||
## Binary: `hive-supervisor`
|
||||
|
||||
### Installation
|
||||
|
||||
Build the binary:
|
||||
```bash
|
||||
cargo build --bin client --release
|
||||
cargo build --bin hive-supervisor --release
|
||||
```
|
||||
|
||||
|
||||
### Usage
|
||||
|
||||
```bash
|
||||
# Basic usage - requires caller and circle keys
|
||||
client --caller-key <CALLER_KEY> --circle-key <CIRCLE_KEY>
|
||||
|
||||
# Execute inline script
|
||||
client -c <CALLER_KEY> -k <CIRCLE_KEY> --script "print('Hello World!')"
|
||||
|
||||
# Execute script from file
|
||||
client -c <CALLER_KEY> -k <CIRCLE_KEY> --file script.rhai
|
||||
|
||||
# Use specific worker (defaults to circle key)
|
||||
client -c <CALLER_KEY> -k <CIRCLE_KEY> -w <WORKER_KEY> --script "2 + 2"
|
||||
|
||||
# Custom Redis and timeout
|
||||
client -c <CALLER_KEY> -k <CIRCLE_KEY> --redis-url redis://localhost:6379/1 --timeout 60
|
||||
|
||||
# Remove timestamps from logs
|
||||
client -c <CALLER_KEY> -k <CIRCLE_KEY> --no-timestamp
|
||||
|
||||
# Increase verbosity
|
||||
client -c <CALLER_KEY> -k <CIRCLE_KEY> -v --script "debug_info()"
|
||||
```
|
||||
|
||||
### Command-Line Options
|
||||
|
||||
| Option | Short | Default | Description |
|
||||
|--------|-------|---------|-------------|
|
||||
| `--caller-key` | `-c` | **Required** | Caller public key (your identity) |
|
||||
| `--circle-key` | `-k` | **Required** | Circle public key (execution context) |
|
||||
| `--worker-key` | `-w` | `circle-key` | Worker public key (target worker) |
|
||||
| `--redis-url` | `-r` | `redis://localhost:6379` | Redis connection URL |
|
||||
| `--script` | `-s` | | Rhai script to execute |
|
||||
| `--file` | `-f` | | Path to Rhai script file |
|
||||
| `--timeout` | `-t` | `30` | Timeout for script execution (seconds) |
|
||||
| `--no-timestamp` | | `false` | Remove timestamps from log output |
|
||||
| `--verbose` | `-v` | | Increase verbosity (stackable) |
|
||||
|
||||
### Execution Modes
|
||||
|
||||
#### Inline Script Execution
|
||||
```bash
|
||||
# Execute a simple calculation
|
||||
client -c caller_123 -k circle_456 -s "let result = 2 + 2; print(result);"
|
||||
|
||||
# Execute with specific worker
|
||||
client -c caller_123 -k circle_456 -w worker_789 -s "get_user_data()"
|
||||
```
|
||||
|
||||
#### Script File Execution
|
||||
```bash
|
||||
# Execute script from file
|
||||
client -c caller_123 -k circle_456 -f examples/data_processing.rhai
|
||||
|
||||
# Execute with custom timeout
|
||||
client -c caller_123 -k circle_456 -f long_running_script.rhai -t 120
|
||||
```
|
||||
|
||||
#### Interactive Mode
|
||||
```bash
|
||||
# Enter interactive REPL mode (when no script or file provided)
|
||||
client -c caller_123 -k circle_456
|
||||
|
||||
# Interactive mode with verbose logging
|
||||
client -c caller_123 -k circle_456 -v --no-timestamp
|
||||
```
|
||||
|
||||
### Interactive Mode
|
||||
|
||||
When no script (`-s`) or file (`-f`) is provided, the client enters interactive mode:
|
||||
# Basic usage
|
||||
hive-supervisor --config <CONFIG_PATH>
|
||||
|
||||
```
|
||||
🔗 Starting Rhai Client
|
||||
📋 Configuration:
|
||||
Caller Key: caller_123
|
||||
Circle Key: circle_456
|
||||
Worker Key: circle_456
|
||||
Redis URL: redis://localhost:6379
|
||||
Timeout: 30s
|
||||
|
||||
✅ Connected to Redis at redis://localhost:6379
|
||||
🎮 Entering interactive mode
|
||||
Type Rhai scripts and press Enter to execute. Type 'exit' or 'quit' to close.
|
||||
rhai> let x = 42; print(x);
|
||||
Status: completed
|
||||
Output: 42
|
||||
rhai> exit
|
||||
👋 Goodbye!
|
||||
Where the config is a TOML file with the following structure:
|
||||
```toml
|
||||
[global]
|
||||
redis_url = "redis://localhost:6379"
|
||||
|
||||
[osis_worker]
|
||||
binary_path = "/path/to/osis_worker"
|
||||
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }
|
||||
|
||||
[sal_worker]
|
||||
binary_path = "/path/to/sal_worker"
|
||||
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }
|
||||
|
||||
[v_worker]
|
||||
binary_path = "/path/to/v_worker"
|
||||
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }
|
||||
|
||||
[python_worker]
|
||||
binary_path = "/path/to/python_worker"
|
||||
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }
|
||||
```
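
To make the wiring concrete, here is a minimal sketch of loading such a config programmatically. It assumes the `SupervisorBuilder::from_toml` constructor and the async `build()` method that appear later in this commit; the file path is illustrative.

```rust
use hero_supervisor::SupervisorBuilder;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Path is illustrative; point it at a TOML file with the structure above.
    let supervisor = SupervisorBuilder::from_toml("examples/supervisor_config.toml")?
        .build()
        .await?;

    // Launch the workers configured in the [*_worker] sections.
    supervisor.start_workers().await?;
    Ok(())
}
```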
|
||||
|
||||
### Configuration Examples
|
||||
|
||||
#### Development Usage
|
||||
```bash
|
||||
# Simple development client
|
||||
client -c dev_user -k dev_circle
|
||||
Let's have verbosity settings, etc.
|
||||
The CLI offers a few commands:
|
||||
|
||||
# Development with clean logs
|
||||
client -c dev_user -k dev_circle --no-timestamp -v
|
||||
```
|
||||
workers:
|
||||
start
|
||||
stop
|
||||
restart
|
||||
status
|
||||
logs
|
||||
list
|
||||
|
||||
#### Production Usage
|
||||
```bash
|
||||
# Production client with specific worker
|
||||
client \
|
||||
--caller-key prod_user_123 \
|
||||
--circle-key prod_circle_456 \
|
||||
--worker-key prod_worker_789 \
|
||||
--redis-url redis://redis-cluster:6379/0 \
|
||||
--timeout 300 \
|
||||
--file production_script.rhai
|
||||
```
|
||||
|
||||
#### Batch Processing
|
||||
```bash
|
||||
# Process multiple scripts
|
||||
for script in scripts/*.rhai; do
|
||||
client -c batch_user -k batch_circle -f "$script" --no-timestamp
|
||||
done
|
||||
```
|
||||
|
||||
### Key Concepts
|
||||
|
||||
- **Caller Key**: Your identity - used for authentication and tracking
|
||||
- **Circle Key**: Execution context - defines the environment/permissions
|
||||
- **Worker Key**: Target worker - which worker should execute the script (defaults to circle key)
|
||||
|
||||
### Error Handling
|
||||
|
||||
The client provides clear error messages for:
|
||||
- Missing required keys
|
||||
- Redis connection failures
|
||||
- Script execution timeouts
|
||||
- Worker unavailability
|
||||
- Script syntax errors
|
||||
|
||||
### Dependencies
|
||||
|
||||
- `rhai_supervisor`: Core client library for Redis-based script execution
|
||||
- `redis`: Redis client for task queue communication
|
||||
- `clap`: Command-line argument parsing
|
||||
- `env_logger`: Logging infrastructure
|
||||
- `tokio`: Async runtime
|
||||
jobs:
|
||||
create
|
||||
start
|
||||
stop
|
||||
restart
|
||||
status
|
||||
logs
|
||||
list
|
||||
|
||||
repl: you can enter interactive mode to run scripts; predefine caller_id, context_id, and worker type so the supervisor dispatches jobs accordingly
|
365 core/supervisor/cmd/hive_supervisor_tui_safe.rs (new file)
@@ -0,0 +1,365 @@
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use crossterm::{
|
||||
event::{self, DisableMouseCapture, EnableMouseCapture, Event, KeyCode, KeyEventKind},
|
||||
execute,
|
||||
terminal::{disable_raw_mode, enable_raw_mode, EnterAlternateScreen, LeaveAlternateScreen},
|
||||
};
|
||||
use hero_supervisor::{Supervisor, SupervisorBuilder};
|
||||
use zinit_client::ZinitClient;
|
||||
use log::{error, info};
|
||||
use ratatui::{
|
||||
backend::CrosstermBackend,
|
||||
layout::{Constraint, Direction, Layout, Rect},
|
||||
style::{Color, Modifier, Style},
|
||||
text::Line,
|
||||
widgets::{
|
||||
Block, Borders, List, ListItem, Paragraph, Tabs, Wrap,
|
||||
},
|
||||
Frame, Terminal,
|
||||
};
|
||||
use std::{
|
||||
io,
|
||||
path::PathBuf,
|
||||
sync::Arc,
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
use tokio::time::sleep;
|
||||
use toml;
|
||||
use serde::Deserialize;
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(name = "hive-supervisor-tui")]
|
||||
#[command(about = "Hero Supervisor Terminal User Interface")]
|
||||
struct Args {
|
||||
#[arg(short, long, help = "Configuration file path")]
|
||||
config: PathBuf,
|
||||
|
||||
#[arg(short, long, help = "Enable verbose logging")]
|
||||
verbose: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct Config {
|
||||
global: GlobalConfig,
|
||||
#[serde(flatten)]
|
||||
workers: std::collections::HashMap<String, WorkerConfigToml>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct GlobalConfig {
|
||||
redis_url: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct WorkerConfigToml {
|
||||
binary_path: String,
|
||||
env_vars: Option<std::collections::HashMap<String, String>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
enum TabId {
|
||||
Dashboard,
|
||||
Workers,
|
||||
Jobs,
|
||||
Logs,
|
||||
}
|
||||
|
||||
impl TabId {
|
||||
fn all() -> Vec<TabId> {
|
||||
vec![TabId::Dashboard, TabId::Workers, TabId::Jobs, TabId::Logs]
|
||||
}
|
||||
|
||||
fn title(&self) -> &str {
|
||||
match self {
|
||||
TabId::Dashboard => "Dashboard",
|
||||
TabId::Workers => "Workers",
|
||||
TabId::Jobs => "Jobs",
|
||||
TabId::Logs => "Logs",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct App {
|
||||
supervisor: Arc<Supervisor>,
|
||||
current_tab: TabId,
|
||||
should_quit: bool,
|
||||
logs: Vec<String>,
|
||||
last_update: Instant,
|
||||
}
|
||||
|
||||
impl App {
|
||||
fn new(supervisor: Arc<Supervisor>) -> Self {
|
||||
Self {
|
||||
supervisor,
|
||||
current_tab: TabId::Dashboard,
|
||||
should_quit: false,
|
||||
logs: vec!["TUI started successfully".to_string()],
|
||||
last_update: Instant::now(),
|
||||
}
|
||||
}
|
||||
|
||||
fn next_tab(&mut self) {
|
||||
let tabs = TabId::all();
|
||||
let current_index = tabs.iter().position(|t| *t == self.current_tab).unwrap_or(0);
|
||||
let next_index = (current_index + 1) % tabs.len();
|
||||
self.current_tab = tabs[next_index].clone();
|
||||
}
|
||||
|
||||
fn prev_tab(&mut self) {
|
||||
let tabs = TabId::all();
|
||||
let current_index = tabs.iter().position(|t| *t == self.current_tab).unwrap_or(0);
|
||||
let prev_index = if current_index == 0 { tabs.len() - 1 } else { current_index - 1 };
|
||||
self.current_tab = tabs[prev_index].clone();
|
||||
}
|
||||
|
||||
fn add_log(&mut self, message: String) {
|
||||
self.logs.push(format!("[{}] {}",
|
||||
chrono::Utc::now().format("%H:%M:%S"),
|
||||
message
|
||||
));
|
||||
if self.logs.len() > 100 {
|
||||
self.logs.remove(0);
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_key(&mut self, key: KeyCode) -> bool {
|
||||
match key {
|
||||
KeyCode::Char('q') => {
|
||||
self.should_quit = true;
|
||||
true
|
||||
}
|
||||
KeyCode::Tab => {
|
||||
self.next_tab();
|
||||
false
|
||||
}
|
||||
KeyCode::BackTab => {
|
||||
self.prev_tab();
|
||||
false
|
||||
}
|
||||
_ => false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn render_ui(f: &mut Frame, app: &mut App) {
|
||||
let chunks = Layout::default()
|
||||
.direction(Direction::Vertical)
|
||||
.constraints([Constraint::Length(3), Constraint::Min(0)].as_ref())
|
||||
.split(f.area());
|
||||
|
||||
// Render tabs
|
||||
let tabs_list = TabId::all();
|
||||
let tab_titles: Vec<Line> = tabs_list
|
||||
.iter()
|
||||
.map(|t| Line::from(t.title()))
|
||||
.collect();
|
||||
|
||||
let selected_tab = TabId::all().iter().position(|t| *t == app.current_tab).unwrap_or(0);
|
||||
let tabs = Tabs::new(tab_titles)
|
||||
.block(Block::default().borders(Borders::ALL).title("Hero Supervisor TUI"))
|
||||
.select(selected_tab)
|
||||
.style(Style::default().fg(Color::Cyan))
|
||||
.highlight_style(Style::default().add_modifier(Modifier::BOLD).bg(Color::Black));
|
||||
|
||||
f.render_widget(tabs, chunks[0]);
|
||||
|
||||
// Render content based on selected tab
|
||||
match app.current_tab {
|
||||
TabId::Dashboard => render_dashboard(f, chunks[1], app),
|
||||
TabId::Workers => render_workers(f, chunks[1], app),
|
||||
TabId::Jobs => render_jobs(f, chunks[1], app),
|
||||
TabId::Logs => render_logs(f, chunks[1], app),
|
||||
}
|
||||
}
|
||||
|
||||
fn render_dashboard(f: &mut Frame, area: Rect, app: &App) {
|
||||
let chunks = Layout::default()
|
||||
.direction(Direction::Vertical)
|
||||
.constraints([Constraint::Length(7), Constraint::Min(0)].as_ref())
|
||||
.split(area);
|
||||
|
||||
// Status overview - supervisor is already running if we get here
|
||||
let status_text = "Status: ✓ Running\nWorkers: Started successfully\nJobs: Ready for processing\n\nPress 'q' to quit, Tab to navigate";
|
||||
|
||||
let status_paragraph = Paragraph::new(status_text)
|
||||
.block(Block::default().borders(Borders::ALL).title("System Status"))
|
||||
.wrap(Wrap { trim: true });
|
||||
|
||||
f.render_widget(status_paragraph, chunks[0]);
|
||||
|
||||
// Recent logs
|
||||
let log_items: Vec<ListItem> = app.logs
|
||||
.iter()
|
||||
.rev()
|
||||
.take(10)
|
||||
.map(|log| ListItem::new(log.as_str()))
|
||||
.collect();
|
||||
|
||||
let logs_list = List::new(log_items)
|
||||
.block(Block::default().borders(Borders::ALL).title("Recent Activity"));
|
||||
|
||||
f.render_widget(logs_list, chunks[1]);
|
||||
}
|
||||
|
||||
fn render_workers(f: &mut Frame, area: Rect, _app: &App) {
|
||||
let paragraph = Paragraph::new("Workers tab - Status checking not implemented yet to avoid system issues")
|
||||
.block(Block::default().borders(Borders::ALL).title("Workers"))
|
||||
.wrap(Wrap { trim: true });
|
||||
|
||||
f.render_widget(paragraph, area);
|
||||
}
|
||||
|
||||
fn render_jobs(f: &mut Frame, area: Rect, _app: &App) {
|
||||
let paragraph = Paragraph::new("Jobs tab - Job monitoring not implemented yet to avoid system issues")
|
||||
.block(Block::default().borders(Borders::ALL).title("Jobs"))
|
||||
.wrap(Wrap { trim: true });
|
||||
|
||||
f.render_widget(paragraph, area);
|
||||
}
|
||||
|
||||
fn render_logs(f: &mut Frame, area: Rect, app: &App) {
|
||||
let items: Vec<ListItem> = app.logs
|
||||
.iter()
|
||||
.map(|log| ListItem::new(log.as_str()))
|
||||
.collect();
|
||||
|
||||
let logs_list = List::new(items)
|
||||
.block(Block::default().borders(Borders::ALL).title("System Logs"));
|
||||
|
||||
f.render_widget(logs_list, area);
|
||||
}
|
||||
|
||||
async fn run_app(
|
||||
terminal: &mut Terminal<CrosstermBackend<io::Stdout>>,
|
||||
app: &mut App,
|
||||
) -> Result<()> {
|
||||
loop {
|
||||
terminal.draw(|f| render_ui(f, app))?;
|
||||
|
||||
// Simple, safe event handling
|
||||
if event::poll(Duration::from_millis(100))? {
|
||||
if let Event::Key(key) = event::read()? {
|
||||
if key.kind == KeyEventKind::Press {
|
||||
if app.handle_key(key.code) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if app.should_quit {
|
||||
break;
|
||||
}
|
||||
|
||||
// Small delay to prevent excessive CPU usage
|
||||
sleep(Duration::from_millis(50)).await;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
// Initialize logging
|
||||
if args.verbose {
|
||||
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("debug")).init();
|
||||
} else {
|
||||
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();
|
||||
}
|
||||
|
||||
info!("Hero Supervisor TUI - Fail-fast initialization");
|
||||
|
||||
// Step 1: Load and parse configuration
|
||||
info!("Step 1/4: Loading configuration from {:?}", args.config);
|
||||
let config_content = std::fs::read_to_string(&args.config)
|
||||
.map_err(|e| anyhow::anyhow!("Failed to read config file: {}", e))?;
|
||||
let config: Config = toml::from_str(&config_content)
|
||||
.map_err(|e| anyhow::anyhow!("Failed to parse config file: {}", e))?;
|
||||
info!("✓ Configuration loaded successfully");
|
||||
|
||||
// Step 2: Check if Zinit is running
|
||||
info!("Step 2/4: Checking if Zinit is running...");
|
||||
let zinit_client = ZinitClient::new("/tmp/zinit.sock");
|
||||
match zinit_client.status("_test_connectivity").await {
|
||||
Ok(_) => {
|
||||
info!("✓ Zinit is running and accessible");
|
||||
}
|
||||
Err(e) => {
|
||||
let error_msg = e.to_string();
|
||||
if error_msg.contains("Connection refused") || error_msg.contains("No such file") {
|
||||
eprintln!("Error: Zinit process manager is not running.");
|
||||
eprintln!("Please start Zinit before running the supervisor TUI.");
|
||||
eprintln!("Expected Zinit socket at: /tmp/zinit.sock");
|
||||
std::process::exit(1);
|
||||
} else {
|
||||
info!("✓ Zinit is running (service not found is expected)");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Step 3: Build supervisor
|
||||
info!("Step 3/4: Building supervisor...");
|
||||
let mut builder = SupervisorBuilder::new()
|
||||
.redis_url(&config.global.redis_url);
|
||||
|
||||
for (worker_name, worker_config) in &config.workers {
|
||||
match worker_name.as_str() {
|
||||
"osis_worker" => builder = builder.osis_worker(&worker_config.binary_path),
|
||||
"sal_worker" => builder = builder.sal_worker(&worker_config.binary_path),
|
||||
"v_worker" => builder = builder.v_worker(&worker_config.binary_path),
|
||||
"python_worker" => builder = builder.python_worker(&worker_config.binary_path),
|
||||
_ => log::warn!("Unknown worker type: {}", worker_name),
|
||||
}
|
||||
|
||||
if let Some(env_vars) = &worker_config.env_vars {
|
||||
for (key, value) in env_vars {
|
||||
builder = builder.worker_env_var(key, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let supervisor = Arc::new(builder.build()
|
||||
.map_err(|e| anyhow::anyhow!("Failed to build supervisor: {}", e))?);
|
||||
info!("✓ Supervisor built successfully");
|
||||
|
||||
// Step 4: Start supervisor and workers
|
||||
info!("Step 4/4: Starting supervisor and workers...");
|
||||
supervisor.start_workers().await
|
||||
.map_err(|e| anyhow::anyhow!("Failed to start workers: {}", e))?;
|
||||
info!("✓ All workers started successfully");
|
||||
|
||||
// All initialization successful - now start TUI
|
||||
info!("Initialization complete - starting TUI...");
|
||||
let mut app = App::new(Arc::clone(&supervisor));
|
||||
|
||||
// Setup terminal
|
||||
enable_raw_mode()?;
|
||||
let mut stdout = io::stdout();
|
||||
execute!(stdout, EnterAlternateScreen, EnableMouseCapture)?;
|
||||
let backend = CrosstermBackend::new(stdout);
|
||||
let mut terminal = Terminal::new(backend)?;
|
||||
|
||||
// Run the app
|
||||
let result = run_app(&mut terminal, &mut app).await;
|
||||
|
||||
// Cleanup
|
||||
disable_raw_mode()?;
|
||||
execute!(
|
||||
terminal.backend_mut(),
|
||||
LeaveAlternateScreen,
|
||||
DisableMouseCapture
|
||||
)?;
|
||||
terminal.show_cursor()?;
|
||||
|
||||
// Cleanup supervisor
|
||||
if let Err(e) = supervisor.cleanup_and_shutdown().await {
|
||||
error!("Error during cleanup: {}", e);
|
||||
}
|
||||
|
||||
info!("Hero Supervisor TUI shutdown complete");
|
||||
|
||||
result
|
||||
}
|
@@ -1,190 +0,0 @@
|
||||
# Architecture of the `rhai_supervisor` Crate
|
||||
|
||||
The `rhai_supervisor` crate provides a Redis-based client library for submitting Rhai scripts to distributed worker services and awaiting their execution results. It implements a request-reply pattern using Redis as the message broker.
|
||||
|
||||
## Core Architecture
|
||||
|
||||
The client follows a builder pattern design with clear separation of concerns:
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
A[RhaiSupervisorBuilder] --> B[RhaiSupervisor]
|
||||
B --> C[PlayRequestBuilder]
|
||||
C --> D[PlayRequest]
|
||||
D --> E[Redis Task Queue]
|
||||
E --> F[Worker Service]
|
||||
F --> G[Redis Reply Queue]
|
||||
G --> H[Client Response]
|
||||
|
||||
subgraph "Client Components"
|
||||
A
|
||||
B
|
||||
C
|
||||
D
|
||||
end
|
||||
|
||||
subgraph "Redis Infrastructure"
|
||||
E
|
||||
G
|
||||
end
|
||||
|
||||
subgraph "External Services"
|
||||
F
|
||||
end
|
||||
```
|
||||
|
||||
## Key Components
|
||||
|
||||
### 1. RhaiSupervisorBuilder
|
||||
|
||||
A builder pattern implementation for constructing `RhaiSupervisor` instances with proper configuration validation.
|
||||
|
||||
**Responsibilities:**
|
||||
- Configure Redis connection URL
|
||||
- Set caller ID for task attribution
|
||||
- Validate configuration before building client
|
||||
|
||||
**Key Methods:**
|
||||
- `caller_id(id: &str)` - Sets the caller identifier
|
||||
- `redis_url(url: &str)` - Configures Redis connection
|
||||
- `build()` - Creates the final `RhaiSupervisor` instance
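
As a rough illustration (not taken verbatim from the crate), the builder is used along these lines; `new()` and the error type returned by `build()` are assumptions based on the description above.

```rust
use rhai_supervisor::RhaiSupervisorBuilder;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical construction flow inferred from the methods listed above.
    let _supervisor = RhaiSupervisorBuilder::new()
        .caller_id("caller_123")
        .redis_url("redis://localhost:6379")
        .build()?;
    Ok(())
}
```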
|
||||
|
||||
### 2. RhaiSupervisor
|
||||
|
||||
The main client interface that manages Redis connections and provides factory methods for creating play requests.
|
||||
|
||||
**Responsibilities:**
|
||||
- Maintain Redis connection pool
|
||||
- Provide factory methods for request builders
|
||||
- Handle low-level Redis operations
|
||||
- Manage task status queries
|
||||
|
||||
**Key Methods:**
|
||||
- `new_play_request()` - Creates a new `PlayRequestBuilder`
|
||||
- `get_task_status(task_id)` - Queries task status from Redis
|
||||
- Internal methods for Redis operations
|
||||
|
||||
### 3. PlayRequestBuilder
|
||||
|
||||
A fluent builder for constructing and submitting script execution requests.
|
||||
|
||||
**Responsibilities:**
|
||||
- Configure script execution parameters
|
||||
- Handle script loading from files or strings
|
||||
- Manage request timeouts
|
||||
- Provide submission methods (fire-and-forget vs await-response)
|
||||
|
||||
**Key Methods:**
|
||||
- `worker_id(id: &str)` - Target worker queue (determines which worker processes the task)
|
||||
- `context_id(id: &str)` - Target context ID (determines execution context/circle)
|
||||
- `script(content: &str)` - Set script content directly
|
||||
- `script_path(path: &str)` - Load script from file
|
||||
- `timeout(duration: Duration)` - Set execution timeout
|
||||
- `submit()` - Fire-and-forget submission
|
||||
- `await_response()` - Submit and wait for result
|
||||
|
||||
**Architecture Note:** The decoupling of `worker_id` and `context_id` allows a single worker to process tasks for multiple contexts (circles), providing greater deployment flexibility.
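
Putting the listed methods together, a request might be built and awaited roughly as follows; this is a sketch only, and the exact signatures and return types are assumptions rather than the crate's confirmed API.

```rust
use std::time::Duration;

async fn run_script(
    supervisor: &rhai_supervisor::RhaiSupervisor,
) -> Result<(), Box<dyn std::error::Error>> {
    // Method names come from the list above; return types are assumed.
    let _result = supervisor
        .new_play_request()
        .worker_id("worker_789")        // target worker queue
        .context_id("circle_456")       // execution context (circle)
        .script("let x = 21; x * 2")
        .timeout(Duration::from_secs(30))
        .await_response()
        .await?;
    Ok(())
}
```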
|
||||
|
||||
### 4. Data Structures
|
||||
|
||||
#### RhaiTaskDetails
|
||||
Represents the complete state of a task throughout its lifecycle.
|
||||
|
||||
```rust
|
||||
pub struct RhaiTaskDetails {
|
||||
pub task_id: String,
|
||||
pub script: String,
|
||||
pub status: String, // "pending", "processing", "completed", "error"
|
||||
pub output: Option<String>,
|
||||
pub error: Option<String>,
|
||||
pub created_at: DateTime<Utc>,
|
||||
pub updated_at: DateTime<Utc>,
|
||||
pub caller_id: String,
|
||||
}
|
||||
```
|
||||
|
||||
#### RhaiSupervisorError
|
||||
Comprehensive error handling for various failure scenarios:
|
||||
- `RedisError` - Redis connection/operation failures
|
||||
- `SerializationError` - JSON serialization/deserialization issues
|
||||
- `Timeout` - Task execution timeouts
|
||||
- `TaskNotFound` - Missing tasks after submission
|
||||
|
||||
## Communication Protocol
|
||||
|
||||
### Task Submission Flow
|
||||
|
||||
1. **Task Creation**: Client generates unique UUID for task identification
|
||||
2. **Task Storage**: Task details stored in Redis hash: `rhailib:<task_id>`
|
||||
3. **Queue Submission**: Task ID pushed to worker queue: `rhailib:<worker_id>`
|
||||
4. **Reply Queue Setup**: Client listens on: `rhailib:reply:<task_id>`
|
||||
|
||||
### Redis Key Patterns
|
||||
|
||||
- **Task Storage**: `rhailib:<task_id>` (Redis Hash)
|
||||
- **Worker Queues**: `rhailib:<worker_id>` (Redis List)
|
||||
- **Reply Queues**: `rhailib:reply:<task_id>` (Redis List)
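
For reference, the same key layout expressed as raw redis-rs calls; this is illustrative only and not the crate's internal code (the hash field names are assumptions).

```rust
use redis::AsyncCommands;

// Illustrative only: mirrors the key patterns above, not the crate's internals.
async fn enqueue_task(
    con: &mut redis::aio::MultiplexedConnection,
    worker_id: &str,
    task_id: &str,
    script: &str,
) -> redis::RedisResult<()> {
    // Task details live in a hash at rhailib:<task_id> (field names assumed).
    let _: () = con.hset(format!("rhailib:{task_id}"), "script", script).await?;
    let _: () = con.hset(format!("rhailib:{task_id}"), "status", "pending").await?;
    // The task id is pushed onto the worker queue rhailib:<worker_id>.
    let _: () = con.lpush(format!("rhailib:{worker_id}"), task_id).await?;
    // The client then blocks on rhailib:reply:<task_id> (BLPOP) for the result.
    Ok(())
}
```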
|
||||
|
||||
### Message Flow Diagram
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant C as Client
|
||||
participant R as Redis
|
||||
participant W as Worker
|
||||
|
||||
C->>R: HSET rhailib:task_id (task details)
|
||||
C->>R: LPUSH rhailib:worker_id task_id
|
||||
C->>R: BLPOP rhailib:reply:task_id (blocking)
|
||||
|
||||
W->>R: BRPOP rhailib:worker_id (blocking)
|
||||
W->>W: Execute Rhai Script
|
||||
W->>R: LPUSH rhailib:reply:task_id (result)
|
||||
|
||||
R->>C: Return result from BLPOP
|
||||
C->>R: DEL rhailib:reply:task_id (cleanup)
|
||||
```
|
||||
|
||||
## Concurrency and Async Design
|
||||
|
||||
The client is built on `tokio` for asynchronous operations:
|
||||
|
||||
- **Connection Pooling**: Uses Redis multiplexed connections for efficiency
|
||||
- **Non-blocking Operations**: All Redis operations are async
|
||||
- **Timeout Handling**: Configurable timeouts with proper cleanup
|
||||
- **Error Propagation**: Comprehensive error handling with context
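
The timeout behaviour described above can be pictured with plain tokio; this generic helper is a sketch under the assumption that the tokio "time" feature is enabled, not the crate's own implementation.

```rust
use std::time::Duration;
use tokio::time::timeout;

// Generic illustration of per-request timeouts; not taken from the crate.
async fn await_with_timeout<F, T, E>(fut: F, secs: u64) -> Result<T, String>
where
    F: std::future::Future<Output = Result<T, E>>,
    E: std::fmt::Display,
{
    match timeout(Duration::from_secs(secs), fut).await {
        Ok(Ok(value)) => Ok(value),                          // completed in time
        Ok(Err(e)) => Err(format!("task failed: {e}")),      // task-level error
        Err(_) => Err(format!("timed out after {secs}s")),   // deadline elapsed
    }
}
```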
|
||||
|
||||
## Configuration and Deployment
|
||||
|
||||
### Prerequisites
|
||||
- Redis server accessible to both client and workers
|
||||
- Proper network connectivity between components
|
||||
- Sufficient Redis memory for task storage
|
||||
|
||||
### Configuration Options
|
||||
- **Redis URL**: Connection string for Redis instance
|
||||
- **Caller ID**: Unique identifier for client instance
|
||||
- **Timeouts**: Per-request timeout configuration
|
||||
- **Worker Targeting**: Direct worker queue addressing
|
||||
|
||||
## Security Considerations
|
||||
|
||||
- **Task Isolation**: Each task uses unique identifiers
|
||||
- **Queue Separation**: Worker-specific queues prevent cross-contamination
|
||||
- **Cleanup**: Automatic cleanup of reply queues after completion
|
||||
- **Error Handling**: Secure error propagation without sensitive data leakage
|
||||
|
||||
## Performance Characteristics
|
||||
|
||||
- **Scalability**: Horizontal scaling through multiple worker instances
|
||||
- **Throughput**: Limited by Redis performance and network latency
|
||||
- **Memory Usage**: Efficient with connection pooling and cleanup
|
||||
- **Latency**: Low latency for local Redis deployments
|
||||
|
||||
## Integration Points
|
||||
|
||||
The client integrates with:
|
||||
- **Worker Services**: Via Redis queue protocol
|
||||
- **Monitoring Systems**: Through structured logging
|
||||
- **Application Code**: Via builder pattern API
|
||||
- **Configuration Systems**: Through environment variables and builders
|
185 core/supervisor/examples/cli/README.md (new file)
@@ -0,0 +1,185 @@
|
||||
# Hero Supervisor CLI Example
|
||||
|
||||
This example demonstrates how to use the `hive-supervisor` CLI tool for managing workers and jobs in the Hero ecosystem.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. **Redis Server**: Make sure Redis is running on `localhost:6379`
|
||||
```bash
|
||||
# Install Redis (macOS)
|
||||
brew install redis
|
||||
|
||||
# Start Redis
|
||||
redis-server
|
||||
```
|
||||
|
||||
2. **Zinit Process Manager**: Install and configure Zinit
|
||||
```bash
|
||||
# Install Zinit (example for Linux/macOS)
|
||||
# Follow Zinit installation instructions for your platform
|
||||
```
|
||||
|
||||
3. **Worker Binaries**: The configuration references worker binaries that need to be available:
|
||||
- `/usr/local/bin/osis_worker`
|
||||
- `/usr/local/bin/sal_worker`
|
||||
- `/usr/local/bin/v_worker`
|
||||
- `/usr/local/bin/python_worker`
|
||||
|
||||
For testing purposes, you can create mock worker binaries or update the paths in `config.toml` to point to existing binaries.
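
If you need such a mock, a stand-in binary only has to stay alive so the supervisor and Zinit can manage it. A minimal hypothetical sketch (it does not connect to Redis or process jobs):

```rust
// mock_worker.rs: hypothetical placeholder binary for local testing only.
// It reads WORKER_TYPE (set via env_vars in config.toml) and then idles.
use std::{thread, time::Duration};

fn main() {
    let worker_type = std::env::var("WORKER_TYPE").unwrap_or_else(|_| "unknown".to_string());
    eprintln!("mock worker ({worker_type}) started");
    loop {
        thread::sleep(Duration::from_secs(60));
    }
}
```

Build it once and point each `binary_path` in `config.toml` at the resulting executable.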
|
||||
|
||||
## Configuration
|
||||
|
||||
The `config.toml` file contains the supervisor configuration:
|
||||
|
||||
- **Global settings**: Redis URL and Zinit socket path
|
||||
- **Worker configurations**: Binary paths and environment variables for each worker type
|
||||
|
||||
## Usage Examples
|
||||
|
||||
### 1. Build the CLI
|
||||
|
||||
```bash
|
||||
# From the supervisor directory
|
||||
cargo build --bin hive-supervisor --release
|
||||
```
|
||||
|
||||
### 2. Worker Management
|
||||
|
||||
```bash
|
||||
# Show help
|
||||
./target/release/hive-supervisor --config examples/cli/config.toml --help
|
||||
|
||||
# List all configured workers
|
||||
./target/release/hive-supervisor --config examples/cli/config.toml workers list
|
||||
|
||||
# Start all workers
|
||||
./target/release/hive-supervisor --config examples/cli/config.toml workers start
|
||||
|
||||
# Start specific workers
|
||||
./target/release/hive-supervisor --config examples/cli/config.toml workers start osis_worker sal_worker
|
||||
|
||||
# Check worker status
|
||||
./target/release/hive-supervisor --config examples/cli/config.toml workers status
|
||||
|
||||
# Stop all workers
|
||||
./target/release/hive-supervisor --config examples/cli/config.toml workers stop
|
||||
|
||||
# Restart specific worker
|
||||
./target/release/hive-supervisor --config examples/cli/config.toml workers restart osis_worker
|
||||
```
|
||||
|
||||
### 3. Job Management
|
||||
|
||||
```bash
|
||||
# Create a job with inline script
|
||||
./target/release/hive-supervisor --config examples/cli/config.toml jobs create \
|
||||
--script 'print("Hello from OSIS worker!");' \
|
||||
--script-type osis \
|
||||
--caller-id "user123" \
|
||||
--context-id "session456"
|
||||
|
||||
# Create a job from file
|
||||
./target/release/hive-supervisor --config examples/cli/config.toml jobs create \
|
||||
--file examples/cli/sample_script.rhai \
|
||||
--script-type osis \
|
||||
--caller-id "user123" \
|
||||
--context-id "session456"
|
||||
|
||||
# List all jobs
|
||||
./target/release/hive-supervisor --config examples/cli/config.toml jobs list
|
||||
|
||||
# Check job status
|
||||
./target/release/hive-supervisor --config examples/cli/config.toml jobs status <JOB_ID>
|
||||
|
||||
# View job logs
|
||||
./target/release/hive-supervisor --config examples/cli/config.toml jobs logs <JOB_ID>
|
||||
|
||||
# Stop a job
|
||||
./target/release/hive-supervisor --config examples/cli/config.toml jobs stop <JOB_ID>
|
||||
```
|
||||
|
||||
### 4. Interactive REPL Mode
|
||||
|
||||
```bash
|
||||
# Enter REPL mode for OSIS scripts
|
||||
./target/release/hive-supervisor --config examples/cli/config.toml repl \
|
||||
--caller-id "user123" \
|
||||
--context-id "session456" \
|
||||
--script-type osis \
|
||||
--timeout 60
|
||||
|
||||
# In REPL mode, you can:
|
||||
# - Type scripts directly and press Enter to execute
|
||||
# - Type 'help' for available commands
|
||||
# - Type 'exit' or 'quit' to leave REPL mode
|
||||
```
|
||||
|
||||
### 5. Verbose Logging
|
||||
|
||||
```bash
|
||||
# Enable debug logging
|
||||
./target/release/hive-supervisor --config examples/cli/config.toml -v workers status
|
||||
|
||||
# Enable trace logging
|
||||
./target/release/hive-supervisor --config examples/cli/config.toml -vv workers status
|
||||
|
||||
# Disable timestamps
|
||||
./target/release/hive-supervisor --config examples/cli/config.toml --no-timestamp workers status
|
||||
```
|
||||
|
||||
## Sample Scripts
|
||||
|
||||
The `sample_scripts/` directory contains example scripts for different worker types:
|
||||
|
||||
- `hello_osis.rhai` - Simple OSIS/HeroScript example
|
||||
- `system_sal.rhai` - SAL system operation example
|
||||
- `math_v.v` - V language calculation example
|
||||
- `data_python.py` - Python data processing example
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
1. **Redis Connection Error**
|
||||
- Ensure Redis is running: `redis-cli ping`
|
||||
- Check the Redis URL in `config.toml`
|
||||
|
||||
2. **Zinit Socket Error**
|
||||
- Verify Zinit is running and the socket path is correct
|
||||
- Check permissions on the socket file
|
||||
|
||||
3. **Worker Binary Not Found**
|
||||
- Update binary paths in `config.toml` to match your system
|
||||
- Ensure worker binaries are executable
|
||||
|
||||
4. **Permission Denied**
|
||||
- Check file permissions on configuration and binary files
|
||||
- Ensure the user has access to the Zinit socket
|
||||
|
||||
### Debug Mode
|
||||
|
||||
Run with verbose logging to see detailed operation information:
|
||||
|
||||
```bash
|
||||
RUST_LOG=debug ./target/release/hive-supervisor --config examples/cli/config.toml -vv workers status
|
||||
```
|
||||
|
||||
## Configuration Customization
|
||||
|
||||
You can customize the configuration for your environment:
|
||||
|
||||
1. **Update Redis URL**: Change `redis_url` in the `[global]` section
|
||||
2. **Update Zinit Socket**: Change `zinit_socket_path` for your Zinit installation
|
||||
3. **Worker Paths**: Update binary paths in worker sections to match your setup
|
||||
4. **Environment Variables**: Add or modify environment variables for each worker type
|
||||
|
||||
## Integration with Hero Ecosystem
|
||||
|
||||
This CLI integrates with the broader Hero ecosystem:
|
||||
|
||||
- **Job Queue**: Uses Redis for job queuing and status tracking
|
||||
- **Process Management**: Uses Zinit for worker lifecycle management
|
||||
- **Script Execution**: Supports multiple script types (OSIS, SAL, V, Python)
|
||||
- **Monitoring**: Provides real-time status and logging capabilities
|
||||
|
||||
For more information about the Hero ecosystem, see the main project documentation.
|
19 core/supervisor/examples/cli/config.toml (new file)
@@ -0,0 +1,19 @@
|
||||
# Hero Supervisor CLI Configuration Example
|
||||
# This configuration demonstrates how to set up the hive-supervisor CLI
|
||||
# with different worker types for script execution.
|
||||
|
||||
[global]
|
||||
# Redis connection URL for job queuing
|
||||
redis_url = "redis://localhost:6379"
|
||||
|
||||
# OSIS Worker Configuration
|
||||
# Handles OSIS (HeroScript) execution
|
||||
[osis_worker]
|
||||
binary_path = "/Users/timurgordon/code/git.ourworld.tf/herocode/hero/target/debug/osis"
|
||||
env_vars = { "RUST_LOG" = "info", "WORKER_TYPE" = "osis", "MAX_CONCURRENT_JOBS" = "5" }
|
||||
|
||||
# SAL Worker Configuration
|
||||
# Handles System Abstraction Layer scripts
|
||||
[sal_worker]
|
||||
binary_path = "/Users/timurgordon/code/git.ourworld.tf/herocode/hero/target/debug/sal"
|
||||
env_vars = { "RUST_LOG" = "info", "WORKER_TYPE" = "sal", "MAX_CONCURRENT_JOBS" = "3" }
|
144 core/supervisor/examples/cli/run_examples.sh (new executable file)
@@ -0,0 +1,144 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Hero Supervisor CLI Example Runner
|
||||
# This script demonstrates various CLI operations
|
||||
|
||||
set -e
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
BLUE='\033[0;34m'
|
||||
YELLOW='\033[1;33m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
# Configuration
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
SUPERVISOR_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
|
||||
CONFIG_FILE="$SCRIPT_DIR/config.toml"
|
||||
CLI_BINARY="$SUPERVISOR_DIR/target/release/hive-supervisor"
|
||||
|
||||
echo -e "${BLUE}=== Hero Supervisor CLI Example Runner ===${NC}"
|
||||
echo "Script directory: $SCRIPT_DIR"
|
||||
echo "Supervisor directory: $SUPERVISOR_DIR"
|
||||
echo "Configuration file: $CONFIG_FILE"
|
||||
echo
|
||||
|
||||
# Function to run CLI command with error handling
|
||||
run_cli() {
|
||||
local description="$1"
|
||||
shift
|
||||
echo -e "${YELLOW}Running: $description${NC}"
|
||||
echo "Command: $CLI_BINARY --config $CONFIG_FILE $*"
|
||||
echo
|
||||
|
||||
if "$CLI_BINARY" --config "$CONFIG_FILE" "$@"; then
|
||||
echo -e "${GREEN}✓ Success${NC}"
|
||||
else
|
||||
echo -e "${RED}✗ Failed${NC}"
|
||||
return 1
|
||||
fi
|
||||
echo
|
||||
}
|
||||
|
||||
# Check if CLI binary exists
|
||||
if [[ ! -f "$CLI_BINARY" ]]; then
|
||||
echo -e "${YELLOW}Building CLI binary...${NC}"
|
||||
cd "$SUPERVISOR_DIR"
|
||||
cargo build --bin hive-supervisor --release
|
||||
echo
|
||||
fi
|
||||
|
||||
# Check if config file exists
|
||||
if [[ ! -f "$CONFIG_FILE" ]]; then
|
||||
echo -e "${RED}Error: Configuration file not found: $CONFIG_FILE${NC}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo -e "${BLUE}=== CLI Help and Information ===${NC}"
|
||||
run_cli "Show main help" --help
|
||||
|
||||
echo -e "${BLUE}=== Worker Management Examples ===${NC}"
|
||||
run_cli "List configured workers" workers list
|
||||
run_cli "Show worker management help" workers --help
|
||||
|
||||
# Note: These commands would require actual worker binaries and Zinit setup
|
||||
echo -e "${YELLOW}Note: The following commands require actual worker binaries and Zinit setup${NC}"
|
||||
echo -e "${YELLOW}They are shown for demonstration but may fail without proper setup${NC}"
|
||||
echo
|
||||
|
||||
# Uncomment these if you have the proper setup
|
||||
# run_cli "Check worker status" workers status
|
||||
# run_cli "Start all workers" workers start
|
||||
# run_cli "Check worker status after start" workers status
|
||||
|
||||
echo -e "${BLUE}=== Job Management Examples ===${NC}"
|
||||
run_cli "Show job management help" jobs --help
|
||||
|
||||
# Create sample jobs (these will also require workers to be running)
|
||||
echo -e "${YELLOW}Sample job creation commands (require running workers):${NC}"
|
||||
echo
|
||||
|
||||
echo "# Create OSIS job with inline script:"
|
||||
echo "$CLI_BINARY --config $CONFIG_FILE jobs create \\"
|
||||
echo " --script 'print(\"Hello from CLI!\");' \\"
|
||||
echo " --script-type osis \\"
|
||||
echo " --caller-id \"cli_demo\" \\"
|
||||
echo " --context-id \"example_session\""
|
||||
echo
|
||||
|
||||
echo "# Create job from sample script file:"
|
||||
echo "$CLI_BINARY --config $CONFIG_FILE jobs create \\"
|
||||
echo " --file \"$SCRIPT_DIR/sample_scripts/hello_osis.rhai\" \\"
|
||||
echo " --script-type osis \\"
|
||||
echo " --caller-id \"cli_demo\" \\"
|
||||
echo " --context-id \"example_session\""
|
||||
echo
|
||||
|
||||
echo "# List all jobs:"
|
||||
echo "$CLI_BINARY --config $CONFIG_FILE jobs list"
|
||||
echo
|
||||
|
||||
echo "# Check job status (replace JOB_ID with actual job ID):"
|
||||
echo "$CLI_BINARY --config $CONFIG_FILE jobs status JOB_ID"
|
||||
echo
|
||||
|
||||
echo -e "${BLUE}=== REPL Mode Example ===${NC}"
|
||||
echo -e "${YELLOW}REPL mode command (interactive):${NC}"
|
||||
echo "$CLI_BINARY --config $CONFIG_FILE repl \\"
|
||||
echo " --caller-id \"cli_demo\" \\"
|
||||
echo " --context-id \"example_session\" \\"
|
||||
echo " --script-type osis \\"
|
||||
echo " --timeout 60"
|
||||
echo
|
||||
|
||||
echo -e "${BLUE}=== Sample Scripts ===${NC}"
|
||||
echo "Available sample scripts in $SCRIPT_DIR/sample_scripts/:"
|
||||
for script in "$SCRIPT_DIR/sample_scripts"/*; do
|
||||
if [[ -f "$script" ]]; then
|
||||
basename "$script"
|
||||
fi
|
||||
done
|
||||
echo
|
||||
|
||||
echo -e "${BLUE}=== Verbose Logging Examples ===${NC}"
|
||||
echo "# Debug logging:"
|
||||
echo "$CLI_BINARY --config $CONFIG_FILE -v workers list"
|
||||
echo
|
||||
echo "# Trace logging:"
|
||||
echo "$CLI_BINARY --config $CONFIG_FILE -vv workers list"
|
||||
echo
|
||||
echo "# No timestamps:"
|
||||
echo "$CLI_BINARY --config $CONFIG_FILE --no-timestamp workers list"
|
||||
echo
|
||||
|
||||
echo -e "${GREEN}=== Example Runner Complete ===${NC}"
|
||||
echo -e "${YELLOW}To run actual commands, ensure you have:${NC}"
|
||||
echo "1. Redis server running on localhost:6379"
|
||||
echo "2. Zinit process manager installed and configured"
|
||||
echo "3. Worker binaries available at the paths specified in config.toml"
|
||||
echo
|
||||
echo -e "${YELLOW}For testing without full setup, you can:${NC}"
|
||||
echo "1. Update config.toml with paths to existing binaries"
|
||||
echo "2. Use the CLI help commands and configuration validation"
|
||||
echo "3. Test the REPL mode (requires workers to be running)"
|
90 core/supervisor/examples/cli/sample_scripts/data_python.py (new file)
@@ -0,0 +1,90 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Sample Python script for demonstration
|
||||
This script demonstrates Python worker functionality
|
||||
"""
|
||||
|
||||
import json
|
||||
import datetime
|
||||
from typing import List, Dict
|
||||
|
||||
def main():
|
||||
print("=== Python Worker Demo ===")
|
||||
print("Python data processing operations")
|
||||
|
||||
# Data structures
|
||||
print("\nData structures:")
|
||||
users = [
|
||||
{"id": 1, "name": "Alice", "age": 30, "role": "developer"},
|
||||
{"id": 2, "name": "Bob", "age": 25, "role": "designer"},
|
||||
{"id": 3, "name": "Charlie", "age": 35, "role": "manager"},
|
||||
{"id": 4, "name": "Diana", "age": 28, "role": "developer"}
|
||||
]
|
||||
|
||||
print(f"Total users: {len(users)}")
|
||||
|
||||
# Data filtering
|
||||
developers = [user for user in users if user["role"] == "developer"]
|
||||
print(f"Developers: {len(developers)}")
|
||||
for dev in developers:
|
||||
print(f" - {dev['name']} (age {dev['age']})")
|
||||
|
||||
# Statistical operations
|
||||
print("\nStatistical operations:")
|
||||
ages = [user["age"] for user in users]
|
||||
avg_age = sum(ages) / len(ages)
|
||||
min_age = min(ages)
|
||||
max_age = max(ages)
|
||||
|
||||
print(f"Average age: {avg_age:.1f}")
|
||||
print(f"Age range: {min_age} - {max_age}")
|
||||
|
||||
# Date/time operations
|
||||
print("\nDate/time operations:")
|
||||
now = datetime.datetime.now()
|
||||
print(f"Current time: {now.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
|
||||
# Calculate birth years
|
||||
current_year = now.year
|
||||
for user in users:
|
||||
birth_year = current_year - user["age"]
|
||||
print(f"{user['name']} was born in {birth_year}")
|
||||
|
||||
# JSON processing
|
||||
print("\nJSON processing:")
|
||||
json_data = json.dumps(users, indent=2)
|
||||
print("User data as JSON:")
|
||||
print(json_data[:200] + "..." if len(json_data) > 200 else json_data)
|
||||
|
||||
# File operations simulation
|
||||
print("\nFile operations:")
|
||||
simulate_file_processing()
|
||||
|
||||
print("=== Python Demo Complete ===")
|
||||
|
||||
def simulate_file_processing():
|
||||
"""Simulate file processing operations"""
|
||||
files = [
|
||||
{"name": "data.csv", "size": 1024, "type": "csv"},
|
||||
{"name": "config.json", "size": 512, "type": "json"},
|
||||
{"name": "report.pdf", "size": 2048, "type": "pdf"},
|
||||
{"name": "script.py", "size": 768, "type": "python"}
|
||||
]
|
||||
|
||||
total_size = sum(file["size"] for file in files)
|
||||
print(f"Processing {len(files)} files, total size: {total_size} bytes")
|
||||
|
||||
# Group by type
|
||||
file_types = {}
|
||||
for file in files:
|
||||
file_type = file["type"]
|
||||
if file_type not in file_types:
|
||||
file_types[file_type] = []
|
||||
file_types[file_type].append(file["name"])
|
||||
|
||||
print("Files by type:")
|
||||
for file_type, file_names in file_types.items():
|
||||
print(f" {file_type}: {', '.join(file_names)}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
34 core/supervisor/examples/cli/sample_scripts/hello_osis.rhai (new file)
@@ -0,0 +1,34 @@
|
||||
// Sample OSIS/HeroScript for demonstration
|
||||
// This script demonstrates basic OSIS worker functionality
|
||||
|
||||
print("=== OSIS Worker Demo ===");
|
||||
print("Hello from the OSIS worker!");
|
||||
|
||||
// Basic variable operations
|
||||
let name = "Hero";
|
||||
let version = "1.0";
|
||||
print(`Running ${name} version ${version}`);
|
||||
|
||||
// Simple calculation
|
||||
let x = 10;
|
||||
let y = 20;
|
||||
let result = x + y;
|
||||
print(`Calculation: ${x} + ${y} = ${result}`);
|
||||
|
||||
// Array operations
|
||||
let numbers = [1, 2, 3, 4, 5];
|
||||
let sum = 0;
|
||||
for num in numbers {
|
||||
sum += num;
|
||||
}
|
||||
print(`Sum of array [1,2,3,4,5]: ${sum}`);
|
||||
|
||||
// Function definition and call
|
||||
fn greet(person) {
|
||||
return `Hello, ${person}! Welcome to Hero.`;
|
||||
}
|
||||
|
||||
let greeting = greet("Developer");
|
||||
print(greeting);
|
||||
|
||||
print("=== OSIS Demo Complete ===");
|
67 core/supervisor/examples/cli/sample_scripts/math_v.v (new file)
@@ -0,0 +1,67 @@
|
||||
// Sample V language script for demonstration
|
||||
// This script demonstrates V worker functionality
|
||||
|
||||
module main
|
||||
|
||||
import math
|
||||
|
||||
fn main() {
|
||||
println("=== V Worker Demo ===")
|
||||
println("V language mathematical operations")
|
||||
|
||||
// Basic arithmetic
|
||||
x := 15
|
||||
y := 25
|
||||
sum := x + y
|
||||
product := x * y
|
||||
println("Basic arithmetic:")
|
||||
println("${x} + ${y} = ${sum}")
|
||||
println("${x} * ${y} = ${product}")
|
||||
|
||||
// Mathematical functions
|
||||
println("\nMathematical functions:")
|
||||
angle := 45.0
|
||||
sin_val := math.sin(math.radians(angle))
|
||||
cos_val := math.cos(math.radians(angle))
|
||||
println("sin(${angle}°) = ${sin_val:.4f}")
|
||||
println("cos(${angle}°) = ${cos_val:.4f}")
|
||||
|
||||
// Array operations
|
||||
numbers := [1, 4, 9, 16, 25]
|
||||
println("\nArray operations:")
|
||||
println("Numbers: ${numbers}")
|
||||
|
||||
mut total := 0
|
||||
for num in numbers {
|
||||
total += num
|
||||
}
|
||||
println("Sum: ${total}")
|
||||
|
||||
// Square roots
|
||||
println("\nSquare roots:")
|
||||
for num in numbers {
|
||||
sqrt_val := math.sqrt(f64(num))
|
||||
println("√${num} = ${sqrt_val:.2f}")
|
||||
}
|
||||
|
||||
// Fibonacci sequence
|
||||
println("\nFibonacci sequence (first 10 numbers):")
|
||||
fib := fibonacci(10)
|
||||
println("${fib}")
|
||||
|
||||
println("=== V Demo Complete ===")
|
||||
}
|
||||
|
||||
fn fibonacci(n int) []int {
|
||||
mut fib := []int{len: n}
|
||||
if n >= 1 {
|
||||
fib[0] = 0
|
||||
}
|
||||
if n >= 2 {
|
||||
fib[1] = 1
|
||||
}
|
||||
for i in 2 .. n {
|
||||
fib[i] = fib[i-1] + fib[i-2]
|
||||
}
|
||||
return fib
|
||||
}
|
43 core/supervisor/examples/cli/sample_scripts/system_sal.rhai (new file)
@@ -0,0 +1,43 @@
|
||||
// Sample SAL (System Abstraction Layer) script for demonstration
|
||||
// This script demonstrates system-level operations through SAL worker
|
||||
|
||||
print("=== SAL Worker Demo ===");
|
||||
print("System Abstraction Layer operations");
|
||||
|
||||
// System information gathering
|
||||
print("Gathering system information...");
|
||||
|
||||
// Simulated system operations (actual SAL would have real system calls)
|
||||
let hostname = "hero-system";
|
||||
let uptime = "2 days, 4 hours";
|
||||
let load_avg = "0.45, 0.52, 0.48";
|
||||
|
||||
print(`Hostname: ${hostname}`);
|
||||
print(`Uptime: ${uptime}`);
|
||||
print(`Load Average: ${load_avg}`);
|
||||
|
||||
// File system operations
|
||||
print("\nFile system operations:");
|
||||
let disk_usage = "45% used";
|
||||
let available_space = "120GB available";
|
||||
|
||||
print(`Disk Usage: ${disk_usage}`);
|
||||
print(`Available Space: ${available_space}`);
|
||||
|
||||
// Process management simulation
|
||||
print("\nProcess management:");
|
||||
let active_processes = 156;
|
||||
let memory_usage = "68%";
|
||||
|
||||
print(`Active Processes: ${active_processes}`);
|
||||
print(`Memory Usage: ${memory_usage}`);
|
||||
|
||||
// Network status
|
||||
print("\nNetwork status:");
|
||||
let network_interfaces = ["eth0", "lo"];
|
||||
let connectivity = "Connected";
|
||||
|
||||
print(`Network Interfaces: ${network_interfaces}`);
|
||||
print(`Connectivity: ${connectivity}`);
|
||||
|
||||
print("=== SAL Demo Complete ===");
|
@@ -17,7 +17,6 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
|
||||
// Configuration
|
||||
let redis_url = "redis://localhost:6379";
|
||||
let zinit_socket = "/var/run/zinit.sock";
|
||||
|
||||
// Create supervisor
|
||||
let supervisor = SupervisorBuilder::new()
|
||||
|
@@ -12,7 +12,6 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
// Workers are automatically launched during build
|
||||
let supervisor = SupervisorBuilder::new()
|
||||
.redis_url("redis://localhost:6379")
|
||||
.zinit_socket_path("/var/run/zinit.sock")
|
||||
.osis_worker("/usr/local/bin/osis_worker")
|
||||
.sal_worker("/usr/local/bin/sal_worker")
|
||||
.v_worker("/usr/local/bin/v_worker")
|
||||
|
18 core/supervisor/examples/supervisor_config.toml (new file)
@@ -0,0 +1,18 @@
|
||||
[global]
|
||||
redis_url = "redis://localhost:6379"
|
||||
|
||||
[osis_worker]
|
||||
binary_path = "/path/to/osis_worker"
|
||||
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }
|
||||
|
||||
[sal_worker]
|
||||
binary_path = "/path/to/sal_worker"
|
||||
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }
|
||||
|
||||
[v_worker]
|
||||
binary_path = "/path/to/v_worker"
|
||||
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }
|
||||
|
||||
[python_worker]
|
||||
binary_path = "/path/to/python_worker"
|
||||
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }
|
@@ -31,6 +31,8 @@ pub enum SupervisorError {
|
||||
/// Zinit client operation error
|
||||
ZinitError(String),
|
||||
SupervisorNotConfigured,
|
||||
/// Configuration file parsing error
|
||||
ConfigError(String),
|
||||
}
|
||||
|
||||
impl From<redis::RedisError> for SupervisorError {
|
||||
@@ -95,6 +97,9 @@ impl std::fmt::Display for SupervisorError {
|
||||
SupervisorError::SupervisorNotConfigured => {
|
||||
write!(f, "Supervisor not configured for health monitoring")
|
||||
}
|
||||
SupervisorError::ConfigError(msg) => {
|
||||
write!(f, "Configuration error: {}", msg)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -1,9 +1,14 @@
|
||||
use log::{debug, error, info, warn};
|
||||
use redis::AsyncCommands;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use hero_job::NAMESPACE_PREFIX;
|
||||
use zinit_client::ZinitClient;
|
||||
use zinit_client::Client as ZinitClient;
|
||||
|
||||
|
||||
mod job;
|
||||
mod error;
|
||||
@@ -23,46 +28,209 @@ pub struct Supervisor {
|
||||
|
||||
pub struct SupervisorBuilder {
|
||||
redis_url: Option<String>,
|
||||
zinit_socket_path: Option<String>,
|
||||
osis_worker: Option<String>,
|
||||
sal_worker: Option<String>,
|
||||
v_worker: Option<String>,
|
||||
python_worker: Option<String>,
|
||||
worker_env_vars: HashMap<String, String>,
|
||||
websocket_config: Option<WebSocketServerConfig>,
|
||||
}
|
||||
|
||||
/// Helper struct to pass builder data to worker launch method
|
||||
#[derive(Clone)]
|
||||
struct SupervisorBuilderData {
|
||||
osis_worker: Option<String>,
|
||||
sal_worker: Option<String>,
|
||||
v_worker: Option<String>,
|
||||
python_worker: Option<String>,
|
||||
worker_env_vars: HashMap<String, String>,
|
||||
websocket_config: Option<WebSocketServerConfig>,
|
||||
}
|
||||
|
||||
/// TOML configuration structure for the supervisor
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
pub struct SupervisorConfig {
|
||||
pub global: GlobalConfig,
|
||||
pub websocket_server: Option<WebSocketServerConfig>,
|
||||
pub osis_worker: Option<WorkerConfigToml>,
|
||||
pub sal_worker: Option<WorkerConfigToml>,
|
||||
pub v_worker: Option<WorkerConfigToml>,
|
||||
pub python_worker: Option<WorkerConfigToml>,
|
||||
}
|
||||
|
||||
/// Global configuration section
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
pub struct GlobalConfig {
|
||||
pub redis_url: String,
|
||||
}
|
||||
|
||||
/// Worker configuration section in TOML
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
pub struct WorkerConfigToml {
|
||||
pub binary_path: String,
|
||||
#[serde(default)]
|
||||
pub env_vars: HashMap<String, String>,
|
||||
}
|
||||
|
||||
/// WebSocket server configuration section in TOML
|
||||
/// This mirrors the ServerConfig from hero_websocket_server but avoids a circular dependency
|
||||
#[derive(Debug, Deserialize, Serialize, Clone)]
|
||||
pub struct WebSocketServerConfig {
|
||||
/// Server host address
|
||||
#[serde(default = "default_host")]
|
||||
pub host: String,
|
||||
|
||||
/// Server port
|
||||
#[serde(default = "default_port")]
|
||||
pub port: u16,
|
||||
|
||||
/// Redis connection URL
|
||||
#[serde(default = "default_redis_url")]
|
||||
pub redis_url: String,
|
||||
|
||||
/// Enable authentication
|
||||
#[serde(default)]
|
||||
pub auth: bool,
|
||||
|
||||
/// Enable TLS/WSS
|
||||
#[serde(default)]
|
||||
pub tls: bool,
|
||||
|
||||
/// Path to TLS certificate file
|
||||
pub cert: Option<String>,
|
||||
|
||||
/// Path to TLS private key file
|
||||
pub key: Option<String>,
|
||||
|
||||
/// Separate port for TLS connections
|
||||
pub tls_port: Option<u16>,
|
||||
|
||||
/// Circles configuration - maps circle names to lists of member public keys
|
||||
#[serde(default)]
|
||||
pub circles: HashMap<String, Vec<String>>,
|
||||
}
|
||||
|
||||
// Default value functions for WebSocket server config
|
||||
fn default_host() -> String {
|
||||
"127.0.0.1".to_string()
|
||||
}
|
||||
|
||||
fn default_port() -> u16 {
|
||||
8443
|
||||
}
|
||||
|
||||
fn default_redis_url() -> String {
|
||||
"redis://127.0.0.1/".to_string()
|
||||
}

impl SupervisorBuilder {
    pub fn new() -> Self {
        Self {
            redis_url: None,
            zinit_socket_path: Some("/var/run/zinit.sock".to_string()),
            osis_worker: None,
            sal_worker: None,
            v_worker: None,
            python_worker: None,
            worker_env_vars: HashMap::new(),
            websocket_config: None,
        }
    }

    /// Create a SupervisorBuilder from a TOML configuration file
    pub fn from_toml<P: AsRef<Path>>(toml_path: P) -> Result<Self, SupervisorError> {
        let toml_content = fs::read_to_string(toml_path)
            .map_err(|e| SupervisorError::ConfigError(format!("Failed to read TOML file: {}", e)))?;

        let config: SupervisorConfig = toml::from_str(&toml_content)
            .map_err(|e| SupervisorError::ConfigError(format!("Failed to parse TOML: {}", e)))?;

        let mut builder = Self::new()
            .redis_url(&config.global.redis_url);

        // Configure workers based on TOML config
        if let Some(osis_config) = config.osis_worker {
            builder = builder.osis_worker(&osis_config.binary_path)
                .worker_env_vars(osis_config.env_vars);
        }

        if let Some(sal_config) = config.sal_worker {
            builder = builder.sal_worker(&sal_config.binary_path)
                .worker_env_vars(sal_config.env_vars);
        }

        if let Some(v_config) = config.v_worker {
            builder = builder.v_worker(&v_config.binary_path)
                .worker_env_vars(v_config.env_vars);
        }

        if let Some(python_config) = config.python_worker {
            builder = builder.python_worker(&python_config.binary_path)
                .worker_env_vars(python_config.env_vars);
        }

        // Store WebSocket configuration for later use
        if let Some(ws_config) = config.websocket_server {
            builder.websocket_config = Some(ws_config);
        }

        Ok(builder)
    }
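
Together with the `build()` method further down, this gives a TOML-driven startup path. A hypothetical usage sketch; the `config.toml` path, the tokio entry point, and the `expect` calls (which assume `SupervisorError: Debug`) are assumptions for illustration, not part of this diff:

```rust
#[tokio::main]
async fn main() {
    // Read the TOML config, validate worker binaries, and connect to Redis/Zinit.
    let supervisor = SupervisorBuilder::from_toml("config.toml")
        .expect("readable, parseable config")
        .build()
        .await
        .expect("valid binaries and reachable Redis/Zinit");

    // Launch all configured workers under Zinit.
    supervisor.start_workers().await.expect("workers started");
}
```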

    /// Validate that all configured worker binaries exist and are executable
    fn validate_worker_binaries(&self) -> Result<(), SupervisorError> {
        let workers = [
            ("OSIS", &self.osis_worker),
            ("SAL", &self.sal_worker),
            ("V", &self.v_worker),
            ("Python", &self.python_worker),
        ];

        for (worker_type, binary_path) in workers {
            if let Some(path) = binary_path {
                let path_obj = Path::new(path);

                if !path_obj.exists() {
                    return Err(SupervisorError::ConfigError(
                        format!("{} worker binary does not exist: {}", worker_type, path)
                    ));
                }

                if !path_obj.is_file() {
                    return Err(SupervisorError::ConfigError(
                        format!("{} worker path is not a file: {}", worker_type, path)
                    ));
                }

                // Check if the file is executable (Unix-like systems)
                #[cfg(unix)]
                {
                    use std::os::unix::fs::PermissionsExt;
                    let metadata = path_obj.metadata().map_err(|e| {
                        SupervisorError::ConfigError(
                            format!("Failed to read metadata for {} worker binary {}: {}", worker_type, path, e)
                        )
                    })?;

                    let permissions = metadata.permissions();
                    if permissions.mode() & 0o111 == 0 {
                        return Err(SupervisorError::ConfigError(
                            format!("{} worker binary is not executable: {}", worker_type, path)
                        ));
                    }
                }

                info!("Validated {} worker binary: {}", worker_type, path);
            }
        }

        Ok(())
    }
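
The Unix branch above accepts any of the owner, group, or other execute bits. An equivalent standalone helper, shown only to illustrate the check (Unix-only assumption, not part of this change):

```rust
use std::os::unix::fs::PermissionsExt;
use std::path::Path;

/// Returns true when any execute bit (0o111) is set on the file.
fn is_executable(path: &Path) -> std::io::Result<bool> {
    Ok(std::fs::metadata(path)?.permissions().mode() & 0o111 != 0)
}
```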

    pub fn redis_url(mut self, url: &str) -> Self {
        self.redis_url = Some(url.to_string());
        self
    }

    pub fn zinit_socket_path(mut self, path: &str) -> Self {
        self.zinit_socket_path = Some(path.to_string());
        self
    }

    pub fn osis_worker(mut self, binary_path: &str) -> Self {
        self.osis_worker = Some(binary_path.to_string());
        self
@@ -95,21 +263,23 @@ impl SupervisorBuilder {

    /// Builds the final `Supervisor` instance synchronously.
    ///
    /// This method validates the configuration and creates the Redis client.
    /// Worker launching is deferred to the `start_workers()` method.
    /// This method validates the configuration, checks worker binary existence,
    /// and creates the Redis client. Worker launching is deferred to the `start_workers()` method.
    ///
    /// # Returns
    ///
    /// * `Ok(Supervisor)` - Successfully configured client
    /// * `Err(SupervisorError)` - Configuration or connection error
    pub fn build(self) -> Result<Supervisor, SupervisorError> {
    /// * `Ok(Supervisor)` - Successfully configured client with valid binaries
    /// * `Err(SupervisorError)` - Configuration, binary validation, or connection error
    pub async fn build(self) -> Result<Supervisor, SupervisorError> {
        // Validate that all configured worker binaries exist first
        Self::validate_worker_binaries(&self)?;

        let url = self.redis_url
            .unwrap_or_else(|| "redis://127.0.0.1/".to_string());
        let client = redis::Client::open(url)?;

        let zinit_socket = self.zinit_socket_path
            .unwrap_or_else(|| "/var/run/zinit.sock".to_string());
        let zinit_client = ZinitClient::new(&zinit_socket);
        let zinit_client = ZinitClient::unix_socket("/tmp/zinit.sock").await
            .map_err(|e| SupervisorError::ZinitError(format!("Failed to create Zinit client: {}", e)))?;

        // Store builder data for later use in start_workers()
        let builder_data = SupervisorBuilderData {
@@ -118,6 +288,7 @@ impl SupervisorBuilder {
            v_worker: self.v_worker,
            python_worker: self.python_worker,
            worker_env_vars: self.worker_env_vars,
            websocket_config: self.websocket_config,
        };

        let supervisor = Supervisor {
@@ -134,14 +305,33 @@ impl Supervisor {
    /// Start all configured workers asynchronously.
    /// This method should be called after build() to launch the workers.
    pub async fn start_workers(&self) -> Result<(), SupervisorError> {
        info!("Starting Hero Supervisor workers...");

        // Test Zinit connection first
        info!("Testing Zinit connection at /tmp/zinit.sock...");
        match self.zinit_client.list().await {
            Ok(services) => {
                info!("Successfully connected to Zinit. Current services: {:?}", services);
            }
            Err(e) => {
                error!("Failed to connect to Zinit: {:?}", e);
                return Err(SupervisorError::ZinitError(format!("Zinit connection failed: {}", e)));
            }
        }

        // Clean up any existing worker services first
        info!("Cleaning up existing worker services...");
        self.cleanup_existing_workers().await?;

        // Launch configured workers if builder data is available
        if let Some(builder_data) = &self.builder_data {
            info!("Launching configured workers...");
            self.launch_configured_workers(builder_data).await?;
        } else {
            warn!("No builder data available, no workers to start");
        }

        info!("All workers started successfully!");
        Ok(())
    }

@@ -179,7 +369,11 @@ impl Supervisor {

        for worker_name in worker_names {
            // Try to stop and delete, but don't fail if they don't exist
            let _ = self.stop_and_delete_worker(worker_name).await;
            info!("Attempting to cleanup worker: {}", worker_name);
            match self.stop_and_delete_worker(worker_name).await {
                Ok(_) => info!("Successfully cleaned up worker: {}", worker_name),
                Err(e) => debug!("Failed to cleanup worker {}: {}", worker_name, e),
            }
        }

        info!("Existing worker cleanup completed");
@@ -188,18 +382,33 @@ impl Supervisor {

    /// Stop and delete a worker service from zinit
    async fn stop_and_delete_worker(&self, worker_name: &str) -> Result<(), SupervisorError> {
        info!("Starting cleanup for worker: {}", worker_name);

        // First try to stop the worker
        info!("Attempting to stop worker: {}", worker_name);
        if let Err(e) = self.zinit_client.stop(worker_name).await {
            debug!("Worker {} was not running or failed to stop: {}", worker_name, e);
        } else {
            info!("Successfully stopped worker: {}", worker_name);
        }

        // Then try to delete the service
        if let Err(e) = self.zinit_client.delete(worker_name).await {
        // Then forget the service to stop monitoring it
        info!("Attempting to forget worker: {}", worker_name);
        if let Err(e) = self.zinit_client.forget(worker_name).await {
            info!("Worker {} was not being monitored or failed to forget: {}", worker_name, e);
        } else {
            info!("Successfully forgot worker service: {}", worker_name);
        }

        // Finally, delete the service configuration
        info!("Attempting to delete service for worker: {}", worker_name);
        if let Err(e) = self.zinit_client.delete_service(worker_name).await {
            debug!("Worker {} service did not exist or failed to delete: {}", worker_name, e);
        } else {
            info!("Successfully deleted worker service: {}", worker_name);
        }

        info!("Completed cleanup for worker: {}", worker_name);
        Ok(())
    }

@@ -211,6 +420,157 @@ impl Supervisor {
    pub fn new_job(&self) -> JobBuilder {
        JobBuilder::new(self)
    }

    /// Get WebSocket server configuration from TOML config
    pub fn get_websocket_config(&self) -> Result<WebSocketServerConfig, SupervisorError> {
        let builder_data = self.builder_data.as_ref().ok_or_else(|| {
            SupervisorError::ConfigError("No builder data available for WebSocket config".to_string())
        })?;

        builder_data.websocket_config.clone().ok_or_else(|| {
            SupervisorError::ConfigError("No WebSocket server configuration found in TOML config".to_string())
        })
    }

    /// Extract worker configurations from the supervisor's builder data
    pub fn get_worker_configs(&self) -> Result<Vec<WorkerConfig>, SupervisorError> {
        let builder_data = self.builder_data.as_ref().ok_or_else(|| {
            SupervisorError::ConfigError("No builder data available for worker configs".to_string())
        })?;

        let mut configs = Vec::new();
        let env_vars = builder_data.worker_env_vars.clone();

        if let Some(osis_path) = &builder_data.osis_worker {
            configs.push(
                WorkerConfig::new("osis_worker_1".to_string(), PathBuf::from(osis_path), ScriptType::OSIS)
                    .with_env(env_vars.clone())
            );
        }

        if let Some(sal_path) = &builder_data.sal_worker {
            configs.push(
                WorkerConfig::new("sal_worker_1".to_string(), PathBuf::from(sal_path), ScriptType::SAL)
                    .with_env(env_vars.clone())
            );
        }

        if let Some(v_path) = &builder_data.v_worker {
            configs.push(
                WorkerConfig::new("v_worker_1".to_string(), PathBuf::from(v_path), ScriptType::V)
                    .with_env(env_vars.clone())
            );
        }

        if let Some(python_path) = &builder_data.python_worker {
            configs.push(
                WorkerConfig::new("python_worker_1".to_string(), PathBuf::from(python_path), ScriptType::Python)
                    .with_env(env_vars.clone())
            );
        }

        Ok(configs)
    }

    /// Spawn a background lifecycle manager that continuously monitors and maintains worker health
    /// Returns a JoinHandle that can be used to stop the lifecycle manager
    pub fn spawn_lifecycle_manager(
        self: Arc<Self>,
        worker_configs: Vec<WorkerConfig>,
        health_check_interval: Duration,
    ) -> tokio::task::JoinHandle<Result<(), SupervisorError>> {
        let supervisor = self;

        tokio::spawn(async move {
            info!("Starting background lifecycle manager with {} workers", worker_configs.len());
            info!("Health check interval: {:?}", health_check_interval);

            // Initial worker startup
            info!("Performing initial worker startup...");
            if let Err(e) = supervisor.start_workers().await {
                error!("Failed to start workers during initialization: {}", e);
                return Err(e);
            }

            // Start the monitoring loop
            let mut interval = tokio::time::interval(health_check_interval);
            interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);

            loop {
                interval.tick().await;

                info!("Running periodic worker health check...");

                // Check each worker's health and restart if needed
                for worker_config in &worker_configs {
                    if let Err(e) = supervisor.check_and_restart_worker(worker_config).await {
                        error!("Failed to check/restart worker {}: {}", worker_config.name, e);
                    }
                }

                info!("Health check cycle completed");
            }
        })
    }

    /// Check a single worker's health and restart if needed
    async fn check_and_restart_worker(&self, worker_config: &WorkerConfig) -> Result<(), SupervisorError> {
        let worker_name = &worker_config.name;

        // Get worker status
        match self.zinit_client.status(worker_name).await {
            Ok(status) => {
                let is_healthy = status.state == "running" && status.pid > 0;

                if is_healthy {
                    debug!("Worker {} is healthy (state: {}, pid: {})", worker_name, status.state, status.pid);

                    // Optionally send a ping job for deeper health check
                    if let Err(e) = self.send_ping_job(worker_config.script_type.clone()).await {
                        warn!("Ping job failed for worker {}: {}", worker_name, e);
                        // Note: We don't restart on ping failure as it might be temporary
                    }
                } else {
                    warn!("Worker {} is unhealthy (state: {}, pid: {}), restarting...",
                        worker_name, status.state, status.pid);

                    // Attempt to restart the worker
                    if let Err(e) = self.restart_worker(worker_name).await {
                        error!("Failed to restart unhealthy worker {}: {}", worker_name, e);

                        // If restart fails, try a full stop/start cycle
                        warn!("Attempting full stop/start cycle for worker: {}", worker_name);
                        if let Err(e) = self.stop_and_delete_worker(worker_name).await {
                            error!("Failed to stop worker {} during recovery: {}", worker_name, e);
                        }

                        if let Err(e) = self.start_worker(worker_config).await {
                            error!("Failed to start worker {} during recovery: {}", worker_name, e);
                            return Err(e);
                        }

                        info!("Successfully recovered worker: {}", worker_name);
                    } else {
                        info!("Successfully restarted worker: {}", worker_name);
                    }
                }
            }
            Err(e) => {
                warn!("Could not get status for worker {} (may not exist): {}", worker_name, e);

                // Worker doesn't exist, try to start it
                info!("Attempting to start missing worker: {}", worker_name);
                if let Err(e) = self.start_worker(worker_config).await {
                    error!("Failed to start missing worker {}: {}", worker_name, e);
                    return Err(e);
                }

                info!("Successfully started missing worker: {}", worker_name);
            }
        }

        Ok(())
    }

    // Internal helper to submit script details and push to work queue
    async fn create_job_using_connection(

@@ -8,7 +8,7 @@ use serde_json::json;
use std::collections::HashMap;
use std::path::PathBuf;
use std::time::Duration;
use zinit_client::{ZinitClient, ServiceStatus, ServiceState};
use zinit_client::{Client as ZinitClient, Status};
use hero_job::ScriptType;
use crate::{Supervisor, SupervisorError};

@@ -16,7 +16,7 @@ use crate::{Supervisor, SupervisorError};
#[derive(Debug, Clone)]
pub struct WorkerInfo {
    pub config: WorkerConfig,
    pub status: Option<ServiceStatus>,
    pub status: Option<Status>,
    pub is_running: bool,
}

@@ -90,7 +90,7 @@ impl Supervisor {
        for config in worker_configs {
            let status = self.zinit_client.status(&config.name).await.ok();
            let is_running = status.as_ref()
                .map(|s| matches!(s.state, ServiceState::Running) && s.pid > 0)
                .map(|s| s.state == "running" && s.pid > 0)
                .unwrap_or(false);

            workers.push(WorkerInfo {
@@ -117,6 +117,10 @@ impl Supervisor {
        self.zinit_client.create_service(&worker_config.name, service_config).await
            .map_err(|e| SupervisorError::ZinitError(format!("Failed to create service: {}", e)))?;

        // Monitor the service so Zinit starts managing it
        self.zinit_client.monitor(&worker_config.name).await
            .map_err(|e| SupervisorError::ZinitError(format!("Failed to monitor service: {}", e)))?;

        // Start the service
        self.zinit_client.start(&worker_config.name).await
            .map_err(|e| SupervisorError::ZinitError(format!("Failed to start worker: {}", e)))?;
@@ -168,7 +172,7 @@ impl Supervisor {
        &self,
        worker_name: &str,
        zinit_client: &ZinitClient,
    ) -> Result<ServiceStatus, SupervisorError> {
    ) -> Result<Status, SupervisorError> {
        match zinit_client.status(worker_name).await {
            Ok(status) => Ok(status),
            Err(e) => {
@@ -183,7 +187,7 @@ impl Supervisor {
        &self,
        worker_configs: &[WorkerConfig],
        zinit_client: &ZinitClient,
    ) -> Result<HashMap<String, ServiceStatus>, SupervisorError> {
    ) -> Result<HashMap<String, Status>, SupervisorError> {
        let mut status_map = HashMap::new();

        for worker in worker_configs {
@@ -200,19 +204,7 @@ impl Supervisor {
        Ok(status_map)
    }

    /// Start multiple workers
    pub async fn start_workers(
        &self,
        worker_configs: &[WorkerConfig],
    ) -> Result<(), SupervisorError> {
        info!("Starting {} workers", worker_configs.len());

        for worker in worker_configs {
            self.start_worker(worker).await?;
        }

        Ok(())
    }


    /// Stop multiple workers
    pub async fn stop_workers(
@@ -240,7 +232,7 @@ impl Supervisor {
        for worker in worker_configs {
            if worker.script_type == *script_type {
                if let Ok(status) = zinit_client.status(&worker.name).await {
                    if status.state == ServiceState::Running {
                    if status.state == "running" {
                        running_count += 1;
                    }
                }
@@ -277,26 +269,35 @@ impl Supervisor {
    }

    /// Create Zinit service configuration from worker config
    fn create_service_config(&self, worker: &WorkerConfig) -> serde_json::Value {
        let mut config = json!({
            "exec": format!("{} {}",
    fn create_service_config(&self, worker: &WorkerConfig) -> serde_json::Map<String, serde_json::Value> {
        use serde_json::{Map, Value};

        let mut config = Map::new();

        config.insert(
            "exec".to_string(),
            Value::String(format!("{} {}",
                worker.binary_path.display(),
                worker.args.join(" ")
            ),
            "oneshot": !worker.restart_on_exit,
        });
            ))
        );

        config.insert(
            "oneshot".to_string(),
            Value::Bool(!worker.restart_on_exit)
        );

        if let Some(health_check) = &worker.health_check {
            config["test"] = json!(health_check);
            config.insert("test".to_string(), Value::String(health_check.clone()));
        }

        if !worker.dependencies.is_empty() {
            config["after"] = json!(worker.dependencies);
            config.insert("after".to_string(), json!(worker.dependencies));
        }

        // Add environment variables if any
        if !worker.env.is_empty() {
            config["env"] = json!(worker.env);
            config.insert("env".to_string(), json!(worker.env));
        }

        config
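
For orientation, the map built by `create_service_config` serializes to a zinit service definition roughly like the sketch below; the binary path, arguments, and env values are invented for illustration:

```rust
use serde_json::{json, Map, Value};

fn main() {
    // Mirrors the insert calls above for a hypothetical OSIS worker.
    let mut service = Map::new();
    service.insert(
        "exec".to_string(),
        Value::String("/usr/local/bin/osis_worker --redis-url redis://127.0.0.1/".to_string()),
    );
    service.insert("oneshot".to_string(), Value::Bool(false)); // restart_on_exit = true
    service.insert("env".to_string(), json!({ "RUST_LOG": "info" }));

    println!("{}", serde_json::to_string_pretty(&Value::Object(service)).unwrap());
}
```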
@@ -307,6 +308,8 @@ impl Supervisor {
        use hero_job::ScriptType;
        use std::path::PathBuf;

        let mut errors = Vec::new();

        // Launch OSIS worker if configured
        if let Some(binary_path) = &builder.osis_worker {
            let worker_id = "osis_worker_1";
@@ -318,7 +321,11 @@ impl Supervisor {
            config.env.extend(builder.worker_env_vars.clone());

            info!("Launching OSIS worker: {}", worker_id);
            self.start_worker(&config).await?;
            if let Err(e) = self.start_worker(&config).await {
                let error_msg = format!("Failed to start OSIS worker: {}", e);
                warn!("{}", error_msg);
                errors.push(error_msg);
            }
        }

        // Launch SAL worker if configured
@@ -332,7 +339,11 @@ impl Supervisor {
            config.env.extend(builder.worker_env_vars.clone());

            info!("Launching SAL worker: {}", worker_id);
            self.start_worker(&config).await?;
            if let Err(e) = self.start_worker(&config).await {
                let error_msg = format!("Failed to start SAL worker: {}", e);
                warn!("{}", error_msg);
                errors.push(error_msg);
            }
        }

        // Launch V worker if configured
@@ -346,7 +357,11 @@ impl Supervisor {
            config.env.extend(builder.worker_env_vars.clone());

            info!("Launching V worker: {}", worker_id);
            self.start_worker(&config).await?;
            if let Err(e) = self.start_worker(&config).await {
                let error_msg = format!("Failed to start V worker: {}", e);
                warn!("{}", error_msg);
                errors.push(error_msg);
            }
        }

        // Launch Python worker if configured
@@ -360,9 +375,21 @@ impl Supervisor {
            config.env.extend(builder.worker_env_vars.clone());

            info!("Launching Python worker: {}", worker_id);
            self.start_worker(&config).await?;
            if let Err(e) = self.start_worker(&config).await {
                let error_msg = format!("Failed to start Python worker: {}", e);
                warn!("{}", error_msg);
                errors.push(error_msg);
            }
        }

        Ok(())
        // Return result based on whether any workers started successfully
        if errors.is_empty() {
            info!("All configured workers started successfully");
            Ok(())
        } else {
            let combined_error = format!("Some workers failed to start: {}", errors.join("; "));
            warn!("{}", combined_error);
            Err(SupervisorError::ZinitError(combined_error))
        }
    }
}