# 0_orchestrator.py

import argparse
import json
import os
import shutil
import subprocess
import sys
from pathlib import Path
from typing import Dict, List, Optional, Union
from urllib.parse import urlparse

# --- Configuration ---
# --- Configuration ---
# Pipeline stages, executed in order for each site. Step 1 takes the site
# URL; steps 2-4 take the site's output directory (see get_script_command).
PIPELINE_SCRIPTS = [
    "1_website_scraper.py",
    "2_structure_extractor.py",
    "3_design_generator.py", 
    "4_code_generator.py"
]

# Root folder (relative to this script's working directory) under which each
# site gets one subfolder named after its domain.
BASE_OUTPUT_DIR = "../data/website_data"

# Available models across all scripts, keyed by the index passed to each
# script via its --*-model-index command-line flag.
AVAILABLE_MODELS = {
    0: "Gemini 2.5 Pro (GA)",
    1: "Gemini 2.5 Flash (GA)", 
    2: "Gemini 2.5 Flash-Lite (Preview)"
}

# --- MODEL CONFIGURATION ---
# Model index (into AVAILABLE_MODELS) used by each pipeline stage.
MODEL_CONFIG = {
    "scraper": 0,              # Gemini 2.5 Pro for URL discovery/filtering
    "extractor_brand": 0,      # Gemini 2.5 Pro for brand analysis  
    "extractor_content": 2,    # Gemini 2.5 Flash-Lite for content extraction
    "design": 0,               # Gemini 2.5 Pro for design generation
    "code": 0                  # Gemini 2.5 Pro for code generation
}

def run_step_with_live_output(command: List[str]) -> Optional[Dict[str, Union[float, int]]]:
    """
    Run a pipeline script as a subprocess, streaming its output in real time.

    The child's stderr is merged into stdout so everything appears in-stream.
    The last non-empty output line is parsed as JSON; if it is a dict with a
    "cost" key it is returned as the step's usage data.

    Args:
        command: Full argv for the subprocess, e.g. [sys.executable, script, arg].
            command[1] is assumed to be the script path (used in error messages).

    Returns:
        The usage dict emitted on the child's last line, a zero-usage
        placeholder ({"cost": 0, "tokens": 0}) when no usage line was found,
        or None when the script failed or could not be started.
    """
    try:
        # Force unbuffered output in the child so its prints stream immediately.
        env = os.environ.copy()
        env['PYTHONUNBUFFERED'] = '1'

        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # merge stderr so errors show up live too
            text=True,
            encoding='utf-8',
            # bufsize=1 (line-buffered) is the documented choice for text-mode
            # pipes; bufsize=0 ("unbuffered") is only meaningful in binary mode.
            bufsize=1,
            env=env
        )

        last_line = ""
        if process.stdout:
            for line in iter(process.stdout.readline, ''):
                if line:  # Only print non-empty lines
                    print(line, end='', flush=True)
                    if line.strip():
                        # Remember the last non-blank line for JSON parsing below.
                        last_line = line.strip()

        process.wait()

        if process.returncode != 0:
            print(f"\n❌ ERROR: Script '{Path(command[1]).name}' failed with exit code {process.returncode}.")
            return None

        # The child optionally reports usage as a one-line JSON dict at the end.
        try:
            usage_data = json.loads(last_line)
            if isinstance(usage_data, dict) and "cost" in usage_data:
                return usage_data
        except json.JSONDecodeError:
            pass

        return {"cost": 0, "tokens": 0}

    except FileNotFoundError:
        print(f"❌ Error: Could not find script at '{command[1]}'")
        return None
    except Exception as e:
        # Broad boundary: one failed step must not crash the whole orchestrator.
        print(f"❌ An unexpected error occurred: {e}")
        return None

def get_script_command(script_name: str, url_or_dir: str) -> List[str]:
    """Build the argv for one pipeline step, wiring in its configured model flags."""
    # Per-script model flags; any script not listed here runs with no extras.
    model_flags = {
        "1_website_scraper.py": [
            "--planner-model-index", str(MODEL_CONFIG['scraper']),
        ],
        "2_structure_extractor.py": [
            "--brand-model-index", str(MODEL_CONFIG['extractor_brand']),
            "--content-model-index", str(MODEL_CONFIG['extractor_content']),
        ],
        "3_design_generator.py": [
            "--model-index", str(MODEL_CONFIG['design']),
        ],
        "4_code_generator.py": [
            "--model-index", str(MODEL_CONFIG['code']),
        ],
    }
    return [sys.executable, script_name, url_or_dir] + model_flags.get(script_name, [])

def check_pipeline_readiness(site_dir: Path) -> Dict[str, bool]:
    """Report which pipeline-stage artifacts already exist under *site_dir*."""
    # Each stage is considered done when its marker file is present.
    stage_artifacts = [
        ("manifest", site_dir / "manifest.json"),
        ("brief", site_dir / "site_brief.json"),
        ("designs", site_dir / "designs.json"),
        ("final", site_dir / "final_output" / "index.html"),
    ]
    readiness: Dict[str, bool] = {}
    for stage, artifact in stage_artifacts:
        readiness[stage] = artifact.exists()
    return readiness

def main():
    """Orchestrate the enabled pipeline scripts for each website in the URL file.

    Reads URLs from the file named on the command line, then runs every
    script in PIPELINE_SCRIPTS for each site, skipping steps whose output
    already exists (unless --replace is given) and accumulating the
    estimated API cost reported by each step.
    """
    parser = argparse.ArgumentParser(description="Runs the enabled website redesign pipeline scripts.")
    parser.add_argument("url_file", type=str, help="Path to file containing URLs.")
    parser.add_argument("-r", "--replace", action="store_true", help="Force re-processing and replace existing data.")

    args = parser.parse_args()

    url_file_path = Path(args.url_file)
    if not url_file_path.is_file():
        print(f"❌ Error: URL file not found at '{url_file_path}'")
        return

    # Fail fast if any pipeline script is missing from the working directory.
    missing_scripts = [script for script in PIPELINE_SCRIPTS if not Path(script).is_file()]
    if missing_scripts:
        print(f"❌ Error: Missing pipeline scripts: {missing_scripts}")
        return

    # Strip each line BEFORE the '#' test so indented comment lines are
    # filtered too (previously only column-0 comments were skipped).
    with open(url_file_path, "r") as f:
        stripped = (line.strip() for line in f)
        urls_to_process = [line for line in stripped if line and not line.startswith('#')]

    if not urls_to_process:
        print(f"🤷‍♂️ The file '{url_file_path}' is empty. Nothing to do.")
        return

    total_pipeline_cost = 0.0

    print(f"🤖 Model Configuration:")
    print(f"   - Scraper: {AVAILABLE_MODELS[MODEL_CONFIG['scraper']]}")
    print(f"   - Brand Analysis: {AVAILABLE_MODELS[MODEL_CONFIG['extractor_brand']]}")
    print(f"   - Content Extraction: {AVAILABLE_MODELS[MODEL_CONFIG['extractor_content']]}")
    print(f"   - Design Generation: {AVAILABLE_MODELS[MODEL_CONFIG['design']]}")
    print(f"   - Code Generation: {AVAILABLE_MODELS[MODEL_CONFIG['code']]}")
    print(f"\n📝 Enabled Scripts: {len(PIPELINE_SCRIPTS)} of 4")
    for i, script in enumerate(PIPELINE_SCRIPTS, 1):
        print(f"   - Step {i}: {script}")
    print()

    for i, url in enumerate(urls_to_process, 1):
        print(f"\n▶️ PROCESSING SITE {i} of {len(urls_to_process)}: {url}")
        print("=" * 50)

        # Name the output folder after the domain, dropping only a LEADING
        # "www." (a plain .replace('www.', '') would also corrupt domains
        # that merely contain the substring, e.g. "awww.example.com").
        netloc = urlparse(url).netloc
        domain = netloc[4:] if netloc.startswith('www.') else netloc
        site_output_dir = Path(BASE_OUTPUT_DIR) / domain

        # Check current pipeline state
        pipeline_state = check_pipeline_readiness(site_output_dir)

        # Skip the whole site only when the final artifact exists and we are
        # not forcing a re-run.
        if pipeline_state["final"] and not args.replace:
            print(f"🟡 SKIPPING SITE: Final output already exists")
            print("-" * 50)
            continue

        if site_output_dir.is_dir() and args.replace:
            print(f"   -> --replace active. Removing existing directory: '{site_output_dir}'")
            shutil.rmtree(site_output_dir, ignore_errors=True)

        # Run each enabled script in sequence
        for step_num, script_name in enumerate(PIPELINE_SCRIPTS, 1):
            print(f"\n--- STEP {step_num} of {len(PIPELINE_SCRIPTS)}: {script_name} ---")

            # The scraper takes the site URL; every later step takes the
            # site's output directory. Each step is skipped when its own
            # artifact already exists (unless --replace).
            if script_name == "1_website_scraper.py":
                script_input = url
                if pipeline_state["manifest"] and not args.replace:
                    print(f"✅ SKIPPING: Scraper output already exists.")
                    continue
            else:
                script_input = str(site_output_dir)
                if script_name == "2_structure_extractor.py" and pipeline_state["brief"] and not args.replace:
                    print(f"✅ SKIPPING: Extractor output already exists.")
                    continue
                elif script_name == "3_design_generator.py" and pipeline_state["designs"] and not args.replace:
                    print(f"✅ SKIPPING: Design output already exists.")
                    continue
                elif script_name == "4_code_generator.py" and pipeline_state["final"] and not args.replace:
                    print(f"✅ SKIPPING: Code output already exists.")
                    continue

            # Generate and run command
            command = get_script_command(script_name, script_input)
            usage = run_step_with_live_output(command)

            if usage is None:
                # A failed step invalidates everything downstream for this site.
                print(f"   Stopping pipeline for {url} due to error in {script_name}")
                print("-" * 50)
                break

            # Safely add cost even if usage data is incomplete
            script_cost = usage.get("cost", 0) if isinstance(usage, dict) else 0
            total_pipeline_cost += script_cost

            if script_cost > 0:
                print(f"   💰 Step cost: ${script_cost:.6f}")
            else:
                print(f"   💰 Step cost: $0.000000 (no usage data)")

            # Refresh the readiness snapshot so later steps see new outputs.
            pipeline_state = check_pipeline_readiness(site_output_dir)

        else:
            # for/else: runs only when no step failed (the loop wasn't broken).
            print(f"\n✔️ Pipeline completed successfully for site {i} of {len(urls_to_process)}: {url}")

        print("-" * 50)

    print(f"\n🎉🎉 Pipeline completed for all sites! 🎉🎉")
    print(f"💰 Total Estimated Cost: ${total_pipeline_cost:.6f}")

# Run the orchestrator only when executed as a script, not on import.
if __name__ == "__main__":
    main()