# 2_structure_extractor.py
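"""
Two-stage AI analysis of scraped website data.

Stage 1 sends the homepage HTML and a screenshot to Gemini (Vertex AI) to extract a
brand profile, navigation links, and site metadata. Stage 2 runs per-page content
extraction with an AI quality assessment, flagging pages that need manual review.
Expects a site directory containing manifest.json, pages/, and screenshots/ from the
scraper step. Writes `site_brief.json` back to that directory and prints a final JSON
line with total token usage and cost for the orchestrator to capture.
"""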

import os
import json
import argparse
from pathlib import Path
from urllib.parse import urlparse
import base64
from io import BytesIO
import re
from bs4 import BeautifulSoup, Comment

from PIL import Image
from dotenv import load_dotenv

import vertexai
from vertexai.generative_models import GenerativeModel, Part, GenerationResponse

# --- Model & Pricing Configuration ---
MODELS_CONFIG = [
    {
        "name": "Gemini 2.5 Pro (GA)",
        "model_id": "gemini-2.5-pro",
        "location": "us-east1",
        "pricing": { "input": 1.25 / 1_000_000, "output": 10.00 / 1_000_000 }
    },
    {
        "name": "Gemini 2.5 Flash (GA)",
        "model_id": "gemini-2.5-flash",
        "location": "us-east1",
        "pricing": { "input": 0.30 / 1_000_000, "output": 2.50 / 1_000_000 }
    },
    {
        "name": "Gemini 2.5 Flash Lite (Preview)",
        "model_id": "gemini-2.5-flash-lite-preview-06-17",
        "location": "global",
        "pricing": { "input": 0.10 / 1_000_000, "output": 0.40 / 1_000_000 }
    }
]
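# Pricing values are USD per token (the per-1M-token rate divided by 1,000,000), so a
# call's cost is input_tokens * input_rate + output_tokens * output_rate.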
# --- Set the default models for each stage ---
DEFAULT_BRAND_MODEL_INDEX = 0    # Gemini 2.5 Pro for brand & navigation analysis
DEFAULT_CONTENT_MODEL_INDEX = 2  # Gemini 2.5 Flash Lite for per-page content extraction
# -------------------------------------------------------------

def setup_gcp_credentials():
    """Auto-setup GCP credentials from default locations or the environment."""
    # If no key file is found and GOOGLE_APPLICATION_CREDENTIALS stays unset,
    # vertexai.init() will fall back to Application Default Credentials.

    # Default paths to check for credentials (multiple project structures)
    credential_paths = [
        Path("../data/credentials/gcp_key.json"),           # Development: from dashboard/scripts
        Path("./data/credentials/gcp_key.json"),            # Alternative: from project root
        Path("../dashboard/data/credentials/gcp_key.json"), # Production with dashboard: from /test/scripts
        Path("dashboard/data/credentials/gcp_key.json"),    # Alternative production with dashboard
        Path("data/credentials/gcp_key.json")               # Fallback
    ]
    
    # Check if GOOGLE_APPLICATION_CREDENTIALS is already set
    if not os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        # Try to find the key file
        key_path = None
        for path in credential_paths:
            if path.exists():
                key_path = path
                break
        
        if key_path:
            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = str(key_path.absolute())
            print(f"🔑 Auto-detected GCP key: {key_path}")
    
    # Check if GCLOUD_PROJECT is already set
    if not os.getenv("GCLOUD_PROJECT"):
        # Try to read project ID from the key file
        creds_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
        if creds_path and Path(creds_path).exists():
            try:
                with open(creds_path, 'r') as f:
                    key_data = json.load(f)
                    project_id = key_data.get("project_id")
                    if project_id:
                        os.environ["GCLOUD_PROJECT"] = project_id
                        print(f"🌐 Auto-detected GCP project: {project_id}")
            except Exception as e:
                print(f"⚠️ Could not read project ID from key file: {e}")

def get_gemini_model(model_index: int = DEFAULT_BRAND_MODEL_INDEX):
    """Initialize Vertex AI and return the Gemini model for the given MODELS_CONFIG index."""
    # Auto-setup credentials if not already configured
    setup_gcp_credentials()
    
    active_model_config = MODELS_CONFIG[model_index]
    gcloud_project = os.getenv("GCLOUD_PROJECT")
    if not gcloud_project:
        raise ValueError("GCLOUD_PROJECT not found. Please check your GCP setup.")
    vertexai.init(project=gcloud_project, location=active_model_config["location"])
    return GenerativeModel(active_model_config["model_id"])

def print_usage_and_cost(response: GenerationResponse, model_index: int, stage_name: str) -> dict:
    """Calculates and prints usage and returns a dict with cost/token info."""
    usage_data = {"cost": 0, "tokens": 0}
    try:
        active_model_config = MODELS_CONFIG[model_index]
        usage = response.usage_metadata
        pricing = active_model_config["pricing"]
        input_tokens = usage.prompt_token_count
        output_tokens = usage.candidates_token_count
        usage_data["cost"] = (input_tokens * pricing["input"]) + (output_tokens * pricing["output"])
        usage_data["tokens"] = input_tokens + output_tokens
        print(f"   - Usage for {stage_name}: {(usage_data['tokens']):,} tokens, cost: ${usage_data['cost']:.6f}")
    except Exception: pass
    return usage_data

def clean_html_for_analysis(html_content: str) -> str:
    """Clean HTML by removing scripts, styles, and non-content elements while preserving structure and SPA content."""
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # For SPAs, preserve important meta tags that contain content
    important_meta_content = []
    meta_tags = soup.find_all('meta')
    for meta in meta_tags:
        if meta.get('name') in ['description', 'keywords'] or meta.get('property') in ['og:title', 'og:description', 'twitter:title', 'twitter:description']:
            content = meta.get('content', '')
            if content and len(content) > 10:  # Only meaningful content
                important_meta_content.append(f"Meta {meta.get('name') or meta.get('property')}: {content}")
    
    # Look for SPA enhanced content that our scraper added
    spa_enhanced = soup.find('div', {'id': 'spa-enhanced-content'})
    spa_content = ""
    if spa_enhanced:
        spa_content = spa_enhanced.get_text(strip=True)
    
    # Look for SPA discovered routes in meta tags
    spa_routes = soup.find('meta', {'name': 'spa-discovered-routes'})
    spa_routes_content = ""
    if spa_routes:
        routes = spa_routes.get('content', '').split(',')
        spa_routes_content = f"SPA Navigation Routes: {', '.join(routes)}"
    
    # Remove elements that don't contain useful content (but keep title for SPAs)
    for element in soup(["script", "style", "noscript", "meta", "link"]):
        element.decompose()
    
    # Remove comments
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()
    
    # Remove common tracking/analytics elements
    for element in soup.find_all(attrs={"class": re.compile(r"(analytics|tracking|gtm|fbpixel)", re.I)}):
        element.decompose()
    
    # Remove elements with common tracking IDs
    for element in soup.find_all(attrs={"id": re.compile(r"(google|facebook|twitter|linkedin).*track", re.I)}):
        element.decompose()
    
    # Clean up empty elements but preserve structure
    for element in soup.find_all():
        if not element.get_text(strip=True) and not element.find_all(['img', 'br', 'hr', 'input']):
            # Only remove if it's truly empty and not a structural element
            if element.name not in ['div', 'section', 'article', 'main', 'header', 'footer', 'nav']:
                element.decompose()
    
    # Get the cleaned HTML
    cleaned_html = str(soup)
    
    # For SPAs with minimal body content, prepend the extracted meta content
    body_text = soup.get_text(strip=True)
    if len(body_text) < 200 and (important_meta_content or spa_content or spa_routes_content):
        # This looks like an SPA - add the extracted content at the beginning
        spa_content_section = "\n".join([
            "=== SPA CONTENT EXTRACTED ===",
            *important_meta_content,
            spa_content,
            spa_routes_content,
            "=== END SPA CONTENT ===",
            cleaned_html
        ])
        return spa_content_section
    
    return cleaned_html

def extract_site_icons(html_content: str, base_url: str) -> dict:
    """Extract site icons and metadata from HTML."""
    soup = BeautifulSoup(html_content, 'html.parser')
    icons = {}
    
    # Standard favicon
    favicon = soup.find('link', rel='icon')
    if favicon and favicon.get('href'):
        icons['favicon'] = favicon['href']
    
    # Apple touch icons
    apple_icons = soup.find_all('link', rel=lambda x: x and 'apple-touch-icon' in x)
    if apple_icons:
        icons['apple_touch_icons'] = []
        for icon in apple_icons:
            if icon.get('href'):
                icon_data = {'href': icon['href']}
                if icon.get('sizes'):
                    icon_data['sizes'] = icon['sizes']
                icons['apple_touch_icons'].append(icon_data)
    
    # Other icon types
    icon_rels = ['shortcut icon', 'icon', 'mask-icon', 'fluid-icon']
    for rel in icon_rels:
        icon = soup.find('link', rel=rel)
        if icon and icon.get('href'):
            icons[rel.replace(' ', '_')] = icon['href']
    
    # Manifest file
    manifest = soup.find('link', rel='manifest')
    if manifest and manifest.get('href'):
        icons['manifest'] = manifest['href']
    
    # Open Graph image
    og_image = soup.find('meta', property='og:image')
    if og_image and og_image.get('content'):
        icons['og_image'] = og_image['content']
    
    # Twitter card image
    twitter_image = soup.find('meta', attrs={'name': 'twitter:image'})
    if twitter_image and twitter_image.get('content'):
        icons['twitter_image'] = twitter_image['content']
    
    return icons

def run_initial_analysis(text_content: str, encoded_image: str, model_index: int, site_icons: dict) -> tuple[dict, dict]:
    """Gets the brand profile AND navigation structure from the homepage."""
    model = get_gemini_model(model_index)
    if not model: raise ValueError("Could not initialize brand analysis model.")
    
    active_model_config = MODELS_CONFIG[model_index]
    print(f"🧠 Contacting Gemini ('{active_model_config['name']}') for Brand & Nav analysis...")
    
    icons_context = f"\n\nSITE ICONS FOUND:\n{json.dumps(site_icons, indent=2)}" if site_icons else ""
    
    prompt = f"""Analyze the provided website HTML and screenshot to create a comprehensive brand profile and extract the site's navigation structure.

    Return a single, valid JSON object with these top-level keys: `brand_profile`, `navigation_links`, and `site_metadata`.

    1. `brand_profile`: An object containing:
       - colors: Array of color objects with name and hex values
       - typography: Object with primary_font, secondary_font, and styles array
       - design_style: String describing overall aesthetic
       - keywords: Array of relevant business/brand keywords
       - brand_identity: Detailed string describing the brand personality and positioning
       - logo_description: Description of logo design and placement
       - visual_elements: Array describing key visual design patterns

    2. `navigation_links`: Array of objects with `text` and `href` properties for main navigation

    3. `site_metadata`: Object containing:
       - site_icons: The icons found on the site
       - primary_domain: The main domain name
       - site_title: The main site title
       - meta_description: Site description if available

    Focus on extracting actionable brand insights that will inform a redesign process.{icons_context}

    Return ONLY the valid JSON object.
    """
    
    image_part = Part.from_data(data=base64.b64decode(encoded_image), mime_type="image/jpeg")
    response = model.generate_content([prompt, text_content, image_part])
    usage_data = print_usage_and_cost(response, model_index, "Brand/Nav Analysis")
    match = re.search(r'\{.*\}', response.text, re.DOTALL)
    if not match:
        raise ValueError("Brand/Nav analysis response did not contain a JSON object.")
    return json.loads(match.group(0)), usage_data

def run_page_content_extraction(page_html: str, page_info: dict, model_index: int) -> tuple[dict, dict]:
    """Extracts content with embedded image placeholders and AI-powered quality assessment."""
    model = get_gemini_model(model_index)
    if not model: raise ValueError("Could not initialize content extraction model.")

    # Clean the HTML first (it is truncated to ~35k characters when inserted into the prompt below)
    cleaned_html = clean_html_for_analysis(page_html)
    
    prompt = f"""
    You are an expert content editor analyzing a webpage for content extraction and quality assessment.

    PAGE INFO: {json.dumps(page_info, indent=2)}

    Your tasks:
    1. Extract meaningful content with embedded image references
    2. Assess the extraction quality and determine if manual processing is needed
    3. Identify any issues that prevent proper content extraction

    CONTENT EXTRACTION:
    - Create `content_sections` where keys are descriptive section titles
    - For each section's text, insert `[image: filename.ext]` directly WHERE images appear in the content
    - Preserve natural flow - images should be embedded in sentences/paragraphs logically
    - Focus on main content - ignore headers, footers, navigation unless they contain unique information
    - If this appears to be iframe/embedded content, note it in the section title

    **IMPORTANT FOR SPAs (Single Page Applications):**
    - If you see "=== SPA CONTENT EXTRACTED ===" this indicates a React/Vue/Angular site
    - The meta descriptions and SPA content should be treated as the MAIN page content
    - Use meta descriptions to create meaningful content sections
    - SPA Navigation Routes indicate what this page is about
    - Don't penalize for minimal HTML body - SPAs load content dynamically
    - Focus on the extracted meta content and SPA-enhanced content

    QUALITY ASSESSMENT:
    Evaluate if this page was successfully extracted or needs manual processing by checking:
    - Is there sufficient meaningful content (not just navigation links)?
    - Did the scraping capture the actual page content or just shell/navigation?
    - Are there signs the page didn't load properly (minimal text, error messages)?
    - Is this an iframe/external content that may not have rendered correctly?
    - Does the content make sense for what this page should contain?
    - **For SPAs**: If meta content and SPA routes are present, consider this a SUCCESSFUL extraction

    EXAMPLE OUTPUT:
    {{
        "content_sections": {{
            "Hero Welcome": "Welcome to our restaurant! [image: hero-photo.jpg] We serve the best seafood on the island.",
            "Menu Highlights": "Try our famous fish tacos [image: fish-tacos.jpg] or signature burger [image: burger-special.jpg]."
        }},
        "extraction_quality": {{
            "needs_manual_processing": false,
            "confidence_score": 85,
            "issues_detected": [],
            "content_summary": "Successfully extracted hero section and menu highlights with good detail",
            "extraction_notes": "Page loaded completely with all expected content sections"
        }}
    }}

    For quality assessment:
    - `needs_manual_processing`: boolean - true if human review is recommended
    - `confidence_score`: 0-100 - how confident you are in the extraction quality
    - `issues_detected`: array of specific problems found (e.g., "Minimal content - only navigation links", "Page appears to be error/404", "Menu content missing - may be in iframe")
    - `content_summary`: brief description of what was successfully extracted
    - `extraction_notes`: any observations about the extraction process

    Return ONLY a valid JSON object with `content_sections` and `extraction_quality` keys.

    CLEANED HTML TO ANALYZE:
    ---
    {cleaned_html[:35000]}
    """
    
    response = model.generate_content(prompt)
    usage_data = print_usage_and_cost(response, model_index, "Page Content")
    match = re.search(r'\{.*\}', response.text, re.DOTALL)
    if not match:
        raise ValueError("Page content response did not contain a JSON object.")
    extracted_content = json.loads(match.group(0))
    
    return extracted_content, usage_data

def main():
    """Main function to run the enhanced two-stage analysis and aggregation process."""
    load_dotenv()
    model_options_text = ', '.join([f"{i}='{config['name']}'" for i, config in enumerate(MODELS_CONFIG)])
    parser = argparse.ArgumentParser(description="Extracts a structured design brief from scraped website data with AI-powered quality assessment.")
    parser.add_argument("site_dir", type=str, help="Path to a website's data directory.")
    parser.add_argument("--brand-model-index", type=int, default=DEFAULT_BRAND_MODEL_INDEX, choices=range(len(MODELS_CONFIG)))
    parser.add_argument("--content-model-index", type=int, default=DEFAULT_CONTENT_MODEL_INDEX, choices=range(len(MODELS_CONFIG)))
    args = parser.parse_args()

    site_data_path = Path(args.site_dir)
    if not site_data_path.is_dir():
        print(f"❌ Error: Directory not found at '{site_data_path}'")
        return
    
    total_usage_data = {"cost": 0, "tokens": 0}

    print("\n--- Stage 1: Enhanced Brand & Navigation Analysis ---")
    manifest_path = site_data_path / "manifest.json"
    if not manifest_path.is_file():
        print(f"❌ Error: manifest.json not found.")
        return

    with open(manifest_path, "r", encoding='utf-8') as f:
        manifest = json.load(f)

    pages_dir = site_data_path / "pages"
    homepage_info = next((p for p in manifest.get('pages', []) if urlparse(p.get('url', '')).path in ['/', '']), manifest.get('pages', [{}])[0])
    homepage_html_path = pages_dir / (Path(homepage_info.get('local_path', '')).name or "index.html")
    
    if not homepage_html_path.exists():
        print(f"❌ Error: Homepage HTML not found at {homepage_html_path}")
        return
        
    homepage_html_content = homepage_html_path.read_text(encoding='utf-8')
    homepage_text_content = homepage_html_content[:40000]  # Limit for API
    
    # Extract site icons from homepage
    site_icons = extract_site_icons(homepage_html_content, homepage_info.get('url', ''))
    print(f"   - Found {len(site_icons)} types of site icons/metadata")
    
    screenshot_path = site_data_path / "screenshots" / Path(homepage_info.get('screenshot', '')).name
    if not screenshot_path.is_file():
        print(f"❌ Error: Homepage screenshot not found.")
        return
    
    with Image.open(screenshot_path) as img:
        img.thumbnail((1024, 1024))
        buffered = BytesIO()
        img.save(buffered, format="JPEG", quality=85)
        encoded_image = base64.b64encode(buffered.getvalue()).decode('utf-8')

    try:
        initial_data, brand_usage = run_initial_analysis(homepage_text_content, encoded_image, args.brand_model_index, site_icons)
        total_usage_data["cost"] += brand_usage.get("cost", 0)
        total_usage_data["tokens"] += brand_usage.get("tokens", 0)
        print(f"✅ Stage 1 Complete.")
    except Exception as e:
        print(f"❌ Failed during Stage 1 brand analysis: {e}")
        return

    print(f"\n--- Stage 2: AI-Powered Content Extraction & Quality Assessment ---")
    
    extracted_pages = []
    pages_needing_manual = []
    total_pages = len(manifest.get('pages', []))
    successful_extractions = 0
    
    for page_info in manifest.get('pages', []):
        page_path = pages_dir / Path(page_info.get('local_path', '')).name
        if page_path.is_file():
            print(f"   - Processing content from: {page_path.name}")
            html_content = page_path.read_text(encoding='utf-8')
            try:
                page_content, page_usage = run_page_content_extraction(html_content, page_info, args.content_model_index)
                total_usage_data["cost"] += page_usage.get("cost", 0)
                total_usage_data["tokens"] += page_usage.get("tokens", 0)
                
                # Check AI quality assessment
                quality_assessment = page_content.get('extraction_quality', {})
                confidence_score = quality_assessment.get('confidence_score', 0)
                needs_manual = quality_assessment.get('needs_manual_processing', False)
                issues = quality_assessment.get('issues_detected', [])
                
                print(f"     📊 AI Confidence: {confidence_score}% | Manual needed: {needs_manual}")
                
                if needs_manual or confidence_score < 70:  # flag low-confidence extractions for manual review
                    pages_needing_manual.append({
                        "page_key": page_path.stem.replace('.html', ''),
                        "url": page_info.get('url'),
                        "confidence_score": confidence_score,
                        "issues_detected": issues,
                        "content_summary": quality_assessment.get('content_summary', ''),
                        "extraction_notes": quality_assessment.get('extraction_notes', '')
                    })
                    print(f"     ⚠️  Flagged for manual review: {', '.join(issues)}")
                else:
                    successful_extractions += 1
                    print(f"     ✅ Successfully extracted")
                
                extracted_page = {
                    "page_key": page_path.stem.replace('.html', ''),
                    "title": page_info.get('title'),
                    "url": page_info.get('url'),
                    "content_sections": page_content.get('content_sections', {}),
                    "extraction_quality": quality_assessment
                }
                
                extracted_pages.append(extracted_page)
                
            except Exception as e:
                print(f"   - ❌ Failed to process page {page_path.name}: {e}")
                pages_needing_manual.append({
                    "page_key": page_path.stem.replace('.html', ''),
                    "url": page_info.get('url'),
                    "confidence_score": 0,
                    "issues_detected": [f"Processing error: {str(e)}"],
                    "content_summary": "Failed to process",
                    "extraction_notes": "Error during AI analysis"
                })

    # --- Final Assembly ---
    final_brief = {
        "brand_profile": initial_data.get("brand_profile", {}),
        "navigation_links": initial_data.get("navigation_links", []),
        "site_metadata": initial_data.get("site_metadata", {}),
        "site_pages": extracted_pages,
        "extraction_summary": {
            "total_pages_processed": total_pages,
            "successful_extractions": successful_extractions,
            "pages_needing_manual_review": len(pages_needing_manual),
            "overall_success_rate": f"{(successful_extractions/total_pages*100):.1f}%" if total_pages > 0 else "0%"
        },
        "pages_requiring_manual_processing": pages_needing_manual
    }

    # Only save site_brief.json if we have successful extractions
    if successful_extractions > 0:
        output_path = site_data_path / "site_brief.json"
        try:
            with open(output_path, "w", encoding='utf-8') as f:
                json.dump(final_brief, f, indent=4)
            print(f"\n✅ All stages complete. Enhanced site brief saved to '{output_path}'")
        except Exception as e:
            print(f"\n❌ Error saving site brief: {e}")
            return
    else:
        print(f"\n❌ No successful extractions - not saving site_brief.json")
    print(f"📊 Extraction Summary:")
    print(f"   - Total pages: {total_pages}")
    print(f"   - Successfully extracted: {successful_extractions}")
    print(f"   - Flagged for manual review: {len(pages_needing_manual)}")
    print(f"   - Success rate: {(successful_extractions/total_pages*100):.1f}%" if total_pages > 0 else "0%")
    
    if pages_needing_manual:
        print(f"\n⚠️  Pages requiring manual attention:")
        for page in pages_needing_manual[:3]:  # Show first 3
            print(f"   - {page['page_key']}: {page.get('content_summary', 'No summary')}")
        if len(pages_needing_manual) > 3:
            print(f"   - ... and {len(pages_needing_manual) - 3} more (see site_brief.json)")
    
    # Print final JSON line for orchestrator to capture
    print(json.dumps(total_usage_data))

if __name__ == "__main__":
    main()