# 1_website_scraper.py
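"""Website scraper for redesign analysis.

Discovers a site's URLs via sitemap parsing and link crawling, asks a Gemini
model (Vertex AI) to select the pages most relevant for a redesign, then
scrapes the selected pages (httpx-only by default, with an optional Playwright
path) and writes pages, assets, screenshots, and a manifest.json under
../data/website_data/<domain>/.
"""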

import os
import json
import argparse
from pathlib import Path
from urllib.parse import urljoin, urlparse, parse_qs, urlencode
import datetime
import mimetypes
import re
import asyncio
import platform
import shutil

# Environment detection
def detect_cpanel_environment():
    """Detect if we're running on cPanel shared hosting."""
    cpanel_indicators = [
        os.path.exists('/usr/local/cpanel'),
        'cpanel' in platform.node().lower(),
        os.getenv('CPANEL_USER') is not None,
        os.path.exists('/home') and not os.path.exists('/Applications'),  # Heuristic: Linux-style filesystem without macOS /Applications
    ]
    return any(cpanel_indicators)

def setup_gcp_credentials():
    """Auto-setup GCP credentials from default location or environment."""
    # Default paths to check for credentials
    default_key_path = Path("../data/credentials/gcp_key.json")
    alt_key_path = Path("./data/credentials/gcp_key.json")
    
    # Check if GOOGLE_APPLICATION_CREDENTIALS is already set
    if not os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        # Try to find the key file
        key_path = None
        if default_key_path.exists():
            key_path = default_key_path
        elif alt_key_path.exists():
            key_path = alt_key_path
        
        if key_path:
            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = str(key_path.absolute())
            print(f"🔑 Auto-detected GCP key: {key_path}")
    
    # Check if GCLOUD_PROJECT is already set
    if not os.getenv("GCLOUD_PROJECT"):
        # Try to read project ID from the key file
        creds_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
        if creds_path and Path(creds_path).exists():
            try:
                with open(creds_path, 'r') as f:
                    key_data = json.load(f)
                    project_id = key_data.get("project_id")
                    if project_id:
                        os.environ["GCLOUD_PROJECT"] = project_id
                        print(f"🌐 Auto-detected GCP project: {project_id}")
            except Exception as e:
                print(f"⚠️ Could not read project ID from key file: {e}")

IS_CPANEL = detect_cpanel_environment()

# Setup GCP credentials automatically
setup_gcp_credentials()

# Try to import Playwright, fall back to httpx-only mode if not available
try:
    from playwright.async_api import async_playwright, Page, BrowserContext
    PLAYWRIGHT_AVAILABLE = True
except ImportError:
    PLAYWRIGHT_AVAILABLE = False
    print("🔄 Playwright not available, using httpx-only mode (cPanel compatible)")
    # Define dummy types for type hints when Playwright isn't available
    Page = None
    BrowserContext = None

# Force httpx mode always (for consistent behavior)
PLAYWRIGHT_AVAILABLE = False
print("🌍 Using httpx-only mode for consistent behavior across all environments")

from bs4 import BeautifulSoup
from tqdm import tqdm
from dotenv import load_dotenv
import httpx
import xml.etree.ElementTree as ET

import vertexai
from vertexai.generative_models import GenerativeModel, GenerationResponse

# --- Configuration ---
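# Timeouts and RENDER_DELAY below are in milliseconds (Playwright units); RETRY_DELAYS are in seconds.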
PAGE_LOAD_TIMEOUT = 60000
ASSET_TIMEOUT = 20000
RENDER_DELAY = 3000
MAX_RETRIES = 3
RETRY_DELAYS = [5, 10, 20]  # Seconds to wait between retries
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
VIEWPORT_SIZE = {"width": 1920, "height": 1080}
OUTPUT_DIR = Path("../data/website_data")
MAX_CRAWL_DEPTH = 3
MAX_URLS_PER_DOMAIN = 500

# --- AI Configuration with Pricing ---
MODELS_CONFIG = [
    {
        "name": "Gemini 2.5 Pro (GA)", 
        "model_id": "gemini-2.5-pro", 
        "location": "us-east1",
        "pricing": {"input": 1.25 / 1_000_000, "output": 10.00 / 1_000_000}
    },
    {
        "name": "Gemini 2.5 Flash (GA)", 
        "model_id": "gemini-2.5-flash", 
        "location": "us-east1",
        "pricing": {"input": 0.30 / 1_000_000, "output": 2.50 / 1_000_000}
    },
    {
        "name": "Gemini 2.5 Flash-Lite (Preview)", 
        "model_id": "gemini-2.5-flash-lite-preview-06-17", 
        "location": "global",
        "pricing": {"input": 0.10 / 1_000_000, "output": 0.40 / 1_000_000}
    }
]
DEFAULT_PLANNER_MODEL_INDEX = 1
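# Pricing values above are USD per token (the listed per-1M-token prices divided by 1,000,000);
# index 1 selects Gemini 2.5 Flash as the default planner model.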

# --- Helper Functions ---

def sanitize_filename(url: str) -> str:
    """Cleans a URL path to be a valid filename."""
    url_path = urlparse(url).path
    if not url_path or url_path == '/':
        return "index.html"
    filename = url_path.strip('/').replace('/', '_')
    if not Path(filename).suffix:
        return f"{filename}.html"
    return filename

def normalize_url(url: str) -> str:
    """Normalize URL by removing fragments, sorting query params, etc."""
    parsed = urlparse(url)
    # Remove common tracking parameters
    query_params = parse_qs(parsed.query)
    filtered_params = {k: v for k, v in query_params.items() 
                      if k not in ['utm_source', 'utm_medium', 'utm_campaign', 'fbclid', 'gclid']}
    
    # Reconstruct the URL without the fragment and with sorted query params
    clean_query = urlencode(sorted(filtered_params.items()), doseq=True)
    
    return f"{parsed.scheme}://{parsed.netloc}{parsed.path}" + (f"?{clean_query}" if clean_query else "")

def is_valid_internal_url(url: str, base_netloc: str) -> bool:
    """Checks if URL is valid, internal, and not a file/anchor."""
    try:
        parsed_url = urlparse(url)
        url_netloc_norm = parsed_url.netloc.replace('www.', '')
        base_netloc_norm = base_netloc.replace('www.', '')
        
        # Skip obvious non-content URLs
        skip_patterns = [
            r'\.pdf$', r'\.zip$', r'\.mp4$', r'\.mov$', r'\.jpg$', r'\.png$', r'\.gif$',
            r'/wp-admin/', r'/admin/', r'/login', r'/logout', r'/register',
            r'mailto:', r'tel:', r'#', r'javascript:'
        ]
        
        for pattern in skip_patterns:
            if re.search(pattern, url, re.IGNORECASE):
                return False
        
        return (
            parsed_url.scheme in ['http', 'https'] and
            url_netloc_norm == base_netloc_norm and
            len(parsed_url.path) < 200  # Avoid extremely long URLs
        )
    except Exception:
        return False

async def retry_with_backoff(func, *args, max_retries=MAX_RETRIES, delays=RETRY_DELAYS, **kwargs):
    """Retry a function with exponential backoff on connection errors."""
    last_exception = None
    
    for attempt in range(max_retries + 1):
        try:
            return await func(*args, **kwargs)
        except Exception as e:
            last_exception = e
            error_msg = str(e).lower()
            
            # Check if this is a retryable error
            retryable_errors = [
                'connection_timed_out', 'net::err_connection_timed_out',
                'net::err_connection_refused', 'net::err_name_not_resolved',
                'timeout', 'connection refused', 'connection reset'
            ]
            
            is_retryable = any(error in error_msg for error in retryable_errors)
            
            if not is_retryable or attempt == max_retries:
                # Don't retry on non-connection errors or if max retries reached
                raise e
            
            delay = delays[min(attempt, len(delays) - 1)]
            print(f"     🔄 Connection error (attempt {attempt + 1}/{max_retries + 1}): {str(e)[:100]}...")
            print(f"     ⏳ Retrying in {delay} seconds...")
            await asyncio.sleep(delay)
    
    # This shouldn't be reached, but just in case
    raise last_exception

def classify_iframe_url(src: str, title: str = "", aria_label: str = "", class_name: str = "") -> dict:
    """Classify an iframe URL to determine if it's likely to contain useful content."""
    context = f"{title} {aria_label} {class_name}".lower()
    
    # Skip obvious tracking/social media iframes
    skip_patterns = [
        r'google.*analytics', r'facebook\.com/plugins', r'twitter\.com', r'instagram\.com',
        r'youtube\.com/embed', r'maps\.google', r'googletagmanager',
        r'doubleclick\.net', r'googlesyndication', r'amazon-adsystem'
    ]
    
    if any(re.search(pattern, src, re.IGNORECASE) for pattern in skip_patterns):
        return {"should_scrape": False, "reason": "tracking/ads/social", "priority": 0}
    
    # High priority content indicators
    high_priority_patterns = [
        (r'menu', r'menu|food|dining|restaurant'),
        (r'shop|store|product', r'shop|store|product|catalog|ecommerce'),
        (r'book|reservation', r'book|reservation|appointment|schedule'),
        (r'gallery|photo', r'gallery|photo|image|portfolio'),
        (r'form|contact', r'form|contact|inquiry|message'),
        (r'event|calendar', r'event|calendar|schedule|booking')
    ]
    
    # Medium priority content indicators  
    medium_priority_patterns = [
        (r'app\.|widget', r'app|widget|tool|calculator'),
        (r'embed', r'embed|content|article'),
        (r'player|media', r'player|media|audio|video')
    ]
    
    priority = 0
    content_type = "unknown"
    
    # Check for high priority content
    for url_pattern, context_pattern in high_priority_patterns:
        if re.search(url_pattern, src, re.IGNORECASE) or re.search(context_pattern, context):
            priority = 3
            content_type = url_pattern.split('|')[0]
            break
    
    # Check for medium priority content
    if priority == 0:
        for url_pattern, context_pattern in medium_priority_patterns:
            if re.search(url_pattern, src, re.IGNORECASE) or re.search(context_pattern, context):
                priority = 2
                content_type = url_pattern.split('|')[0] 
                break
    
    # Default low priority for other external content
    if priority == 0 and not src.startswith('data:'):
        priority = 1
        content_type = "external_content"
    
    return {
        "should_scrape": priority >= 2,  # Only scrape medium+ priority
        "reason": content_type,
        "priority": priority,
        "context": context.strip()
    }

async def discover_iframe_urls(page, page_url: str) -> list[dict]:
    """Discover iframe URLs on a page and classify them for potential scraping."""
    iframe_discoveries = []
    
    try:
        # Get all iframe elements with their attributes
        iframes_data = await page.evaluate('''() => {
            const iframes = Array.from(document.querySelectorAll('iframe'));
            return iframes.map((iframe, index) => ({
                src: iframe.src,
                title: iframe.title || '',
                ariaLabel: iframe.getAttribute('aria-label') || '',
                className: iframe.className || '',
                id: iframe.id || '',
                width: iframe.width || '',
                height: iframe.height || ''
            }));
        }''')
        
        if not iframes_data:
            return iframe_discoveries
        
        print(f"    🔍 Found {len(iframes_data)} iframes, analyzing for content...")
        
        for i, iframe_data in enumerate(iframes_data):
            src = iframe_data.get('src', '')
            if not src or src.startswith('data:'):
                continue
            
            # Make URL absolute
            absolute_src = urljoin(page_url, src)
            
            # Classify the iframe
            classification = classify_iframe_url(
                absolute_src,
                iframe_data.get('title', ''),
                iframe_data.get('ariaLabel', ''),
                iframe_data.get('className', '')
            )
            
            discovery = {
                "index": i,
                "src": absolute_src,
                "found_on_page": page_url,
                "title": iframe_data.get('title', ''),
                "aria_label": iframe_data.get('ariaLabel', ''),
                "class_name": iframe_data.get('className', ''),
                "classification": classification
            }
            
            iframe_discoveries.append(discovery)
            
            # Log the decision
            status = "✅ INCLUDE" if classification["should_scrape"] else "⏭️ SKIP"
            print(f"      {status} Iframe {i}: {classification['reason']} (priority: {classification['priority']}) - {absolute_src[:80]}...")
            if classification.get("context"):
                print(f"        Context: {classification['context'][:100]}")
    
    except Exception as e:
        print(f"    ❌ Iframe discovery error: {str(e)[:100]}")
    
    return iframe_discoveries

async def fetch_sitemap_urls(base_url: str) -> set[str]:
    """Fetch URLs from sitemap.xml and robots.txt."""
    found_urls = set()
    
    print("🗺️  Phase 1: Checking for sitemaps...")
    
    async with httpx.AsyncClient(timeout=30.0) as client:
        # Check common sitemap locations
        sitemap_urls = [
            f"{base_url}/sitemap.xml",
            f"{base_url}/sitemap_index.xml",
            f"{base_url}/sitemap.txt",
            f"{base_url}/robots.txt"
        ]
        
        # Check robots.txt for sitemap references
        try:
            robots_response = await client.get(f"{base_url}/robots.txt")
            if robots_response.status_code == 200:
                robots_content = robots_response.text
                sitemap_matches = re.findall(r'Sitemap:\s*(.+)', robots_content, re.IGNORECASE)
                sitemap_urls.extend(sitemap_matches)
                print(f"   - Found {len(sitemap_matches)} sitemaps in robots.txt")
        except Exception:
            pass
        
        # Process all sitemap URLs
        for sitemap_url in set(sitemap_urls):
            try:
                response = await client.get(sitemap_url)
                if response.status_code == 200:
                    content = response.text
                    
                    if sitemap_url.endswith('.xml'):
                        # Parse XML sitemap
                        try:
                            root = ET.fromstring(content)
                            # Handle different sitemap namespaces
                            for url_elem in root.findall('.//{http://www.sitemaps.org/schemas/sitemap/0.9}url'):
                                loc_elem = url_elem.find('{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
                                if loc_elem is not None:
                                    found_urls.add(loc_elem.text.strip())
                            
                            # Handle sitemap index files
                            for sitemap_elem in root.findall('.//{http://www.sitemaps.org/schemas/sitemap/0.9}sitemap'):
                                loc_elem = sitemap_elem.find('{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
                                if loc_elem is not None:
                                    # Recursively fetch sub-sitemaps
                                    sub_response = await client.get(loc_elem.text.strip())
                                    if sub_response.status_code == 200:
                                        sub_root = ET.fromstring(sub_response.text)
                                        for url_elem in sub_root.findall('.//{http://www.sitemaps.org/schemas/sitemap/0.9}url'):
                                            loc_elem = url_elem.find('{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
                                            if loc_elem is not None:
                                                found_urls.add(loc_elem.text.strip())
                                                
                        except ET.ParseError:
                            pass
                    
                    elif sitemap_url.endswith('.txt'):
                        # Parse text sitemap
                        for line in content.split('\n'):
                            line = line.strip()
                            if line and line.startswith('http'):
                                found_urls.add(line)
                                
            except Exception as e:
                print(f"   - Could not fetch {sitemap_url}: {e}")
    
    print(f"   - Found {len(found_urls)} URLs in sitemaps")
    return found_urls

async def crawl_site_manually(base_url: str, context) -> tuple[set[str], list[dict]]:
    """Manual crawl to discover additional URLs and iframe content."""
    print("🕷️  Phase 2: Manual site crawling with iframe discovery...")
    
    found_urls = set()
    visited_urls = set()
    urls_to_visit = {base_url}
    base_netloc = urlparse(base_url).netloc
    all_iframe_discoveries = []
    
    depth = 0
    while urls_to_visit and depth < MAX_CRAWL_DEPTH and len(found_urls) < MAX_URLS_PER_DOMAIN:
        current_level_urls = urls_to_visit.copy()
        urls_to_visit.clear()
        depth += 1
        
        print(f"   - Crawling depth {depth}: {len(current_level_urls)} URLs")
        
        for url in current_level_urls:
            if url in visited_urls:
                continue
                
            visited_urls.add(url)
            page = None
            
            try:
                page = await context.new_page()
                
                # Navigate with retry logic
                async def navigate_crawl_page():
                    return await page.goto(url, wait_until="domcontentloaded", timeout=PAGE_LOAD_TIMEOUT)
                
                await retry_with_backoff(navigate_crawl_page)
                await page.wait_for_timeout(2000)  # Brief wait for dynamic content
                
                # Discover iframes on this page
                iframe_discoveries = await discover_iframe_urls(page, url)
                all_iframe_discoveries.extend(iframe_discoveries)
                
                # Extract all links from the page including SPA navigation
                links = await page.evaluate("""
                    () => {
                        const links = new Set();
                        
                        // Standard links
                        document.querySelectorAll('a[href]').forEach(link => {
                            const href = link.href;
                            if (href && !href.startsWith('javascript:') && !href.startsWith('mailto:') && !href.startsWith('tel:') && !href.startsWith('#')) {
                                links.add(href);
                            }
                        });
                        
                        // Navigation and footer links
                        document.querySelectorAll('nav a, .menu a, .navigation a, footer a, .footer a, header a').forEach(link => {
                            const href = link.href;
                            if (href && !href.startsWith('javascript:') && !href.startsWith('mailto:') && !href.startsWith('tel:') && !href.startsWith('#')) {
                                links.add(href);
                            }
                        });
                        
                        // Look for SPA-style navigation by extracting href attributes that might be relative paths
                        document.querySelectorAll('a[href^="/"], a[href^="./"], a[href^="../"]').forEach(link => {
                            const href = link.getAttribute('href');
                            if (href && !href.startsWith('#') && !href.includes('javascript:')) {
                                // Convert relative URLs to absolute
                                const absoluteUrl = new URL(href, window.location.origin).href;
                                links.add(absoluteUrl);
                            }
                        });
                        
                        // Also check for common navigation patterns in text content
                        const navTexts = ['about', 'contact', 'services', 'products', 'menu', 'gallery', 'team', 'blog', 'news'];
                        document.querySelectorAll('a').forEach(link => {
                            const text = link.textContent.toLowerCase().trim();
                            const href = link.getAttribute('href');
                            if (href && navTexts.some(navText => text.includes(navText))) {
                                if (href.startsWith('/') || href.startsWith('./') || href.startsWith('../')) {
                                    const absoluteUrl = new URL(href, window.location.origin).href;
                                    links.add(absoluteUrl);
                                } else if (href.startsWith('http')) {
                                    links.add(href);
                                }
                            }
                        });
                        
                        return Array.from(links);
                    }
                """)
                
                for link in links:
                    normalized_link = normalize_url(link)
                    if is_valid_internal_url(normalized_link, base_netloc):
                        found_urls.add(normalized_link)
                        if normalized_link not in visited_urls and depth < MAX_CRAWL_DEPTH:
                            urls_to_visit.add(normalized_link)
                            
            except Exception as e:
                print(f"   - Failed to crawl {url}: {str(e)[:100]}...")
            finally:
                if page:
                    await page.close()
    
    # Add iframe URLs that should be scraped
    iframe_urls_to_scrape = [
        discovery["src"] for discovery in all_iframe_discoveries 
        if discovery["classification"]["should_scrape"]
    ]
    
    found_urls.update(iframe_urls_to_scrape)
    
    print(f"   - Manual crawl found {len(found_urls)} total URLs")
    print(f"   - Including {len(iframe_urls_to_scrape)} iframe content URLs")
    
    return found_urls, all_iframe_discoveries

async def get_gemini_model(model_index: int):
    """Initialize and return Gemini model."""
    try:
        # Auto-setup credentials if not already configured
        setup_gcp_credentials()
        
        model_config = MODELS_CONFIG[model_index]
        gcloud_project = os.getenv("GCLOUD_PROJECT")
        if not gcloud_project:
            print("🔴 Error: GCLOUD_PROJECT not found. Please check your GCP setup.")
            return None
        
        vertexai.init(project=gcloud_project, location=model_config["location"])
        return GenerativeModel(model_config["model_id"])
    except Exception as e:
        print(f"🔴 Error initializing Vertex AI: {e}")
        return None

async def ai_filter_urls_for_redesign(all_urls: set[str], iframe_discoveries: list[dict], site_dir: Path, model_index: int) -> tuple[dict, dict]:
    """Use AI to intelligently select URLs most important for website redesign, including iframe context."""
    
    model = await get_gemini_model(model_index)
    if not model:
        raise ConnectionError("Could not initialize AI model for URL filtering.")
    
    # Convert URLs to list and prepare iframe context
    url_list = sorted(list(all_urls))
    
    # Create iframe context for AI
    iframe_context = {}
    for discovery in iframe_discoveries:
        if discovery["classification"]["should_scrape"]:
            page_url = discovery["found_on_page"]
            if page_url not in iframe_context:
                iframe_context[page_url] = []
            iframe_context[page_url].append({
                "iframe_url": discovery["src"],
                "title": discovery["title"],
                "type": discovery["classification"]["reason"],
                "context": discovery["classification"]["context"]
            })
    
    print(f"🧠 Phase 3: AI filtering {len(all_urls)} URLs for redesign relevance...")
    
    prompt = f"""
You are an expert web designer planning a complete website redesign. Your task is to select the 5-10 MOST IMPORTANT pages from this website that will give you everything needed to understand the business and create an amazing new design.

CRITICAL SELECTION RULES:
1. **MAXIMUM 5-10 PAGES**: You must be highly selective and choose only the most essential pages
2. **ALWAYS INCLUDE**: Homepage (highest priority), About, Contact, Main Services/Menu page
3. **BE SELECTIVE**: Only include pages that provide unique, essential information about the business
4. **CONTACT IS ESSENTIAL**: Any page that could be contact info (even with "copy-of" in URL) should be included
5. **IFRAME CONTENT**: Some pages have important content in iframes (like menus, shops, booking forms) - these are valuable!
6. **AVOID DUPLICATES**: Skip similar/repetitive pages - choose only the best representative example
7. **FOCUS ON CORE BUSINESS**: Prioritize pages that show what the company does and how to contact them

IFRAME CONTENT DISCOVERED:
{json.dumps(iframe_context, indent=2)}

DISCOVERED URLs ({len(url_list)} total):
{json.dumps(url_list, indent=2)}

INSTRUCTIONS:
Analyze each URL carefully and select ONLY the 5-10 most essential pages for a complete website redesign. Be ruthless in your selection - quality over quantity. Pay special attention to:
- Homepage (MUST include)
- About/Company information (MUST include if exists)
- Contact information (MUST include if exists)
- Main services/products/menu (select ONE best page)
- Pages with iframe content (especially menus, shops, booking forms)
- One example of each unique content type (not multiple similar pages)

Return a JSON object with:
- "page_limit": integer (5-10 maximum, be conservative)
- "selection_reasoning": string explaining your strategy for choosing these specific pages
- "included_pages": array of selected URLs (5-10 pages maximum)
- "excluded_reasoning": string explaining what you skipped and why (focus on avoiding duplicates)

REMEMBER: Less is more! Choose only the pages absolutely essential to understand the business and its offerings.
"""
    
    response = await model.generate_content_async(prompt)
    
    # Calculate usage with proper pricing
    usage_data = {"cost": 0, "tokens": 0}
    try:
        usage = response.usage_metadata
        model_config = MODELS_CONFIG[model_index]
        pricing = model_config.get("pricing", {"input": 0, "output": 0})
        
        input_tokens = usage.prompt_token_count
        output_tokens = usage.candidates_token_count
        
        input_cost = input_tokens * pricing["input"]
        output_cost = output_tokens * pricing["output"]
        
        usage_data["cost"] = input_cost + output_cost
        usage_data["tokens"] = input_tokens + output_tokens
        
        print(f"   - AI Usage: {usage_data['tokens']:,} tokens, ${usage_data['cost']:.6f}")
        
    except Exception as e:
        print(f"   - Cost calculation error: {e}")
    
    # Parse JSON response
    json_match = re.search(r'\{.*\}', response.text, re.DOTALL)
    if not json_match:
        raise ValueError("AI did not return valid JSON response.")
    
    plan = json.loads(json_match.group(0))
    
    # Save the comprehensive analysis
    analysis_data = {
        "total_urls_found": len(all_urls),
        "all_discovered_urls": url_list,
        "iframe_discoveries": iframe_discoveries,
        "iframe_context": iframe_context,
        "ai_selection_plan": plan,
        "excluded_pages": list(set(all_urls) - set(plan.get("included_pages", [])))
    }
    
    analysis_path = site_dir / "url_analysis.json"
    try:
        with open(analysis_path, "w", encoding='utf-8') as f:
            json.dump(analysis_data, f, indent=4)
        print(f"📊 Full URL analysis saved to: {analysis_path}")
    except Exception as e:
        print(f"⚠️ Warning: Could not save URL analysis: {e}")
    
    print(f"✅ AI selected {len(plan.get('included_pages', []))} pages for redesign analysis")
    
    return plan, usage_data

async def discover_and_download_assets(page, base_url: str, assets_dir: Path, downloaded_urls: set) -> tuple[BeautifulSoup, set[str]]:
    """Enhanced asset discovery and download."""
    image_dir, css_dir = assets_dir / "images", assets_dir / "css"
    images_on_page = set()
    
    # Get all asset URLs from the page
    asset_urls = await page.evaluate('''() => {
        const urls = new Set();
        
        // CSS files
        document.querySelectorAll('link[rel="stylesheet"]').forEach(l => urls.add(l.href));
        document.querySelectorAll('style[data-href]').forEach(s => urls.add(s.dataset.href));
        
        // Images - including lazy loaded and responsive images
        document.querySelectorAll('img, wow-image, picture source').forEach(i => {
            if (i.src && !i.src.startsWith('data:')) urls.add(i.src);
            if (i.dataset.src && !i.dataset.src.startsWith('data:')) urls.add(i.dataset.src);
            if (i.srcset) {
                i.srcset.split(',').forEach(p => {
                    const u = p.trim().split(' ')[0];
                    if (u && !u.startsWith('data:')) urls.add(u);
                });
            }
        });
        
        // Background images from CSS
        document.querySelectorAll('*').forEach(e => {
            const styles = window.getComputedStyle(e);
            const bgImage = styles.backgroundImage;
            if (bgImage && bgImage !== 'none') {
                const matches = bgImage.match(/url\\("?([^")]+)"?\\)/g);
                if (matches) {
                    matches.forEach(match => {
                        const url = match.match(/url\\("?([^")]+)"?\\)/)[1];
                        if (!url.startsWith('data:')) urls.add(url);
                    });
                }
            }
        });
        
        return Array.from(urls);
    }''')
    
    # Download assets
    for asset_url in asset_urls:
        if not asset_url or asset_url in downloaded_urls:
            continue
            
        asset_page = None
        try:
            absolute_asset_url = urljoin(base_url, asset_url)
            if absolute_asset_url.startswith('data:'):
                continue
            
            asset_page = await page.context.new_page()
            response = await asset_page.goto(absolute_asset_url, timeout=ASSET_TIMEOUT)
            
            if response and response.ok:
                content_type = response.headers.get('content-type', '')
                body = await response.body()
                filename_str = Path(urlparse(absolute_asset_url).path).name
                
                if not filename_str:
                    continue
                
                if 'css' in content_type:
                    (css_dir / filename_str).write_text(body.decode('utf-8', errors='ignore'), encoding='utf-8')
                elif 'image' in content_type:
                    ext = mimetypes.guess_extension(content_type) or ''
                    filename = Path(f"{Path(filename_str).stem}{ext}" if not Path(filename_str).suffix and ext else filename_str)
                    (image_dir / filename).write_bytes(body)
                    images_on_page.add(filename.name)
                
                downloaded_urls.add(asset_url)
                
        except Exception:
            pass
        finally:
            if asset_page:
                await asset_page.close()
    
    # Update HTML with local asset paths
    soup = BeautifulSoup(await page.content(), 'html.parser')
    
    # Update CSS links
    for tag in soup.find_all(True, href=True):
        if tag.has_attr('href'):
            filename = Path(urlparse(tag['href']).path).name
            if (css_dir / filename).exists():
                tag['href'] = f'../assets/css/{filename}'
    
    # Update image sources
    for tag in soup.find_all(True, src=True):
        if tag.has_attr('src'):
            filename = Path(urlparse(tag['src']).path).name
            if (image_dir / filename).exists():
                tag['src'] = f'../assets/images/{filename}'
    
    # Update srcset attributes
    for tag in soup.find_all(True, srcset=True):
        if tag.has_attr('srcset'):
            new_srcset = []
            for part in tag['srcset'].split(','):
                url_part = part.strip().split(' ')[0]
                filename = Path(urlparse(url_part).path).name
                if (image_dir / filename).exists():
                    size_part = part.strip().split(' ')[1] if ' ' in part.strip() else ''
                    new_srcset.append(f"../assets/images/{filename} {size_part}".strip())
            if new_srcset:
                tag['srcset'] = ', '.join(new_srcset)
    
    # Update inline styles with background images
    for element in soup.find_all(style=re.compile(r'background-image')):
        if element.has_attr('style'):
            style = element['style']
            matches = re.findall(r'url\("?([^")]+)"?\)', style)
            for match in matches:
                filename = Path(urlparse(match).path).name
                if (image_dir / filename).exists():
                    style = style.replace(match, f'../assets/images/{filename}')
            element['style'] = style
    
    return soup, images_on_page

async def crawl_site_manually_httpx(base_url: str) -> set[str]:
    """Deep crawling with httpx to discover URLs by following links of links."""
    print("🕷️  Phase 2: Deep crawling with httpx (links of links)...")
    
    found_urls = set()
    visited_urls = set()
    urls_to_visit = {base_url}
    base_netloc = urlparse(base_url).netloc
    
    depth = 0
    async with httpx.AsyncClient(
        timeout=30.0, 
        headers={'User-Agent': USER_AGENT}, 
        follow_redirects=True
    ) as client:
        
        while urls_to_visit and depth < MAX_CRAWL_DEPTH and len(found_urls) < MAX_URLS_PER_DOMAIN:
            current_level_urls = urls_to_visit.copy()
            urls_to_visit.clear()
            depth += 1
            
            print(f"   - Crawling depth {depth}: {len(current_level_urls)} URLs")
            
            for url in current_level_urls:
                if url in visited_urls:
                    continue
                    
                visited_urls.add(url)
                found_urls.add(url)
                
                try:
                    response = await client.get(url)
                    if response.status_code == 200:
                        soup = BeautifulSoup(response.text, 'html.parser')
                        
                        # Extract all links from the current page
                        page_links = set()
                        
                        # Standard links
                        for link in soup.find_all('a', href=True):
                            href = link['href']
                            if href and not href.startswith(('javascript:', 'mailto:', 'tel:', '#')):
                                absolute_url = urljoin(url, href)
                                if is_valid_internal_url(absolute_url, base_netloc):
                                    page_links.add(normalize_url(absolute_url))
                        
                        # Navigation and footer links (higher priority)
                        for nav_selector in ['nav a', '.menu a', '.navigation a', 'footer a', '.footer a', 'header a']:
                            for link in soup.select(nav_selector):
                                href = link.get('href')
                                if href and not href.startswith(('javascript:', 'mailto:', 'tel:', '#')):
                                    absolute_url = urljoin(url, href)
                                    if is_valid_internal_url(absolute_url, base_netloc):
                                        page_links.add(normalize_url(absolute_url))
                        
                        # Look for common navigation patterns in text content
                        nav_texts = ['about', 'contact', 'services', 'products', 'menu', 'gallery', 'team', 'blog', 'news', 'portfolio']
                        for link in soup.find_all('a', href=True):
                            text = link.get_text().lower().strip()
                            href = link['href']
                            if href and any(nav_text in text for nav_text in nav_texts):
                                if not href.startswith(('javascript:', 'mailto:', 'tel:', '#')):
                                    absolute_url = urljoin(url, href)
                                    if is_valid_internal_url(absolute_url, base_netloc):
                                        page_links.add(normalize_url(absolute_url))
                        
                        # Look for SPA/React Router patterns in JavaScript
                        html_content = response.text
                        
                        # Check for React Router routes in script tags or data attributes
                        
                        # Look for route patterns like "/about", "/contact", etc.
                        route_patterns = [
                            r'["\']\/([a-zA-Z0-9\-_]+)["\']',  # "/about", "/contact"
                            r'to=["\']\/([a-zA-Z0-9\-_]+)["\']',  # React Router Link to="/about"
                            r'href=["\']\/([a-zA-Z0-9\-_]+)["\']',  # href="/about"
                            r'path=["\']\/([a-zA-Z0-9\-_]+)["\']',  # Route path="/about"
                        ]
                        
                        for pattern in route_patterns:
                            matches = re.findall(pattern, html_content)
                            for match in matches:
                                # Skip common non-page routes
                                if match not in ['api', 'assets', 'static', 'js', 'css', 'images', 'img', 'fonts', 'favicon']:
                                    potential_url = f"{base_url.rstrip('/')}/{match}"
                                    if is_valid_internal_url(potential_url, base_netloc):
                                        page_links.add(normalize_url(potential_url))
                        
                        # Look for JavaScript bundles (Vite, Webpack, etc.) and analyze them
                        js_files = []
                        for script in soup.find_all('script', src=True):
                            js_src = script.get('src')
                            if js_src and ('.js' in js_src):
                                js_url = urljoin(url, js_src)
                                js_files.append(js_url)
                        
                        # Analyze JavaScript bundles for routes
                        for js_url in js_files[:3]:  # Limit to first 3 JS files to avoid too many requests
                            try:
                                js_response = await client.get(js_url)
                                if js_response.status_code == 200:
                                    js_content = js_response.text
                                    
                                    # Look for React Router patterns in JS bundles
                                    js_route_patterns = [
                                        r'path:\s*["\']\/([a-zA-Z0-9\-_]+)["\']',  # path: "/about"
                                        r'["\']\/([a-zA-Z0-9\-_]+)["\'](?=\s*[,}])',  # "/about" followed by comma or }
                                        r'to=["\']\/([a-zA-Z0-9\-_]+)["\']',  # to="/about"
                                        r'href=["\']\/([a-zA-Z0-9\-_]+)["\']',  # href="/about"
                                        r'navigate\(["\']\/([a-zA-Z0-9\-_]+)["\']',  # navigate("/about")
                                    ]
                                    
                                    js_routes_found = set()
                                    for pattern in js_route_patterns:
                                        matches = re.findall(pattern, js_content)
                                        for match in matches:
                                            # Skip common non-page routes
                                            if match not in ['api', 'assets', 'static', 'js', 'css', 'images', 'img', 'fonts', 'favicon', 'login', 'logout', 'admin']:
                                                js_routes_found.add(match)
                                    
                                    # Add found routes to page_links
                                    for route in js_routes_found:
                                        potential_url = f"{base_url.rstrip('/')}/{route}"
                                        if is_valid_internal_url(potential_url, base_netloc):
                                            page_links.add(normalize_url(potential_url))
                                    
                                    if js_routes_found:
                                        print(f"     - Found {len(js_routes_found)} routes in JS bundle: {', '.join(sorted(js_routes_found))}")
                                        
                            except Exception as e:
                                print(f"     - Could not analyze JS bundle {js_url}: {str(e)[:50]}...")
                        
                        # Look for menu/navigation data in JSON-LD or data attributes
                        for script in soup.find_all('script', type='application/json'):
                            try:
                                import json
                                data = json.loads(script.string or '{}')
                                # Look for navigation items in JSON data
                                def extract_urls_from_json(obj, prefix=''):
                                    urls = set()
                                    if isinstance(obj, dict):
                                        for key, value in obj.items():
                                            if key in ['href', 'url', 'link', 'path'] and isinstance(value, str):
                                                if value.startswith('/'):
                                                    full_url = f"{base_url.rstrip('/')}{value}"
                                                    if is_valid_internal_url(full_url, base_netloc):
                                                        urls.add(normalize_url(full_url))
                                            elif isinstance(value, (dict, list)):
                                                urls.update(extract_urls_from_json(value, prefix))
                                    elif isinstance(obj, list):
                                        for item in obj:
                                            if isinstance(item, (dict, list)):
                                                urls.update(extract_urls_from_json(item, prefix))
                                    return urls
                                
                                json_urls = extract_urls_from_json(data)
                                page_links.update(json_urls)
                            except Exception:
                                pass
                        
                        # Add new URLs to visit in next depth level
                        for new_url in page_links:
                            if new_url not in visited_urls and depth < MAX_CRAWL_DEPTH:
                                urls_to_visit.add(new_url)
                        
                        print(f"     - Found {len(page_links)} links on {urlparse(url).path or '/'}")
                        
                    else:
                        print(f"     - HTTP {response.status_code} for {urlparse(url).path or '/'}")
                        
                except Exception as e:
                    print(f"     - Error crawling {urlparse(url).path or '/'}: {str(e)[:50]}...")
    
    print(f"   - Deep crawling complete: {len(found_urls)} unique URLs discovered")
    return found_urls

async def run_enhanced_scraper_httpx(base_url: str, limit_override: int | None, model_index: int):
    """Simplified scraper using httpx only (cPanel compatible)."""
    load_dotenv()
    
    if not base_url.startswith(('http://', 'https://')):
        base_url = 'https://' + base_url
    
    print(f"🚀 [START] httpx-only scraping (cPanel mode): {base_url}")
    domain = urlparse(base_url).netloc.replace('www.', '')
    site_output_dir = OUTPUT_DIR / domain
    
    # Create directory structure
    pages_dir = site_output_dir / "pages"
    assets_dir = site_output_dir / "assets"
    screenshots_dir = site_output_dir / "screenshots"
    
    for d in [pages_dir, assets_dir / "images", assets_dir / "css", screenshots_dir]:
        d.mkdir(parents=True, exist_ok=True)
    
    manifest = {"site_name": domain, "site_url": base_url, "pages": []}
    downloaded_urls = set()
    
    # Phase 1: Sitemap discovery
    sitemap_urls = await fetch_sitemap_urls(base_url)
    
    # Phase 2: Deep crawling with httpx (links of links)
    discovered_urls = await crawl_site_manually_httpx(base_url)
    
    # Combine all discovered URLs
    all_urls = sitemap_urls.union(discovered_urls)
    if not all_urls:
        # Fallback to just the homepage if no links found
        all_urls = {base_url}
    
    print(f"📊 Total unique URLs discovered: {len(all_urls)}")
    
    # Phase 3: AI filtering for redesign relevance
    usage_data = {"cost": 0, "tokens": 0}
    try:
        print(f"🧠 Phase 3: AI filtering {len(all_urls)} URLs for redesign relevance...")
        ai_plan, ai_usage = await ai_filter_urls_for_redesign(all_urls, [], site_output_dir, model_index)
        usage_data["cost"] += ai_usage.get("cost", 0)
        usage_data["tokens"] += ai_usage.get("tokens", 0)
        
        limit = limit_override if limit_override is not None else ai_plan.get("page_limit", 10)
        urls_to_process = ai_plan.get("included_pages", [])[:limit]
        print(f"   - AI Usage: {ai_usage.get('tokens', 0)} tokens, ${ai_usage.get('cost', 0):.6f}")
        print(f"✅ AI selected {len(urls_to_process)} pages for redesign analysis")
    except Exception as e:
        print(f"❌ AI filtering failed: {e}. Using top discovered URLs.")
        urls_to_process = list(all_urls)[:limit_override or 10]
    
    if not urls_to_process:
        urls_to_process = [base_url]
    
    print(f"📥 Processing {len(urls_to_process)} pages with httpx...")
    
    async with httpx.AsyncClient(
        timeout=30.0,
        headers={'User-Agent': USER_AGENT},
        follow_redirects=True
    ) as client:
        
        with tqdm(total=len(urls_to_process), desc=f"[{domain}] Pages") as pbar:
            for url in urls_to_process:
                try:
                    pbar.write(f"  -> Scraping: {url}")
                    
                    response = await client.get(url)
                    
                    if response.status_code >= 400:
                        pbar.write(f"     ⚠️  HTTP {response.status_code} error")
                        continue
                    
                    html_content = response.text
                    soup = BeautifulSoup(html_content, 'html.parser')
                    
                    # Basic asset download
                    images_on_page = set()
                    for img in soup.find_all(['img']):
                        src = img.get('src')
                        if src and not src.startswith('data:'):
                            try:
                                absolute_url = urljoin(url, src)
                                asset_response = await client.get(absolute_url)
                                if asset_response.status_code == 200:
                                    filename = Path(urlparse(src).path).name
                                    if filename and filename.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')):
                                        (assets_dir / "images" / filename).write_bytes(asset_response.content)
                                        images_on_page.add(filename)
                                        img['src'] = f'../assets/images/{filename}'
                            except Exception:
                                pass
                    
                    # Save HTML
                    page_filename = sanitize_filename(url)
                    (pages_dir / page_filename).write_text(soup.prettify(), encoding='utf-8')
                    
                    # Create placeholder screenshot
                    screenshot_filename = f"{Path(page_filename).stem}.png"
                    screenshot_path = screenshots_dir / screenshot_filename
                    
                    # Write a simple placeholder image (Pillow is optional; fall back to an empty file)
                    try:
                        from PIL import Image, ImageDraw
                        img = Image.new('RGB', (1200, 800), color='white')
                        draw = ImageDraw.Draw(img)
                        draw.text((50, 50), f"Screenshot placeholder\n{url}", fill='black')
                        img.save(screenshot_path)
                    except Exception:
                        screenshot_path.touch()
                    
                    # Add to manifest
                    page_info = {
                        "url": url,
                        "final_url": str(response.url),
                        "local_path": f"pages/{page_filename}",
                        "screenshot": f"screenshots/{screenshot_filename}",
                        "title": soup.title.string if soup.title else "",
                        "images": sorted(list(images_on_page)),
                        "is_iframe_content": False,
                        "status_code": response.status_code,
                        "scrape_method": "httpx_simple"
                    }
                    
                    manifest["pages"].append(page_info)
                    
                except Exception as e:
                    pbar.write(f"❌ Failed: {url} - {str(e)[:100]}...")
                
                pbar.update(1)
    
    # Save manifest if we got any pages
    if manifest["pages"]:
        manifest['assets'] = {
            'images': sorted([p.name for p in (assets_dir / "images").iterdir()]),
            'css': []
        }
        manifest['scrape_date'] = datetime.datetime.now().isoformat()
        manifest['total_urls_discovered'] = len(all_urls)
        manifest['iframe_urls_included'] = 0
        manifest['ai_selection_used'] = usage_data['cost'] > 0
        
        (site_output_dir / "manifest.json").write_text(
            json.dumps(manifest, indent=4), encoding='utf-8'
        )
        
        print(f"✅ [DONE] httpx scraping complete: {site_output_dir}")
        print(f"📊 Scraped {len(manifest['pages'])} pages")
        print(f"   💰 Step cost: ${usage_data['cost']:.6f} ({usage_data['tokens']} tokens)")
    else:
        print("❌ No pages were successfully scraped")
        raise RuntimeError("No pages scraped")

async def run_enhanced_scraper_playwright(base_url: str, limit_override: int | None, model_index: int):
    """Enhanced scraper with iframe URL discovery and smart AI filtering (Playwright version)."""
    load_dotenv()
    
    if not base_url.startswith(('http://', 'https://')):
        base_url = 'https://' + base_url
    
    print(f"🚀 [START] Enhanced scraping with iframe URL discovery: {base_url}")
    domain = urlparse(base_url).netloc.replace('www.', '')
    site_output_dir = OUTPUT_DIR / domain
    
    # Create directory structure
    pages_dir = site_output_dir / "pages"
    assets_dir = site_output_dir / "assets"
    screenshots_dir = site_output_dir / "screenshots"
    
    for d in [pages_dir, assets_dir / "images", assets_dir / "css", screenshots_dir]:
        d.mkdir(parents=True, exist_ok=True)
    
    manifest = {"site_name": domain, "site_url": base_url, "pages": []}
    downloaded_urls = set()
    usage_data = {"cost": 0, "tokens": 0}
    
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(user_agent=USER_AGENT, viewport=VIEWPORT_SIZE)
        
        # Phase 1: Sitemap discovery
        sitemap_urls = await fetch_sitemap_urls(base_url)
        
        # Phase 2: Manual crawling with iframe discovery
        crawled_urls, iframe_discoveries = await crawl_site_manually(base_url, context)
        
        # Combine all discovered URLs
        all_urls = sitemap_urls.union(crawled_urls)
        base_netloc = urlparse(base_url).netloc
        
        # Filter to only valid URLs (but keep iframe URLs even if external)
        iframe_urls = {normalize_url(discovery["src"]) for discovery in iframe_discoveries if discovery["classification"]["should_scrape"]}
        
        valid_urls = {normalize_url(url) for url in all_urls 
                     if is_valid_internal_url(url, base_netloc) or url in iframe_urls}
        
        print(f"📊 Total unique URLs discovered: {len(valid_urls)}")
        print(f"    - Including {len(iframe_urls)} iframe content URLs")
        
        # Special handling for known SPA sites like fullgen.ai
        if 'fullgen.ai' in base_url:
            print(f"   🎯 Detected SPA site (fullgen.ai) - adding known navigation pages")
            spa_pages = [
                f"{base_url.rstrip('/')}/",
                f"{base_url.rstrip('/')}/about",
                f"{base_url.rstrip('/')}/services", 
                f"{base_url.rstrip('/')}/contact"
            ]
            valid_urls.update(spa_pages)
            print(f"   📄 Added SPA pages: {len(spa_pages)} pages")
        
        # Phase 3: AI filtering for redesign relevance
        try:
            ai_plan, ai_usage = await ai_filter_urls_for_redesign(valid_urls, iframe_discoveries, site_output_dir, model_index)
            usage_data["cost"] += ai_usage.get("cost", 0)
            usage_data["tokens"] += ai_usage.get("tokens", 0)
            
            limit = limit_override if limit_override is not None else ai_plan.get("page_limit", 25)
            urls_to_process = ai_plan.get("included_pages", [])[:limit]
        except Exception as e:
            print(f"❌ AI filtering failed: {e}. Using top discovered URLs.")
            urls_to_process = list(valid_urls)[:25]
        
        if not urls_to_process:
            urls_to_process = [base_url]
        
        print(f"\n📥 Phase 4: Processing {len(urls_to_process)} selected pages...")
        
        # Process selected pages
        with tqdm(total=len(urls_to_process), desc=f"[{domain}] Pages") as pbar:
            for url in urls_to_process:
                page = None
                try:
                    # Determine if this is an iframe URL
                    is_iframe_url = url in iframe_urls
                    iframe_context = None
                    
                    if is_iframe_url:
                        # Find the iframe context
                        for discovery in iframe_discoveries:
                            if discovery["src"] == url:
                                iframe_context = discovery
                                break
                    
                    pbar.write(f"  -> Scraping: {url[:100]}{'...' if len(url) > 100 else ''}")
                    if iframe_context:
                        pbar.write(f"     📱 Iframe content: {iframe_context['classification']['reason']} from {iframe_context['found_on_page']}")
                    
                    page = await context.new_page()
                    
                    # Special handling for iframe URLs
                    if is_iframe_url and iframe_context:
                        parent_page_url = iframe_context['found_on_page']
                        pbar.write(f"     🎯 Setting iframe-specific headers and referrer...")
                        
                        # Set headers that mimic iframe embedding
                        await page.set_extra_http_headers({
                            'Referer': parent_page_url,
                            'Sec-Fetch-Dest': 'iframe',
                            'Sec-Fetch-Mode': 'navigate',
                            'Sec-Fetch-Site': 'cross-site',
                            'X-Requested-With': 'iframe'
                        })
                    
                    # Navigate to the page with retry logic
                    async def navigate_to_page():
                        return await page.goto(url, wait_until="domcontentloaded", timeout=PAGE_LOAD_TIMEOUT)
                    
                    response = await retry_with_backoff(navigate_to_page)
                    
                    # Check if we got redirected or got an error
                    if response:
                        final_url = page.url
                        status_code = response.status
                        
                        if final_url != url:
                            pbar.write(f"     🔄 Redirected from {url[:80]}... to {final_url[:80]}...")
                        
                        if status_code >= 400:
                            pbar.write(f"     ⚠️  HTTP {status_code} error for {url}")
                            continue
                        
                        # For iframe URLs, log what we actually got
                        if is_iframe_url:
                            page_title = await page.title()
                            page_content_sample = await page.evaluate('() => document.body ? document.body.innerText.substring(0, 200) : "No body content"')
                            pbar.write(f"     📄 Page title: '{page_title}'")
                            pbar.write(f"     📝 Content preview: {page_content_sample[:100]}...")
                            
                            # Check if this looks like the expected content
                            if iframe_context['classification']['reason'] == 'menu':
                                # Look for menu-like content
                                has_menu_content = await page.evaluate('''() => {
                                    const text = document.body ? document.body.innerText.toLowerCase() : '';
                                    const menuKeywords = ['menu', 'food', 'price', 'appetizer', 'entree', 'dessert', 'drink', 'beverage', '$'];
                                    return menuKeywords.some(keyword => text.includes(keyword));
                                }''')
                                
                                if not has_menu_content:
                                    pbar.write(f"     ⚠️  Warning: Expected menu content but didn't find menu keywords")
                    
                    await page.wait_for_timeout(RENDER_DELAY)
                    
                    # Download assets and update HTML
                    final_soup, images_on_page = await discover_and_download_assets(page, url, assets_dir, downloaded_urls)
                    
                    # Save HTML
                    page_filename = sanitize_filename(url)
                    (pages_dir / page_filename).write_text(str(final_soup.prettify()), encoding='utf-8')
                    
                    # Take screenshot
                    screenshot_filename = f"{Path(page_filename).stem}.png"
                    await page.screenshot(path=screenshots_dir / screenshot_filename, full_page=True)
                    
                    # Add to manifest
                    page_info = {
                        "url": url,
                        "final_url": page.url if response else url,  # Track redirects
                        "local_path": f"pages/{page_filename}",
                        "screenshot": f"screenshots/{screenshot_filename}",
                        "title": final_soup.title.string if final_soup.title else "",
                        "images": sorted(list(images_on_page)),
                        "is_iframe_content": is_iframe_url,
                        "status_code": response.status if response else None
                    }
                    
                    if iframe_context:
                        page_info["iframe_context"] = {
                            "found_on_page": iframe_context["found_on_page"],
                            "content_type": iframe_context["classification"]["reason"],
                            "title": iframe_context["title"],
                            "aria_label": iframe_context["aria_label"]
                        }
                    
                    manifest["pages"].append(page_info)
                    
                except Exception as e:
                    pbar.write(f"\n❌ [{domain}] Failed: {url} - {str(e)[:150]}...")
                    
                    # For iframe URLs, try an alternative approach
                    if is_iframe_url and iframe_context:
                        pbar.write(f"     🔄 Trying alternative approach for iframe URL...")
                        try:
                            # Try with different user agent or headers
                            alt_page = await context.new_page()
                            await alt_page.set_extra_http_headers({
                                'Referer': iframe_context['found_on_page'],
                                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
                            })
                            
                            alt_response = await alt_page.goto(url, wait_until="networkidle", timeout=PAGE_LOAD_TIMEOUT)
                            if alt_response and alt_response.status < 400:
                                pbar.write(f"     ✅ Alternative approach worked!")
                                
                                # Process this page
                                await alt_page.wait_for_timeout(RENDER_DELAY)
                                final_soup, images_on_page = await discover_and_download_assets(alt_page, url, assets_dir, downloaded_urls)
                                
                                page_filename = sanitize_filename(url)
                                (pages_dir / page_filename).write_text(str(final_soup.prettify()), encoding='utf-8')
                                
                                screenshot_filename = f"{Path(page_filename).stem}.png"
                                await alt_page.screenshot(path=screenshots_dir / screenshot_filename, full_page=True)
                                
                                page_info = {
                                    "url": url,
                                    "final_url": alt_page.url,
                                    "local_path": f"pages/{page_filename}",
                                    "screenshot": f"screenshots/{screenshot_filename}",
                                    "title": final_soup.title.string if final_soup.title else "",
                                    "images": sorted(list(images_on_page)),
                                    "is_iframe_content": True,
                                    "status_code": alt_response.status,
                                    "iframe_context": {
                                        "found_on_page": iframe_context["found_on_page"],
                                        "content_type": iframe_context["classification"]["reason"],
                                        "title": iframe_context["title"],
                                        "aria_label": iframe_context["aria_label"]
                                    }
                                }
                                manifest["pages"].append(page_info)
                                
                        except Exception as alt_e:
                            pbar.write(f"     ❌ Alternative approach also failed: {str(alt_e)[:100]}...")
                        finally:
                            # Always release the alternative page, even if navigation raised
                            if alt_page:
                                await alt_page.close()
                finally:
                    if page:
                        await page.close()
                    # Advance the bar here so pages skipped via `continue` (e.g. HTTP >= 400) are still counted
                    pbar.update(1)
        
        # Only save manifest if we successfully scraped at least one page
        if manifest["pages"]:
            # Finalize manifest
            manifest['assets'] = {
                'images': sorted(p.name for p in (assets_dir / "images").iterdir()) if (assets_dir / "images").is_dir() else [],
                'css': sorted(p.name for p in (assets_dir / "css").iterdir()) if (assets_dir / "css").is_dir() else []
            }
            manifest['scrape_date'] = datetime.datetime.now().isoformat()
            manifest['total_urls_discovered'] = len(all_urls)
            manifest['iframe_urls_included'] = len([p for p in manifest["pages"] if p.get("is_iframe_content")])
            manifest['ai_selection_used'] = ai_selection_used
            
            # Save manifest
            (site_output_dir / "manifest.json").write_text(
                json.dumps(manifest, indent=4), encoding='utf-8'
            )
        else:
            print("❌ No pages were successfully scraped")
            print("🔄 Attempting alternative scraping approaches...")
            
            # Try alternative approaches for stubborn sites
            alternative_success = False
            
            # Approach 1: Try with different browser settings
            try:
                print("   🔄 Trying with relaxed browser settings...")
                alt_context = await browser.new_context(
                    user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                    viewport={"width": 1280, "height": 720},
                    ignore_https_errors=True,
                    java_script_enabled=True
                )
                
                alt_page = await alt_context.new_page()
                
                # Try a very simple navigation with longer timeout
                try:
                    await alt_page.goto(base_url, wait_until="load", timeout=90000)
                    await alt_page.wait_for_timeout(5000)
                    
                    # If we get here, try to extract basic content
                    title = await alt_page.title()
                    content = await alt_page.content()
                    
                    if title and len(content) > 1000:
                        print(f"   ✅ Alternative approach successful! Got page with title: '{title}'")
                        
                        # Save basic content
                        soup = BeautifulSoup(content, 'html.parser')
                        (pages_dir / "index.html").write_text(str(soup.prettify()), encoding='utf-8')
                        
                        # Take screenshot
                        await alt_page.screenshot(path=screenshots_dir / "index.png", full_page=True)
                        
                        # Create minimal manifest
                        manifest["pages"] = [{
                            "url": base_url,
                            "final_url": alt_page.url,
                            "local_path": "pages/index.html",
                            "screenshot": "screenshots/index.png",
                            "title": title,
                            "images": [],
                            "is_iframe_content": False,
                            "status_code": 200,
                            "scrape_method": "alternative_browser_settings"
                        }]
                        
                        manifest['assets'] = {'images': [], 'css': []}
                        manifest['scrape_date'] = datetime.datetime.now().isoformat()
                        manifest['total_urls_discovered'] = 1
                        manifest['iframe_urls_included'] = 0
                        manifest['ai_selection_used'] = False
                        manifest['scrape_notes'] = "Used alternative browser settings due to connection issues"
                        
                        (site_output_dir / "manifest.json").write_text(
                            json.dumps(manifest, indent=4), encoding='utf-8'
                        )
                        
                        alternative_success = True
                        
                except Exception as alt_e:
                    print(f"   ❌ Alternative browser settings failed: {str(alt_e)[:100]}...")
                
                await alt_context.close()
                
            except Exception as e:
                print(f"   ❌ Alternative approach failed: {str(e)[:100]}...")
            
            if not alternative_success:
                print("❌ All scraping approaches failed - not saving manifest.json")
                # Clean up empty directories if no content was scraped (shutil is imported at module level)
                if site_output_dir.exists():
                    shutil.rmtree(site_output_dir)
                raise Exception("No pages were successfully scraped")
        
        await browser.close()
    
    print(f"✅ [DONE] Enhanced scraping complete: {site_output_dir}")
    print(json.dumps(usage_data))

async def run_enhanced_scraper(base_url: str, limit_override: int | None, model_index: int):
    """Main scraper function that chooses the appropriate method based on environment."""
    if PLAYWRIGHT_AVAILABLE and not IS_CPANEL:
        print("🎭 Using Playwright mode (full features)")
        await run_enhanced_scraper_playwright(base_url, limit_override, model_index)
    else:
        print("🌐 Using httpx mode (cPanel compatible)")
        await run_enhanced_scraper_httpx(base_url, limit_override, model_index)

if __name__ == "__main__":
    model_options_text = ', '.join([f"{i}='{config['name']}'" for i, config in enumerate(MODELS_CONFIG)])
    
    parser = argparse.ArgumentParser(description="Enhanced website scraper with iframe URL discovery and AI-powered page selection.")
    parser.add_argument("url", type=str, help="Base URL to scrape")
    parser.add_argument("--limit", type=int, help="Override AI-suggested page limit")
    parser.add_argument("--planner-model-index", type=int, default=DEFAULT_PLANNER_MODEL_INDEX, 
                       choices=range(len(MODELS_CONFIG)), 
                       help=f"AI model for planning ({model_options_text})")
    
    args = parser.parse_args()
    
    if not args.url:
        print("❌ URL required")
        parser.print_help()
    else:
        asyncio.run(run_enhanced_scraper(args.url, args.limit, args.planner_model_index))