#!/usr/bin/env python3
"""
Draft Polishing Tool

Polish draft chapters with writing style prompts while preserving citations.
Designed for Claude Code to enhance generated drafts without losing source references.

OPT-IN GENERATION MODEL:
    This tool follows the "Opt-in Generation" architecture:

    DEFAULT MODE (suggestions):
        Returns editing suggestions for Claude Code to apply.
        Claude Code maintains voice consistency and authorship.

    DELEGATED MODE (--use-internal-model):
        Uses the internal LLM (configured in project.yaml) to rewrite text.
        Use this for batch processing or when explicitly delegating to the CLI.

Usage:
    # Get suggestions for Claude Code to apply (DEFAULT)
    python polish_draft.py chapter_01.md --style "academic but accessible"

    # Have the CLI polish using internal model (OPT-IN)
    python polish_draft.py chapter_01.md --style "academic" --use-internal-model

    # Use a style prompt file
    python polish_draft.py chapter_01.md --prompt prompts/narrative_style.txt

    # Output to specific file (with internal model)
    python polish_draft.py chapter_01.md --style "engaging" --use-internal-model --output chapter_01_polished.md

    # Validate after polishing (recommended)
    python polish_draft.py chapter_01.md --style "academic" --use-internal-model --validate

Output:
    Default: JSON/text suggestions for Claude Code to apply
    With --use-internal-model: Polished chapter with preserved citations and quoted material
"""

import argparse
import json
import logging
import re
import sys
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass, field, asdict

# Setup path for imports
sys.path.insert(0, str(Path(__file__).parent))

from config import (
    LOGGING_CONFIG, OPENAI_ENABLED,
    GENERATION_CONFIG, get_generation_model, is_cli_generation_enabled
)

# Setup logging
# LOG_LEVEL keeps the raw configured value. The numeric level handed to
# basicConfig is resolved separately: a plain getattr(logging, LOG_LEVEL)
# returns the *function* logging.debug for a lowercase "debug" and raises for
# an integer level, either of which would break logging at import time.
LOG_LEVEL = LOGGING_CONFIG.get('level', 'INFO')
if isinstance(LOG_LEVEL, int) and not isinstance(LOG_LEVEL, bool):
    _log_level = LOG_LEVEL
else:
    _log_level = getattr(logging, str(LOG_LEVEL).upper(), logging.INFO)
    if not isinstance(_log_level, int):
        # The name resolved to something that is not a level constant
        _log_level = logging.INFO
logging.basicConfig(
    level=_log_level,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


# =============================================================================
# DATA STRUCTURES
# =============================================================================

@dataclass
class PreservedElement:
    """A span of the draft that must come through polishing unchanged.

    The span is swapped for ``placeholder`` before the text is polished and
    swapped back to ``original`` afterwards.
    """
    # Unique token that stands in for the span while the text is polished
    placeholder: str
    # Exact text of the span, used when restoring
    original: str
    # Kind of span: citation, quote, footnote, reference
    element_type: str
    # Offset in the original text where the span begins
    start_pos: int
    # Offset in the original text where the span ends (exclusive)
    end_pos: int


@dataclass
class PolishResult:
    """Outcome of one polishing run: the rewritten text plus bookkeeping."""
    # Path of the source file, or None when polishing raw text
    original_path: Optional[str]
    # Final text with preserved elements restored
    polished_text: str
    # Style prompt that was applied
    style_used: str
    # Number of preserved elements
    preserved_count: int
    # Whitespace-separated word counts before and after polishing
    word_count_before: int
    word_count_after: int
    # How many sections were actually rewritten
    sections_polished: int
    # Filled in only when post-polish validation is requested
    validation_passed: Optional[bool] = None
    validation_issues: List[str] = field(default_factory=list)


@dataclass
class SuggestionResult:
    """Outcome of a suggestions-only analysis; the draft itself is not rewritten."""
    # Path of the analyzed file, or None when analyzing raw text
    original_path: Optional[str]
    # One dict per editing suggestion
    suggestions: List[Dict[str, Any]]
    # Style prompt the suggestions aim for
    style_target: str
    # Whitespace-separated word count of the analyzed text
    word_count: int
    # Number of sections the text was split into
    section_count: int
    # Number of citation/quote/footnote spans found in the text
    preserved_elements: int
    # One-line human-readable digest of the suggestions
    summary: str


# =============================================================================
# LLM INTERFACE
# =============================================================================

class PolishingLLM:
    """LLM interface for draft polishing.

    Wraps an OpenAI chat client. ``client`` stays ``None`` when OpenAI is
    disabled in config or the client cannot be constructed; use
    :meth:`available` to check before calling the other methods.
    """

    # Leading "1." / "2)" / "*" / "-" marker of a suggestion line in a free-text reply
    _BULLET_PREFIX = re.compile(r'^(\d+[\.\)]\s*|\*\s*|-\s*)')

    # Checked in order; the first category with a keyword in the suggestion wins
    _CATEGORY_KEYWORDS = (
        ('structure', ('sentence', 'structure', 'flow')),
        ('vocabulary', ('word', 'vocabulary', 'term')),
        ('clarity', ('clarity', 'clear', 'ambiguous')),
        ('style', ('tone', 'voice', 'style')),
    )

    def __init__(self):
        self.client = None
        if OPENAI_ENABLED:
            try:
                from openai import OpenAI
                from config import OPENAI_API_KEY
                self.client = OpenAI(api_key=OPENAI_API_KEY)
            except Exception as e:
                logger.warning(f"OpenAI not available: {e}")

    def available(self) -> bool:
        """Return True when a client was created and requests can be made."""
        return self.client is not None

    def polish(self, text: str, style: str, temperature: Optional[float] = None) -> str:
        """Polish text with given style.

        Args:
            text: Text to rewrite (normally with placeholders already in place).
            style: Style guidelines inserted into the system prompt.
            temperature: Sampling temperature; falls back to
                ``GENERATION_CONFIG['temperature']`` (0.7 if unset).

        Returns:
            The rewritten text.

        Raises:
            RuntimeError: If no client is available, or the model returned no
                text. Raising on an empty reply keeps callers from writing
                ``None`` or a blank string over the section.
        """
        if not self.client:
            raise RuntimeError("LLM not available - OpenAI API key required")

        if temperature is None:
            temperature = GENERATION_CONFIG.get('temperature', 0.7)

        model = get_generation_model('primary')

        messages = [
            {"role": "system", "content": self._get_system_prompt(style)},
            {"role": "user", "content": self._get_user_prompt(text)}
        ]

        response = self.client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=4096,
        )
        content = response.choices[0].message.content
        if not content or not content.strip():
            raise RuntimeError("LLM returned an empty response")
        return content

    def analyze_for_suggestions(self, text: str, style: str) -> List[Dict[str, Any]]:
        """Analyze text and return editing suggestions without rewriting.

        Raises:
            RuntimeError: If no client is available.
        """
        if not self.client:
            raise RuntimeError("LLM not available - OpenAI API key required")

        model = get_generation_model('utility')  # Use utility model for analysis

        messages = [
            {"role": "system", "content": self._get_suggestion_system_prompt(style)},
            {"role": "user", "content": f"Analyze this text and provide editing suggestions:\n\n{text}"}
        ]

        response = self.client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0.3,
            max_tokens=2048,
        )

        # Parse response into structured suggestions
        return self._parse_suggestions(response.choices[0].message.content)

    def _parse_suggestions(self, response_text: Optional[str]) -> List[Dict[str, Any]]:
        """Parse LLM response into structured suggestions.

        Tries JSON first (a fenced ```json block, or a bare array), then falls
        back to reading numbered/bulleted lines. Always returns a list of
        dicts; an empty or missing reply yields ``[]``.
        """
        if not response_text or not response_text.strip():
            return []

        # Try to parse as JSON first
        try:
            parsed = None
            if '```json' in response_text:
                json_str = response_text.split('```json')[1].split('```')[0].strip()
                parsed = json.loads(json_str)
            elif response_text.strip().startswith('['):
                parsed = json.loads(response_text)
            if parsed is not None:
                coerced = self._coerce_suggestions(parsed)
                if coerced is not None:
                    return coerced
        except json.JSONDecodeError:
            pass

        # Fall back to line-by-line parsing
        suggestions = []
        current_suggestion = None

        for raw_line in response_text.strip().split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            # Look for numbered suggestions or bullet points
            if self._BULLET_PREFIX.match(line):
                if current_suggestion:
                    suggestions.append(current_suggestion)
                content = self._BULLET_PREFIX.sub('', line).strip()
                current_suggestion = {
                    'category': self._categorize(content),
                    'suggestion': content,
                    'location': None
                }
            elif current_suggestion:
                # Continuation line of the suggestion being collected
                current_suggestion['suggestion'] += ' ' + line

        if current_suggestion:
            suggestions.append(current_suggestion)

        return suggestions

    @staticmethod
    def _coerce_suggestions(parsed: Any) -> Optional[List[Dict[str, Any]]]:
        """Normalize decoded JSON into a list of suggestion dicts.

        Callers index each suggestion as a dict, so other shapes are adapted:
        a ``{"suggestions": [...]}`` wrapper is unwrapped, a lone dict becomes
        a one-item list, and bare strings become general suggestions. Returns
        ``None`` when the payload is not usable at all.
        """
        if isinstance(parsed, dict):
            nested = parsed.get('suggestions')
            if not isinstance(nested, list):
                return [parsed]
            parsed = nested
        if not isinstance(parsed, list):
            return None

        coerced = []
        for item in parsed:
            if isinstance(item, dict):
                coerced.append(item)
            elif isinstance(item, str) and item.strip():
                coerced.append({
                    'category': 'general',
                    'suggestion': item.strip(),
                    'location': None
                })
        return coerced

    @classmethod
    def _categorize(cls, content: str) -> str:
        """Guess a category from keywords in the suggestion text."""
        lowered = content.lower()
        for category, keywords in cls._CATEGORY_KEYWORDS:
            if any(word in lowered for word in keywords):
                return category
        return 'general'

    def _get_suggestion_system_prompt(self, style: str) -> str:
        """Build the system prompt for suggestions-only analysis."""
        return f"""You are an expert editor analyzing text for improvement opportunities.

Your task is to identify specific, actionable editing suggestions WITHOUT rewriting the text.

TARGET STYLE:
{style}

Provide 5-15 specific suggestions as a JSON array. Each suggestion should have:
- "category": one of "structure", "vocabulary", "clarity", "style", "flow", "grammar"
- "suggestion": specific actionable recommendation
- "location": quote a short phrase to locate (or null for general suggestions)
- "priority": "high", "medium", or "low"

Example output:
```json
[
  {{"category": "clarity", "suggestion": "Replace 'utilized' with 'used' for accessibility", "location": "the methods utilized", "priority": "low"}},
  {{"category": "structure", "suggestion": "Break this paragraph into two - separate the historical context from the analysis", "location": "The guild system...", "priority": "medium"}}
]
```

Focus on improvements that align with the target style while preserving the author's voice and all factual content."""

    def _get_system_prompt(self, style: str) -> str:
        """Build the system prompt for rewriting, including the placeholder rules."""
        return f"""You are a skilled editor polishing academic draft text.

Your task is to improve the writing style while preserving ALL factual content exactly.

STYLE GUIDELINES:
{style}

CRITICAL RULES:
1. NEVER change facts, dates, names, or claims
2. NEVER remove or alter text in [[PRESERVE:...]] markers - leave these EXACTLY as they appear
3. Keep all paragraph breaks and section structure
4. Improve clarity, flow, and engagement
5. Maintain scholarly tone unless instructed otherwise
6. Fix awkward phrasing and improve transitions
7. Output ONLY the polished text - no explanations or commentary"""

    def _get_user_prompt(self, text: str) -> str:
        """Wrap the text to polish in the user-message template."""
        return f"""Polish the following text according to the style guidelines.

Remember: [[PRESERVE:xxx]] markers must remain EXACTLY as written - do not modify them in any way.

TEXT TO POLISH:
{text}

POLISHED TEXT:"""


# =============================================================================
# CITATION & QUOTE PRESERVATION
# =============================================================================

# Patterns for elements to preserve
#
# identify_preserved_elements() applies every pattern below with
# re.MULTILINE | re.DOTALL and skips any match that overlaps a span already
# taken. Lists are processed citations -> quotes -> footnotes, and patterns
# within a list in the order written, so earlier entries win on overlap
# (e.g. "[DOC_001, p.23]" is claimed before the bare "DOC_001" inside it).
CITATION_PATTERNS = [
    r'\[DOC_\d+(?:,\s*p\.?\s*\d+(?:-\d+)?)?\]',  # [DOC_001, p.23]
    r'\(DOC_\d+(?:,\s*p\.?\s*\d+(?:-\d+)?)?\)',  # (DOC_001, p.23)
    r'DOC_\d+',                                    # DOC_001
    r'\[\d+\]',                                    # [1] (footnote numbers)
    r'\^\[\d+\]',                                  # ^[1] (markdown footnotes)
    r'\*\*\[.+?\]\*\*',                           # **[citation]**
]

# NOTE(review): the block-quote patterns are not anchored to the start of a
# line, so a '>' appearing mid-line can also begin a match - confirm intended.
QUOTE_PATTERNS = [
    r'>\s*"[^"]+"\s*\n',                          # Block quote with quotes
    r'>\s*[^\n]+\n(?:>\s*[^\n]+\n)*',             # Multi-line block quotes
    r'"[^"]{50,}"',                               # Long inline quotes (50+ chars)
    r'「[^」]+」',                                 # Japanese quotes
    r'«[^»]+»',                                   # French quotes
]

# Because re.DOTALL is in effect, '.+?' here may run across line breaks until
# the lookahead (blank line, a line starting with '[', or end of text) is met.
FOOTNOTE_PATTERNS = [
    r'\[\^.+?\]:\s*.+?(?=\n\n|\n\[|\Z)',          # [^1]: footnote definition
    r'---\s*\n(?:\[\^.+?\]:.+?\n)+',              # Footnote section
]


def identify_preserved_elements(text: str) -> List[PreservedElement]:
    """Locate every citation, quote and footnote span that polishing must not touch.

    Pattern groups are tried in priority order (citations, quotes, footnotes).
    A match is kept only if it does not overlap a span that was kept earlier.

    Returns:
        The kept spans, ordered from the end of the text to the start, each
        carrying a numbered ``[[PRESERVE:NNNN]]`` placeholder.
    """
    kept: List[PreservedElement] = []
    scan_flags = re.MULTILINE | re.DOTALL

    def collides(lo: int, hi: int) -> bool:
        # True when [lo, hi) intersects any span already kept
        return any(lo < prior.end_pos and hi > prior.start_pos for prior in kept)

    pattern_groups = (
        ('citation', CITATION_PATTERNS),   # highest priority
        ('quote', QUOTE_PATTERNS),
        ('footnote', FOOTNOTE_PATTERNS),
    )

    for kind, regexes in pattern_groups:
        for regex in regexes:
            for hit in re.finditer(regex, text, scan_flags):
                if collides(hit.start(), hit.end()):
                    continue
                # Placeholders are numbered in discovery order, starting at 1
                kept.append(PreservedElement(
                    placeholder=f"[[PRESERVE:{len(kept) + 1:04d}]]",
                    original=hit.group(0),
                    element_type=kind,
                    start_pos=hit.start(),
                    end_pos=hit.end()
                ))

    # Last span first, so replacing by offset never shifts spans still pending
    kept.sort(key=lambda item: item.start_pos, reverse=True)
    return kept


def replace_with_placeholders(text: str, elements: List[PreservedElement]) -> str:
    """Replace preserved elements with placeholders.

    Args:
        text: The original text the element offsets refer to.
        elements: Non-overlapping spans to mask, in any order.

    Returns:
        ``text`` with each span swapped for its placeholder.
    """
    result = text
    # Work from the last span to the first so earlier offsets stay valid.
    # Sorting here (rather than trusting the caller's order) avoids silently
    # corrupting the text when elements arrive in another order.
    for elem in sorted(elements, key=lambda e: e.start_pos, reverse=True):
        result = result[:elem.start_pos] + elem.placeholder + result[elem.end_pos:]
    return result


def restore_preserved_elements(text: str, elements: List[PreservedElement]) -> str:
    """Swap every placeholder in ``text`` back to the original span it stands for.

    Placeholders that no longer appear in ``text`` are simply skipped.
    """
    restored = text
    for token, source in ((item.placeholder, item.original) for item in elements):
        restored = restored.replace(token, source)
    return restored


def verify_preservation(original: str, polished: str, elements: List[PreservedElement]) -> List[str]:
    """Verify that all preserved elements are intact in polished text.

    An element is reported when its text is absent from ``polished`` or occurs
    there fewer times than in ``original``. The count comparison catches a
    repeated citation (e.g. ``[1]`` used twice) losing one of its occurrences,
    which a plain substring check cannot see.

    Args:
        original: Text before polishing.
        polished: Text after polishing, with placeholders already restored.
        elements: The spans that were meant to be preserved.

    Returns:
        One message per distinct element text that is missing or reduced.
    """
    issues = []
    seen = set()

    for elem in elements:
        # Identical spans would otherwise be reported once per occurrence
        if elem.original in seen:
            continue
        seen.add(elem.original)

        kept = polished.count(elem.original)
        expected = original.count(elem.original)
        if kept == 0:
            issues.append(f"Missing {elem.element_type}: {elem.original[:50]}...")
        elif kept < expected:
            issues.append(
                f"Missing {elem.element_type}: {elem.original[:50]}... "
                f"({kept} of {expected} occurrences kept)"
            )

    return issues


# =============================================================================
# SECTION HANDLING
# =============================================================================

def split_into_sections(text: str) -> List[Tuple[str, str]]:
    """Break markdown text into ``(header, content)`` pairs.

    Two pseudo-headers are used: ``'__frontmatter__'`` for a leading
    ``---``-delimited block and ``'__intro__'`` for body text that is not
    preceded by a header. A leading whitespace-only chunk is dropped.

    Returns list of (header, content) tuples.
    """
    header_pattern = r'^(#{1,6}\s+.+)$'
    sections: List[Tuple[str, str]] = []
    body = text

    # Peel off front matter first so its contents are never scanned for headers
    if text.startswith('---'):
        front = re.match(r'^---\n.*?\n---\n', text, re.DOTALL)
        if front:
            sections.append(('__frontmatter__', front.group(0)))
            body = text[front.end():]

    # The capture group makes re.split keep the header lines in the result
    chunks = re.split(header_pattern, body, flags=re.MULTILINE)
    if not chunks[0].strip():
        chunks = chunks[1:]

    # Walk the chunks, pairing each header with the chunk that follows it
    position = 0
    total = len(chunks)
    while position < total:
        chunk = chunks[position]
        if not re.match(r'^#{1,6}\s+', chunk):
            # Text with no header in front of it
            sections.append(('__intro__', chunk))
            position += 1
            continue
        following = chunks[position + 1] if position + 1 < total else ''
        sections.append((chunk, following))
        position += 2

    return sections


def reassemble_sections(sections: List[Tuple[str, str]]) -> str:
    """Reassemble sections into full text.

    Line breaks are added only where they are missing. Content produced by
    ``split_into_sections`` still carries the newline that followed its header
    and its own trailing newline; unconditionally inserting more would add a
    blank line after every header and between every pair of sections each
    time an untouched section passes through. Rewritten content that arrives
    without those newlines still gets a single separator.

    Args:
        sections: ``(header, content)`` pairs; ``'__frontmatter__'`` and
            ``'__intro__'`` headers contribute their content only.

    Returns:
        The joined document text.
    """
    pieces = []

    for header, content in sections:
        if header in ('__frontmatter__', '__intro__'):
            piece = content
        elif not content or content.startswith('\n'):
            # Header at end of text, or content that kept its leading newline
            piece = header + content
        else:
            piece = f"{header}\n{content}"

        if not piece:
            continue
        # Make sure the next piece starts on its own line
        if pieces and not pieces[-1].endswith('\n'):
            pieces.append('\n')
        pieces.append(piece)

    return ''.join(pieces)


# =============================================================================
# MAIN POLISHING LOGIC
# =============================================================================

def analyze_text_for_suggestions(
    text: str,
    style: str,
) -> SuggestionResult:
    """
    Collect editing suggestions for a draft without changing the draft.

    This is the DEFAULT mode when --use-internal-model is NOT specified.
    Each section is analyzed separately; front matter and sections with fewer
    than 50 characters of content are skipped. A section whose analysis fails
    is logged and left out rather than aborting the run.

    Args:
        text: Draft text to analyze
        style: Style description or instructions

    Returns:
        SuggestionResult with suggestions for Claude Code to apply

    Raises:
        RuntimeError: If the LLM is not available
    """
    analyzer = PolishingLLM()
    if not analyzer.available():
        raise RuntimeError("LLM not available - OpenAI API key required for analysis")

    total_words = len(text.split())
    protected = identify_preserved_elements(text)
    sections = split_into_sections(text)

    # Gather suggestions section by section, tagging each with its origin
    collected = []
    for header, content in sections:
        too_short = len(content.strip()) < 50
        if header in ['__frontmatter__'] or too_short:
            continue

        section_name = 'Introduction' if header in ['__intro__'] else header
        try:
            found = analyzer.analyze_for_suggestions(content, style)
            for entry in found:
                entry['section'] = section_name
            collected.extend(found)
        except Exception as e:
            logger.warning(f"Failed to analyze section {section_name}: {e}")

    # Build the one-line digest: total, high-priority count, top 3 categories
    urgent = len([entry for entry in collected if entry.get('priority') == 'high'])
    tally = {}
    for entry in collected:
        label = entry.get('category', 'general')
        tally[label] = tally.get(label, 0) + 1

    digest = [f"Found {len(collected)} suggestions"]
    if urgent:
        digest.append(f"{urgent} high priority")
    if tally:
        top_three = sorted(tally.items(), key=lambda x: -x[1])[:3]
        digest.append("(" + ", ".join(f"{k}: {v}" for k, v in top_three) + ")")

    return SuggestionResult(
        original_path=None,
        suggestions=collected,
        style_target=style,
        word_count=total_words,
        section_count=len(sections),
        preserved_elements=len(protected),
        summary=" ".join(digest)
    )


def analyze_file_for_suggestions(
    input_path: Path,
    style: str,
) -> SuggestionResult:
    """Read a draft file (UTF-8) and return suggestions for it.

    The returned result has ``original_path`` set to the input path.
    """
    logger.info(f"Analyzing for suggestions: {input_path}")

    draft = Path(input_path).read_text(encoding='utf-8')

    outcome = analyze_text_for_suggestions(text=draft, style=style)
    outcome.original_path = str(input_path)
    return outcome


def polish_text(
    text: str,
    style: str,
    preserve_citations: bool = True,
    section_by_section: bool = True,
    validate_after: bool = False
) -> PolishResult:
    """
    Rewrite draft text in the requested style, keeping citations intact.

    Preserved spans are masked with placeholders before the text is sent to
    the LLM and restored afterwards. Any span that did not survive is logged
    as a warning.

    Args:
        text: Draft text to polish
        style: Style description or instructions
        preserve_citations: Whether to preserve citation markers
        section_by_section: Process each section separately (better for long docs)
        validate_after: Run validation after polishing

    Returns:
        PolishResult with polished text and metadata

    Raises:
        RuntimeError: If the LLM is not available
    """
    editor = PolishingLLM()
    if not editor.available():
        raise RuntimeError("LLM not available - OpenAI API key required for polishing")

    words_in = len(text.split())

    # Mask the spans that must not change
    protected = []
    if preserve_citations:
        protected = identify_preserved_elements(text)
        logger.info(f"Identified {len(protected)} elements to preserve")
    masked_text = replace_with_placeholders(text, protected)

    if not section_by_section:
        rewritten = editor.polish(masked_text, style)
        polished_total = 1
    else:
        polished_total = 0
        rebuilt = []
        for header, content in split_into_sections(masked_text):
            # Front matter and very short sections pass through untouched
            if header == '__frontmatter__' or len(content.strip()) < 50:
                rebuilt.append((header, content))
                continue

            label = header if header in ['__intro__', '__frontmatter__'] else header[:50]
            logger.info(f"Polishing section: {label}")
            try:
                new_content = editor.polish(content, style)
            except Exception as e:
                # Keep the original section when the rewrite fails
                logger.error(f"Failed to polish section: {e}")
                rebuilt.append((header, content))
            else:
                rebuilt.append((header, new_content))
                polished_total += 1

        rewritten = reassemble_sections(rebuilt)

    # Put the masked spans back and check none went missing
    final_text = restore_preserved_elements(rewritten, protected)
    problems = verify_preservation(text, final_text, protected)
    if problems:
        logger.warning(f"Preservation issues found: {len(problems)}")
        for problem in problems[:5]:
            logger.warning(f"  - {problem}")

    outcome = PolishResult(
        original_path=None,
        polished_text=final_text,
        style_used=style,
        preserved_count=len(protected),
        word_count_before=words_in,
        word_count_after=len(final_text.split()),
        sections_polished=polished_total,
    )

    # Optional validation
    if validate_after:
        passed, found_issues = run_validation(text, final_text)
        outcome.validation_passed = passed
        outcome.validation_issues = found_issues

    return outcome


def polish_file(
    input_path: Path,
    style: str,
    output_path: Optional[Path] = None,
    validate_after: bool = False
) -> PolishResult:
    """Polish a draft file and write the result to disk.

    Args:
        input_path: Draft to read (UTF-8).
        style: Style description or instructions.
        output_path: Where to write; defaults to ``<stem>_polished.md`` next
            to the input file.
        validate_after: Run validation after polishing.

    Returns:
        The PolishResult, with ``original_path`` set to the input path.
    """
    logger.info(f"Polishing: {input_path}")

    draft = Path(input_path).read_text(encoding='utf-8')

    outcome = polish_text(
        text=draft,
        style=style,
        preserve_citations=True,
        section_by_section=True,
        validate_after=validate_after
    )
    outcome.original_path = str(input_path)

    # Fall back to a sibling file named after the input
    destination = output_path
    if destination is None:
        destination = input_path.parent / f"{input_path.stem}_polished.md"

    Path(destination).write_text(outcome.polished_text, encoding='utf-8')

    logger.info(f"Written to: {destination}")
    return outcome


def run_validation(original: str, polished: str) -> Tuple[bool, List[str]]:
    """Compare claims extracted from both versions to spot content drift.

    Two checks are made: the polished text must keep at least 80% of the
    original claim count, and no more than 20% of the original claims
    (compared by lowercased 50-character prefix) may be absent afterwards.

    Returns:
        ``(passed, issues)``. ``passed`` is ``None`` when validation could not
        run (``validate_draft`` missing, or an error during extraction).
    """
    try:
        from validate_draft import extract_claims, ValidationLLM

        before = extract_claims(original, use_llm=True)
        after = extract_claims(polished, use_llm=True)

        problems = []

        # Claim count check
        if len(after) < len(before) * 0.8:
            problems.append(f"Significant claim reduction: {len(before)} → {len(after)}")

        # Simplified check for claims that no longer appear
        before_keys = {claim.text.lower()[:50] for claim in before}
        after_keys = {claim.text.lower()[:50] for claim in after}
        dropped = before_keys - after_keys
        if len(dropped) > len(before_keys) * 0.2:
            problems.append(f"Many claims may have been altered ({len(dropped)} differences)")

        return not problems, problems

    except ImportError:
        logger.warning("validate_draft not available for post-polish validation")
        return None, ["Validation skipped - validate_draft.py not available"]
    except Exception as e:
        logger.error(f"Validation failed: {e}")
        return None, [f"Validation error: {str(e)}"]


# =============================================================================
# STYLE ANALYSIS & PREDEFINED STYLES
# =============================================================================

def analyze_reference_style(reference_text: str) -> str:
    """
    Analyze a reference text sample and generate a style prompt for mimicry.

    Uses the LLM to describe the sample's writing style and wraps that
    description in instructions. Falls back to a heuristic analysis when the
    LLM is unavailable, the request fails, or the reply is empty.

    Args:
        reference_text: Sample of the writing to imitate; only the first 4000
            characters are sent to the LLM.

    Returns:
        A style prompt string.
    """
    llm = PolishingLLM()
    if not llm.available():
        # Fallback: basic heuristic analysis
        logger.warning("LLM not available, using heuristic style analysis")
        return _heuristic_style_analysis(reference_text)

    # Truncate reference if too long
    sample = reference_text[:4000]

    messages = [
        {"role": "system", "content": """You are an expert literary analyst.
Analyze the writing style of the given text sample and produce a detailed style guide
that could be used to mimic this author's voice and style.

Focus on:
1. Sentence structure (length, complexity, variety)
2. Vocabulary level and word choice patterns
3. Tone (formal/informal, serious/playful, etc.)
4. Paragraph structure and transitions
5. Use of figurative language, metaphors
6. How the author handles technical/complex topics
7. Distinctive stylistic quirks or patterns

Output a concise style guide (5-10 bullet points) that captures the essence of this voice."""},
        {"role": "user", "content": f"Analyze this writing sample and produce a style guide:\n\n{sample}"}
    ]

    try:
        response = llm.client.chat.completions.create(
            # Use the configured utility model, as the suggestion analysis does,
            # instead of a hard-coded model name that ignores project config
            model=get_generation_model('utility'),
            messages=messages,
            temperature=0.3,
        )
        style_analysis = response.choices[0].message.content
    except Exception as e:
        logger.error(f"Style analysis failed: {e}")
        return _heuristic_style_analysis(reference_text)

    if not style_analysis or not style_analysis.strip():
        # An empty reply would otherwise be formatted into the prompt as "None"
        logger.warning("Style analysis returned no content, using heuristic style analysis")
        return _heuristic_style_analysis(reference_text)

    # Format as style prompt
    return f"""Mimic the following writing voice and style:

{style_analysis}

Additional guidelines:
- Maintain the cadence and rhythm of the reference author
- Use similar sentence structures and paragraph lengths
- Match the vocabulary level and formality
- Preserve the distinctive voice while improving clarity
"""


def _heuristic_style_analysis(text: str) -> str:
    """Basic heuristic style analysis when LLM is unavailable.

    Measures average sentence and paragraph length in words and guesses the
    tone by comparing counts of formal connectives against informal markers.
    Blank fragments (from a trailing newline or runs of blank lines) are
    ignored so they do not drag the averages down.

    Args:
        text: Reference writing sample.

    Returns:
        A short style prompt built from the measurements.
    """
    sentences = [s for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
    avg_sentence_length = sum(len(s.split()) for s in sentences) / max(len(sentences), 1)

    paragraphs = [p for p in text.split('\n\n') if p.strip()]
    avg_para_length = sum(len(p.split()) for p in paragraphs) / max(len(paragraphs), 1)

    # Detect formality
    formal_markers = len(re.findall(r'\b(therefore|moreover|furthermore|consequently|thus)\b', text, re.I))
    informal_markers = len(re.findall(r'\b(you|we|I\'m|don\'t|can\'t|won\'t)\b', text, re.I))

    # Ties (including no markers at all) count as conversational
    formality = "formal" if formal_markers > informal_markers else "conversational"

    return f"""Write in this style:
- Average sentence length: {avg_sentence_length:.0f} words (match this rhythm)
- Average paragraph length: {avg_para_length:.0f} words
- Tone: {formality}
- Match the pacing and flow of the reference sample
- Use similar vocabulary complexity
"""


# Ready-made style prompts, keyed by preset name. The keys are offered as the
# valid values of the --preset CLI option, and the selected value is used as
# the style text in place of a free-form --style description.
PREDEFINED_STYLES = {
    'academic': """
Write in formal academic style:
- Use precise, scholarly language
- Maintain objective tone
- Include appropriate hedging for uncertain claims
- Use topic sentences and clear paragraph structure
- Prefer active voice where appropriate
""",

    'accessible': """
Write in academic but accessible style:
- Explain technical terms when first used
- Use concrete examples to illustrate abstract concepts
- Vary sentence length for readability
- Avoid unnecessary jargon
- Maintain scholarly rigor while being approachable
""",

    'narrative': """
Write in engaging narrative style:
- Use storytelling techniques where appropriate
- Create flow between paragraphs
- Vary rhythm and pacing
- Make historical figures come alive
- Balance narrative with scholarly content
""",

    'concise': """
Write concisely:
- Eliminate redundancy
- Prefer shorter sentences
- Remove filler words
- One idea per paragraph
- Get to the point quickly
""",

    'popular': """
Write for general readers:
- Assume no prior knowledge of the subject
- Define all technical terms
- Use analogies and comparisons to familiar concepts
- Engage the reader's curiosity
- Break up dense content with examples
""",

    'technical': """
Write in technical documentation style:
- Use precise terminology consistently
- Define terms on first use
- Structure content hierarchically
- Include clear examples and explanations
- Avoid ambiguity
""",

    'journalistic': """
Write in journalistic feature style:
- Lead with the most compelling angle
- Use concrete details and anecdotes
- Keep paragraphs short
- Balance exposition with narrative
- Engage readers with human interest elements
""",

    'literary': """
Write in literary nonfiction style:
- Employ rich, evocative language
- Use metaphor and imagery thoughtfully
- Create atmosphere and mood
- Balance craft with clarity
- Let ideas breathe through elegant prose
""",

    'textbook': """
Write in textbook style:
- Organize content with clear learning objectives
- Use numbered lists and bullet points for key concepts
- Include summary sections
- Define key terms in bold
- Progress from simple to complex
""",

    'conversational': """
Write in conversational academic style:
- Use first/second person where appropriate
- Ask rhetorical questions to engage readers
- Include asides and digressions when illuminating
- Balance informality with substance
- Write as if explaining to a curious friend
"""
}


# =============================================================================
# CLI
# =============================================================================

def create_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the polishing tool.

    Options are grouped into style selection (exactly one is required),
    output settings, and processing switches.
    """
    usage_examples = """
Examples:
  # Polish with style description
  python polish_draft.py chapter_01.md --style "academic but accessible"

  # Use predefined style
  python polish_draft.py chapter_01.md --preset narrative

  # Use style prompt from file
  python polish_draft.py chapter_01.md --prompt prompts/style.txt

  # Polish and validate
  python polish_draft.py chapter_01.md --style "engaging" --validate

  # Output to specific file
  python polish_draft.py chapter_01.md --style "concise" --output polished.md

Predefined styles: academic, accessible, narrative, concise, popular,
                   technical, journalistic, literary, textbook, conversational
        """

    cli = argparse.ArgumentParser(
        prog='polish_draft',
        description='Polish draft chapters with writing style prompts while preserving citations',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Positional input
    cli.add_argument('input', type=str, help='Path to draft markdown file')

    # Style selection: the four sources are mutually exclusive, one is required
    style_choice = cli.add_argument_group('Style Options').add_mutually_exclusive_group(required=True)
    style_specs = [
        (('--style', '-s'), dict(type=str, help='Style description (free text)')),
        (('--preset', '-p'), dict(type=str, choices=list(PREDEFINED_STYLES), help='Use predefined style')),
        (('--prompt',), dict(type=str, help='Path to file containing style prompt')),
        (('--reference-style',), dict(
            type=str,
            metavar='FILE',
            help='Path to a text file whose writing style should be mimicked (voice cloning)',
        )),
    ]
    for flags, options in style_specs:
        style_choice.add_argument(*flags, **options)

    # Output settings
    output_options = cli.add_argument_group('Output Options')
    output_specs = [
        (('--output', '-o'), dict(type=str, help='Output file path (default: input_polished.md)')),
        (('--format', '-f'), dict(
            type=str,
            choices=['text', 'json'],
            default='text',
            help='Output format for results summary',
        )),
    ]
    for flags, options in output_specs:
        output_options.add_argument(*flags, **options)

    # Processing switches: all plain on/off flags
    processing = cli.add_argument_group('Processing Options')
    switches = [
        ('--use-internal-model',
         'Use internal LLM to rewrite text (default: return suggestions for Claude Code)'),
        ('--no-preserve', 'Do not preserve citations (not recommended)'),
        ('--whole-file', 'Process entire file at once instead of section by section'),
        ('--validate', 'Run validation after polishing to check for claim changes'),
        ('--dry-run', 'Show what would be preserved without polishing'),
    ]
    for flag, description in switches:
        processing.add_argument(flag, action='store_true', help=description)

    return cli


def main():
    """Main entry point.

    Parses the command line, resolves the style prompt from whichever style
    option was given, and then runs one of three paths:

    * ``--dry-run``: list the elements that would be preserved and exit.
    * ``--use-internal-model``: call ``polish_file`` and report its result.
    * default: call ``analyze_file_for_suggestions`` and print the
      suggestions grouped by priority.

    Returns:
        Process exit code: 0 on success, 1 when an input/prompt/reference
        file is missing or the polish/analysis step raises.
    """
    parser = create_parser()
    args = parser.parse_args()

    input_path = Path(args.input)
    if not input_path.exists():
        print(f"Error: File not found: {input_path}")
        return 1

    # In JSON mode stdout must contain only the JSON document, so progress
    # messages are diverted to stderr; text mode keeps them on stdout.
    status_stream = sys.stderr if args.format == 'json' else sys.stdout

    # Determine style
    if args.preset:
        style = PREDEFINED_STYLES[args.preset]
    elif args.prompt:
        prompt_path = Path(args.prompt)
        if not prompt_path.exists():
            print(f"Error: Prompt file not found: {prompt_path}")
            return 1
        # Explicit UTF-8, matching how the reference-style file is read below,
        # instead of depending on the platform's locale encoding.
        with open(prompt_path, 'r', encoding='utf-8') as f:
            style = f.read()
    elif args.reference_style:
        # Analyze reference style file and generate style prompt
        ref_path = Path(args.reference_style)
        if not ref_path.exists():
            print(f"Error: Reference style file not found: {ref_path}")
            return 1
        with open(ref_path, 'r', encoding='utf-8') as f:
            reference_text = f.read()
        print(f"Analyzing writing style from: {ref_path}", file=status_stream)
        style = analyze_reference_style(reference_text)
        print("Style analysis complete. Polishing to match voice...", file=status_stream)
    else:
        style = args.style

    # Dry run - show what would be preserved
    if args.dry_run:
        with open(input_path, 'r', encoding='utf-8') as f:
            text = f.read()

        elements = identify_preserved_elements(text)
        print(f"\n{'='*60}")
        print(f"DRY RUN: {input_path}")
        print(f"{'='*60}")
        print(f"Would preserve {len(elements)} elements:\n")

        # Only the first 20 elements (in document order) are listed.
        for elem in sorted(elements, key=lambda x: x.start_pos)[:20]:
            preview = elem.original[:60].replace('\n', '\\n')
            print(f"  [{elem.element_type}] {preview}...")

        if len(elements) > 20:
            print(f"\n  ... and {len(elements) - 20} more")

        return 0

    # Determine mode: suggestion (default) vs polish (--use-internal-model)
    use_internal_model = getattr(args, 'use_internal_model', False)

    try:
        if use_internal_model:
            # DELEGATED MODE: Use internal LLM to polish the text
            output_path = Path(args.output) if args.output else None
            # Name shown to the user when no explicit --output was given.
            output_display = str(output_path) if output_path else f"{input_path.stem}_polished.md"

            result = polish_file(
                input_path=input_path,
                style=style,
                output_path=output_path,
                validate_after=args.validate
            )

            # Output results
            if args.format == 'json':
                output = {
                    'mode': 'polish',
                    'input': result.original_path,
                    'output': output_display,
                    'style': _truncate_for_display(result.style_used),
                    'preserved_elements': result.preserved_count,
                    'sections_polished': result.sections_polished,
                    'word_count': {
                        'before': result.word_count_before,
                        'after': result.word_count_after,
                        'change': result.word_count_after - result.word_count_before
                    },
                    # validation_passed is None when validation was not run.
                    'validation': {
                        'passed': result.validation_passed,
                        'issues': result.validation_issues
                    } if result.validation_passed is not None else None
                }
                print(json.dumps(output, indent=2))
            else:
                print(f"\n{'='*60}")
                print("POLISH COMPLETE (Internal Model)")
                print(f"{'='*60}")
                print(f"Input:  {result.original_path}")
                print(f"Output: {output_display}")
                print(f"\nPreserved: {result.preserved_count} citations/quotes")
                print(f"Sections:  {result.sections_polished} polished")
                print(f"Words:     {result.word_count_before} → {result.word_count_after} ({result.word_count_after - result.word_count_before:+d})")

                if result.validation_passed is not None:
                    print(f"\nValidation: {'PASSED' if result.validation_passed else 'ISSUES FOUND'}")
                    if result.validation_issues:
                        for issue in result.validation_issues:
                            print(f"  - {issue}")

        else:
            # DEFAULT MODE: Return suggestions for Claude Code to apply
            result = analyze_file_for_suggestions(
                input_path=input_path,
                style=style
            )

            # Output results
            if args.format == 'json':
                output = {
                    'mode': 'suggestions',
                    'input': result.original_path,
                    'style_target': _truncate_for_display(result.style_target),
                    'word_count': result.word_count,
                    'section_count': result.section_count,
                    'preserved_elements': result.preserved_elements,
                    'summary': result.summary,
                    'suggestions': result.suggestions,
                    'note': 'Claude Code should apply these suggestions to the original file'
                }
                print(json.dumps(output, indent=2))
            else:
                print(f"\n{'='*60}")
                print("POLISH SUGGESTIONS (for Claude Code)")
                print(f"{'='*60}")
                print(f"Input:  {result.original_path}")
                print(f"Words:  {result.word_count}")
                print(f"Sections: {result.section_count}")
                print(f"Elements to preserve: {result.preserved_elements}")
                print(f"\n{result.summary}\n")

                # Group by priority; anything without a recognised priority
                # is listed after the explicit "low" items.
                high = [s for s in result.suggestions if s.get('priority') == 'high']
                medium = [s for s in result.suggestions if s.get('priority') == 'medium']
                low = [s for s in result.suggestions if s.get('priority') == 'low']
                other = [s for s in result.suggestions if s.get('priority') not in ['high', 'medium', 'low']]

                _print_suggestion_group("HIGH PRIORITY:", high)
                _print_suggestion_group("MEDIUM PRIORITY:", medium)
                _print_suggestion_group("LOW PRIORITY:", low + other)

                print("\nNote: Run with --use-internal-model to have the CLI polish the file directly.")

        return 0

    except Exception as e:
        # Top-level boundary: report the failure in the requested format and
        # signal it through the exit code instead of a traceback.
        logger.error(f"Polishing failed: {e}")
        if args.format == 'json':
            print(json.dumps({'error': str(e)}))
        else:
            print(f"Error: {e}")
        return 1


def _truncate_for_display(text, limit=100):
    """Return ``text`` cut to ``limit`` characters plus '...' when it is longer."""
    return text[:limit] + '...' if len(text) > limit else text


def _print_suggestion_group(title, suggestions):
    """Print ``title`` and one line per suggestion; print nothing if the group is empty."""
    if not suggestions:
        return
    print(title)
    for s in suggestions:
        loc = f" (near: '{s['location'][:30]}...')" if s.get('location') else ""
        print(f"  [{s.get('category', 'general')}] {s['suggestion']}{loc}")
    print()


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())
