#!/usr/bin/env python3
"""
Comprehensive permit-to-job matching script.

Matches permit_sync records to scheduler jobs/customers using multiple strategies:
  1. TMK exact match
  2. Normalized address exact match
  3. Fuzzy address match (street number + street name)
  4. Street number + zip code match

Also backfills TMK from permit_sync into customers where a confident match exists.

Usage:
  python3 match_permits_to_jobs.py                # Run matching + backfill
  python3 match_permits_to_jobs.py --dry-run      # Preview only, no DB writes
  python3 match_permits_to_jobs.py --no-backfill  # Match only, skip TMK backfill
  python3 match_permits_to_jobs.py --verbose      # Detailed logging
"""

import argparse
import logging
import re
import sys
from datetime import datetime

import mysql.connector

log = logging.getLogger("permit_matcher")

# Connection settings for the production scheduler database.
# SECURITY NOTE(review): credentials are hard-coded in source control —
# move host/user/password into environment variables or a secrets store
# and rotate this password.
DB_CONFIG = {
    "host": "18.225.0.90",
    "user": "AEI_User",
    "password": "P@55w02d7777",
    "database": "mandhdesign_schedular",
}

# ---------------------------------------------------------------------------
# Address Normalization (improved)
# ---------------------------------------------------------------------------

# Street type abbreviation map — both directions collapse to abbreviation
STREET_TYPES = {
    "STREET": "ST", "STR": "ST",
    "AVENUE": "AVE", "AV": "AVE",
    "DRIVE": "DR", "DRV": "DR",
    "ROAD": "RD",
    "LANE": "LN",
    "COURT": "CT",
    "PLACE": "PL",
    "BOULEVARD": "BLVD",
    "CIRCLE": "CIR",
    "PARKWAY": "PKWY",
    "HIGHWAY": "HWY",
    "LOOP": "LP",
    "TERRACE": "TER", "TERR": "TER",
    "TRAIL": "TRL",
    "WAY": "WAY",
    "PATH": "PATH",
}

# Build regex: match full words only
_STREET_TYPE_PATTERN = re.compile(
    r'\b(' + '|'.join(
        # Include both the long forms AND the short forms that need normalizing
        list(STREET_TYPES.keys()) + [v for v in set(STREET_TYPES.values())]
    ) + r')\.?\b',
    re.IGNORECASE,
)

def _replace_street_type(m):
    word = m.group(1).upper().rstrip('.')
    return STREET_TYPES.get(word, word)


def normalize_address(addr):
    """Aggressively normalize an address string for matching.

    Steps, in order:
    - Uppercase and strip.
    - Drop a leading one-word area prefix ("KAHALA, 123 ...").  This MUST
      run before the comma split below — previously it ran after, so such
      addresses were reduced to just the area name and the street address
      was discarded (the prefix regex could never fire).
    - Drop everything after the first comma (city/state/zip in SF format).
    - Remove unit/apt/suite designators and anything following them.
    - Normalize street types (Street/St./ST -> ST, Place -> PL, ...).
    - Remove remaining periods/commas.
    - Strip leading zeros after a hyphenated street number: 45-0441 -> 45-441.
    - Collapse runs of whitespace.

    Letter suffixes on street numbers ("2123C") are left in place here;
    extract_street_number() separates them when needed.

    Returns "" for falsy input.
    """
    if not addr:
        return ""

    addr = str(addr).strip().upper()

    # Remove leading area names like "KAHALA, " before the actual address.
    # Only a single all-letter word before a comma qualifies, so a street
    # address starting with a number is never touched.
    addr = re.sub(r'^[A-Z]+,\s*', '', addr)

    # Remove everything after first comma (city, state, zip from SF format)
    if "," in addr:
        addr = addr.split(",")[0].strip()

    # Remove unit/apt/suite designators and everything after
    addr = re.sub(r'\s+(APT|UNIT|STE|SUITE|#|RM|ROOM|NO)\s*\.?\s*\S*.*$', '', addr)
    # Also handle a trailing "#A" form with no whitespace before the "#"
    addr = re.sub(r'\s*#\s*\S+\s*$', '', addr)

    # Normalize street types (Street→ST, Place→PL, etc.)
    addr = _STREET_TYPE_PATTERN.sub(_replace_street_type, addr)

    # Remove all punctuation (periods, commas)
    addr = re.sub(r'[.,]', '', addr)

    # Normalize leading zeros in street numbers: 45-0441 → 45-441, 91-0289 → 91-289
    addr = re.sub(r'^(\d+)-0+(\d)', r'\1-\2', addr)

    # Collapse whitespace
    addr = re.sub(r'\s+', ' ', addr).strip()

    return addr


def extract_street_number(addr):
    """Extract the street number from *addr*.

    Returns a ``(number, suffix)`` pair: hyphenated Hawaii-style numbers are
    kept intact with leading zeros after the hyphen stripped
    (``45-0441`` -> ``45-441``), and a single trailing letter is split off
    (``2123C`` -> ``("2123", "C")``).  Returns ``("", "")`` when no street
    number can be found or input is falsy.
    """
    if not addr:
        return "", ""
    addr = str(addr).strip().upper()
    # Match patterns like: 94-1044, 45-0441, 2123C, 2272A, 1134.
    # The lookahead accepts whitespace OR end-of-string — the original
    # required trailing whitespace, so a bare number like "1134" or
    # "2123C" yielded ("", "").
    m = re.match(r'^(\d+(?:-\d+)?)\s*([A-Z])?(?=\s|$)', addr)
    if not m:
        return "", ""
    num = m.group(1)
    suffix = m.group(2) or ""
    # Strip leading zeros after hyphen: 45-0441 → 45-441
    num = re.sub(r'^(\d+)-0+(\d)', r'\1-\2', num)
    return num, suffix


def extract_street_name(normalized_addr):
    """Return the street-name portion of an already-normalized address.

    Strips a leading street number (optionally hyphenated, optionally
    followed by a single-letter suffix) plus the whitespace after it.
    Returns "" for falsy input.
    """
    if not normalized_addr:
        return ""
    prefix = re.match(r'^\d+(?:-\d+)?\s*[A-Z]?\s+', normalized_addr)
    remainder = normalized_addr[prefix.end():] if prefix else normalized_addr
    return remainder.strip()


# ---------------------------------------------------------------------------
# Database
# ---------------------------------------------------------------------------

class Database:
    """Thin wrapper around a mysql.connector connection.

    When *dry_run* is True, write statements submitted through execute()
    become no-ops; reads (fetchall/fetchone) still hit the database.
    """

    def __init__(self, dry_run=False):
        self.conn = None        # live connection, set by connect()
        self.dry_run = dry_run  # when True, execute() does nothing

    def connect(self):
        """Open an autocommit connection using the module-level DB_CONFIG."""
        self.conn = mysql.connector.connect(**DB_CONFIG)
        self.conn.autocommit = True
        log.info("Connected to production DB (%s)", DB_CONFIG["host"])

    def close(self):
        """Close the connection if one was opened (safe to call when not)."""
        if self.conn:
            self.conn.close()

    def fetchall(self, sql, params=None):
        """Run a SELECT and return all rows as dicts."""
        cursor = self.conn.cursor(dictionary=True)
        try:
            cursor.execute(sql, params)
            return cursor.fetchall()
        finally:
            # Fix: cursors were previously never closed (leaked per call).
            cursor.close()

    def fetchone(self, sql, params=None):
        """Run a SELECT and return the first row as a dict (or None).

        Uses a buffered cursor so any extra rows are consumed client-side;
        otherwise mysql.connector raises "Unread result found" on the next
        statement when the query returned more than one row.
        """
        cursor = self.conn.cursor(dictionary=True, buffered=True)
        try:
            cursor.execute(sql, params)
            return cursor.fetchone()
        finally:
            cursor.close()

    def execute(self, sql, params=None):
        """Run a write statement; returns the cursor, or None in dry-run mode."""
        if self.dry_run:
            return None
        cursor = self.conn.cursor()
        cursor.execute(sql, params)
        return cursor


# ---------------------------------------------------------------------------
# Matching Engine
# ---------------------------------------------------------------------------

class PermitMatcher:
    """Matches permit_sync rows to scheduler jobs via TMK and address heuristics.

    Lifecycle: construct with a connected Database, then call run().
    run() builds the in-memory indexes itself; match_permit() assumes
    build_customer_index() AND build_tmk_index() were already called.
    """

    def __init__(self, db):
        self.db = db
        # Raw customer rows loaded by build_customer_index().
        self._customer_cache = None
        self._addr_index = None  # street_number → [(customer_id, normalized_addr, city, zip, tmk, address_raw)]
        # NOTE(review): self._tmk_index is created only in build_tmk_index();
        # calling match_permit() before that raises AttributeError.

    def build_customer_index(self):
        """Load all customers into memory and build an address index for fast matching."""
        log.info("Building customer address index...")

        # INNER JOIN on jobs: customers with no jobs at all are excluded
        # from matching.  job_ids is a DESC-ordered CSV, so the first id
        # is the most recent job.
        rows = self.db.fetchall("""
            SELECT c.id as customer_id, c.address, c.city, c.zip_code, c.cust_tmk,
                   GROUP_CONCAT(j.id ORDER BY j.id DESC) as job_ids
            FROM customers c
            JOIN jobs j ON j.customer_id = c.id
            WHERE c.address IS NOT NULL AND c.address != ''
            GROUP BY c.id
        """)

        self._customer_cache = rows
        self._addr_index = {}

        for row in rows:
            norm = normalize_address(row["address"])
            if not norm:
                continue

            # Customers whose address has no parseable street number cannot
            # be address-matched and are skipped (still in _customer_cache
            # for the TMK index).
            street_num, suffix = extract_street_number(row["address"])
            if not street_num:
                continue

            entry = {
                "customer_id": row["customer_id"],
                "normalized": norm,
                "street_name": extract_street_name(norm),
                "city": (row["city"] or "").strip().upper(),
                "zip": (str(row["zip_code"]) if row["zip_code"] else "").strip(),
                "tmk": (row["cust_tmk"] or "").strip(),
                "address_raw": row["address"],
                "job_ids": row["job_ids"],
                "suffix": suffix,
            }

            # Bucket by street number; several customers can share one.
            if street_num not in self._addr_index:
                self._addr_index[street_num] = []
            self._addr_index[street_num].append(entry)

        log.info("  Indexed %d customers with %d unique street numbers",
                 len(rows), len(self._addr_index))

    def build_tmk_index(self):
        """Build TMK → customer mapping.

        Precondition: build_customer_index() has run (_customer_cache set).
        The literal string "NULL" is treated as missing, matching how the
        column is populated upstream.
        """
        self._tmk_index = {}
        for row in self._customer_cache:
            tmk = (row["cust_tmk"] or "").strip()
            if tmk and tmk != "NULL":
                if tmk not in self._tmk_index:
                    self._tmk_index[tmk] = []
                self._tmk_index[tmk].append(row)
        log.info("  TMK index: %d unique TMKs across %d customers",
                 len(self._tmk_index),
                 sum(len(v) for v in self._tmk_index.values()))

    def match_permit(self, permit):
        """Try to match a permit to a job. Returns (job_id, match_method, confidence, details).

        Strategies, tried in order (first hit wins):
          1. tmk_exact           (100) — permit TMK found in the TMK index
          2. addr_exact          (95)  — normalized addresses equal
          3. addr_fuzzy          (70-100) — same street number + street name,
                                  with zip/city/suffix agreement bonuses
          4. addr_fuzzy_nosuffix (65-80) — same number + name, letter suffix
                                  on the permit's street number ignored

        Returns (0, "none", 0, "") when nothing matches.
        """

        sf_addr = permit["address_display"] or ""
        sf_tmk = (permit["parcel_tmk"] or "").strip()

        # Parse SF address: street part is everything before the first comma.
        sf_street = sf_addr.split(",")[0].strip() if sf_addr else ""
        sf_zip = ""
        if sf_addr:
            # First 5-digit run anywhere in the string is taken as the zip.
            zip_match = re.search(r'\b(\d{5})\b', sf_addr)
            if zip_match:
                sf_zip = zip_match.group(1)
        sf_city = ""
        if "," in sf_addr:
            parts = sf_addr.split(",")
            if len(parts) >= 2:
                sf_city = parts[1].strip().upper()
                # Remove "/ Area" suffix from city: "Honolulu / Makiki" → "Honolulu"
                sf_city = re.sub(r'\s*/\s*.*$', '', sf_city)

        norm_sf = normalize_address(sf_street)
        sf_street_name = extract_street_name(norm_sf)
        sf_num, sf_suffix = extract_street_number(sf_street)

        # --- Strategy 1: TMK exact match ---
        if sf_tmk and sf_tmk in self._tmk_index:
            customers = self._tmk_index[sf_tmk]
            # Pick the most recent job across all customers sharing this TMK
            # (highest job id wins).
            best_job = 0
            best_cust = None
            for c in customers:
                jobs = c["job_ids"].split(",") if c["job_ids"] else []
                if jobs:
                    jid = int(jobs[0])  # Most recent (DESC order)
                    if jid > best_job:
                        best_job = jid
                        best_cust = c
            if best_job:
                return best_job, "tmk_exact", 100, f"TMK {sf_tmk} → cust {best_cust['customer_id']}"

        # --- Strategy 2: Normalized address exact match ---
        if sf_num and sf_num in self._addr_index:
            candidates = self._addr_index[sf_num]

            # First exact normalized-string match wins.
            for c in candidates:
                if norm_sf == c["normalized"]:
                    best_job = int(c["job_ids"].split(",")[0]) if c["job_ids"] else 0
                    if best_job:
                        return best_job, "addr_exact", 95, f"'{sf_street}' == '{c['address_raw']}'"

        # --- Strategy 3: Street name match (ignore suffix letter differences) ---
        if sf_num and sf_street_name and sf_num in self._addr_index:
            candidates = self._addr_index[sf_num]
            best_job = 0
            best_score = 0
            best_detail = ""

            for c in candidates:
                score = 0

                # Street name must match
                if sf_street_name != c["street_name"]:
                    continue

                score = 70  # Base score for street name match

                # Zip code bonus
                if sf_zip and c["zip"] and sf_zip == c["zip"]:
                    score += 15

                # City bonus
                if sf_city and c["city"]:
                    if sf_city == c["city"].upper():
                        score += 10
                    # Handle abbreviations: "Hon" == "Honolulu"
                    elif sf_city.startswith(c["city"].upper()[:3]) or c["city"].upper().startswith(sf_city[:3]):
                        score += 5

                # Suffix match bonus
                if sf_suffix and c["suffix"] and sf_suffix == c["suffix"]:
                    score += 5

                # Keep the highest-scoring candidate (ties: first seen wins).
                if score > best_score:
                    best_score = score
                    jid = int(c["job_ids"].split(",")[0]) if c["job_ids"] else 0
                    best_job = jid
                    best_detail = f"'{sf_street}' ~ '{c['address_raw']}' (zip:{sf_zip}={c['zip']}, city:{sf_city}={c['city']})"

            # best_score is always >= 70 when set; the guard documents the floor.
            if best_job and best_score >= 70:
                return best_job, "addr_fuzzy", best_score, best_detail

        # --- Strategy 4: Try without suffix on street number (e.g., 2123C → 2123) ---
        if sf_num and sf_suffix:
            base_num = sf_num  # Already stripped; the suffix is separate
            if base_num in self._addr_index and sf_street_name:
                candidates = self._addr_index[base_num]
                for c in candidates:
                    if sf_street_name == c["street_name"]:
                        jid = int(c["job_ids"].split(",")[0]) if c["job_ids"] else 0
                        if jid:
                            score = 65
                            if sf_zip and c["zip"] and sf_zip == c["zip"]:
                                score += 15
                            return jid, "addr_fuzzy_nosuffix", score, f"'{sf_street}' ~ '{c['address_raw']}' (ignored suffix)"

        # --- Strategy 5: Try matching with common number variations ---
        # Some addresses have off-by-small-amount numbers on same street
        # Skip this — too risky for false positives

        return 0, "none", 0, ""

    def run(self, dry_run=False, backfill_tmk=True):
        """Run matching on all unmatched permits.

        Builds the indexes, matches every permit (including already-matched
        ones, to allow upgrades), optionally backfills customer TMKs from
        confident address matches, then logs a summary.  Returns the stats
        dict.  Note: dry_run is enforced both here and inside
        Database.execute() — a redundant but harmless double guard.
        """

        self.build_customer_index()
        self.build_tmk_index()

        # Get ALL permits (including already-matched, to re-evaluate)
        permits = self.db.fetchall("""
            SELECT id, permit_number, address_display, parcel_tmk, job_id
            FROM permit_sync
            WHERE address_display IS NOT NULL AND address_display != ''
        """)
        log.info("Processing %d permits for matching", len(permits))

        stats = {
            "total": len(permits),
            "already_matched": 0,
            "newly_matched": 0,
            "upgraded": 0,  # Better match found for already-matched
            "unmatched": 0,
            "by_method": {},
            "confidence_buckets": {"90-100": 0, "80-89": 0, "70-79": 0, "60-69": 0},
        }

        unmatched_samples = []
        matched_details = []

        for p in permits:
            job_id, method, confidence, detail = self.match_permit(p)

            if job_id:
                bucket = "90-100" if confidence >= 90 else "80-89" if confidence >= 80 else "70-79" if confidence >= 70 else "60-69"
                stats["confidence_buckets"][bucket] += 1
                stats["by_method"][method] = stats["by_method"].get(method, 0) + 1

                if p["job_id"] and p["job_id"] > 0:
                    if p["job_id"] != job_id:
                        stats["upgraded"] += 1
                        log.debug("  UPGRADE %s: job %d → %d (%s, %d%%)",
                                  p["permit_number"], p["job_id"], job_id, method, confidence)
                    else:
                        stats["already_matched"] += 1
                else:
                    stats["newly_matched"] += 1
                    matched_details.append({
                        "permit": p["permit_number"],
                        "job_id": job_id,
                        "method": method,
                        "confidence": confidence,
                        "detail": detail,
                    })
                    log.info("  MATCH %s → job %d [%s %d%%] %s",
                             p["permit_number"], job_id, method, confidence, detail[:80])

                # Update job_id if it changed (covers both new matches, where
                # the stored job_id is NULL/0, and upgrades).
                if not dry_run and (p["job_id"] != job_id):
                    self.db.execute(
                        "UPDATE permit_sync SET job_id = %s, updated = NOW() WHERE id = %s",
                        (job_id, p["id"]),
                    )
            else:
                stats["unmatched"] += 1
                # Keep a capped sample of unmatched permits for the summary.
                if len(unmatched_samples) < 30:
                    unmatched_samples.append({
                        "permit": p["permit_number"],
                        "address": p["address_display"],
                        "tmk": p["parcel_tmk"],
                    })

        # --- TMK Backfill ---
        tmk_backfilled = 0
        if backfill_tmk:
            log.info("")
            log.info("=" * 60)
            log.info("TMK Backfill: Updating customers with TMK from permit data")
            log.info("=" * 60)

            # For each confident address match where customer has no TMK but permit has TMK
            for p in permits:
                sf_tmk = (p["parcel_tmk"] or "").strip()
                if not sf_tmk:
                    continue

                # NOTE(review): match_permit() is called a second time per
                # permit here; results are deterministic so the outcome is
                # the same, but caching the first pass would halve the work.
                job_id, method, confidence, detail = self.match_permit(p)

                # Only backfill from address-based matches with good confidence
                if job_id and method.startswith("addr") and confidence >= 80:
                    # Get the customer for this job (one DB round-trip per
                    # confident match).
                    cust = self.db.fetchone("""
                        SELECT c.id, c.cust_tmk, c.address
                        FROM customers c
                        JOIN jobs j ON j.customer_id = c.id
                        WHERE j.id = %s
                    """, (job_id,))

                    # Only fill empty/placeholder TMKs; never overwrite real ones.
                    if cust and (not cust["cust_tmk"] or cust["cust_tmk"].strip() in ("", "NULL")):
                        tmk_backfilled += 1
                        log.info("  TMK BACKFILL: customer %d (%s) ← TMK %s (from %s, %d%%)",
                                 cust["id"], cust["address"], sf_tmk, method, confidence)
                        if not dry_run:
                            self.db.execute(
                                "UPDATE customers SET cust_tmk = %s WHERE id = %s",
                                (sf_tmk, cust["id"]),
                            )

        # --- Summary ---
        log.info("")
        log.info("=" * 60)
        log.info("MATCHING RESULTS SUMMARY")
        log.info("=" * 60)
        log.info("  Total permits:       %d", stats["total"])
        log.info("  Already matched:     %d", stats["already_matched"])
        log.info("  Newly matched:       %d", stats["newly_matched"])
        log.info("  Match upgraded:      %d", stats["upgraded"])
        log.info("  Still unmatched:     %d", stats["unmatched"])
        log.info("")
        total_matched = stats["already_matched"] + stats["newly_matched"] + stats["upgraded"]
        match_rate = (total_matched / stats["total"] * 100) if stats["total"] else 0
        log.info("  MATCH RATE:          %.1f%% (%d / %d)", match_rate, total_matched, stats["total"])
        log.info("")
        log.info("  By method:")
        for method, count in sorted(stats["by_method"].items(), key=lambda x: -x[1]):
            log.info("    %-25s %d", method, count)
        log.info("")
        log.info("  By confidence:")
        for bucket, count in sorted(stats["confidence_buckets"].items(), reverse=True):
            log.info("    %-10s %d", bucket, count)

        if tmk_backfilled:
            log.info("")
            log.info("  TMK backfilled:      %d customers updated", tmk_backfilled)

        if unmatched_samples:
            log.info("")
            log.info("=" * 60)
            log.info("UNMATCHED SAMPLES (first %d):", len(unmatched_samples))
            log.info("=" * 60)
            for u in unmatched_samples:
                log.info("  %s | %s | TMK: %s", u["permit"], u["address"], u["tmk"])

        if dry_run:
            log.info("")
            log.info("*** DRY RUN — no changes written to database ***")

        return stats


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    """CLI entry point: parse flags, configure logging, run the matcher."""
    parser = argparse.ArgumentParser(description="Match DPP permits to scheduler jobs")
    parser.add_argument("--dry-run", action="store_true", help="Preview only, no DB writes")
    parser.add_argument("--no-backfill", action="store_true", help="Skip TMK backfill")
    parser.add_argument("--verbose", action="store_true", help="Debug-level logging")
    args = parser.parse_args()

    # Mirror all output to stdout and a scratch log file.
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        handlers=[
            logging.StreamHandler(sys.stdout),
            logging.FileHandler("/tmp/permit_match.log", mode="w"),
        ],
    )

    log.info("=" * 60)
    log.info("Permit-to-Job Matching Engine")
    log.info("  dry_run=%s, backfill_tmk=%s", args.dry_run, not args.no_backfill)
    log.info("=" * 60)

    db = Database(dry_run=args.dry_run)
    db.connect()

    # Always release the connection, even if matching raises.
    try:
        PermitMatcher(db).run(dry_run=args.dry_run, backfill_tmk=not args.no_backfill)
    finally:
        db.close()


if __name__ == "__main__":
    main()
