#!/usr/bin/env python3
"""
DPP Permit Match - Link SF permits to AEI jobs via TMK and address matching.

This script matches permit_sync records (from Salesforce) to AEI jobs/customers
by trying:
  1. TMK match (normalized 8-digit TMK comparison)
  2. Address match (normalized SF address vs normalized AEI address)

When a match is found:
  - Sets permit_sync.job_id to the matched job
  - If the AEI customer is missing TMK but the SF permit has it,
    backfills customers.cust_tmk

Usage:
  python3 dpp_permit_match.py                  # Dry run (default)
  python3 dpp_permit_match.py --apply          # Apply matches to DB
  python3 dpp_permit_match.py --verbose        # Detailed logging
  python3 dpp_permit_match.py --limit 10       # Process first N unmatched
"""

import argparse
import logging
import re
import sys

import mysql.connector

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# NOTE(review): live credentials are hard-coded and committed in source.
# These should be loaded from environment variables or a config file kept
# out of version control, and the committed password rotated.
DB_CONFIG = {
    "host": "18.225.0.90",
    "user": "AEI_User",
    "password": "P@55w02d7777",
    "database": "mandhdesign_schedular",
}

# Module-level logger; handlers/level are configured by setup_logging().
log = logging.getLogger("dpp_match")


def setup_logging(verbose=False):
    """Configure root logging: DEBUG when verbose, INFO otherwise."""
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )


# ---------------------------------------------------------------------------
# TMK normalization
# ---------------------------------------------------------------------------

def normalize_tmk(tmk):
    """Extract the core 8-digit TMK from various formats.

    AEI TMK formats seen:
      '29066059'          → 29066059  (already 8 digits)
      '29066059:0000'     → 29066059  (8 digits + colon + condo unit)
      '440130330008'      → 44013033  (first 8 digits, trailing = unit)
      ' 94149095'         → 94149095  (leading spaces)
      '91092053  '        → 91092053  (trailing spaces)
      '1-1-1-038-071-0000'→ 11038071  (division-zone-section-plat-parcel-unit)
      '9-4-007:085 (0022)'→ 94007085  (zone-section-plat:parcel, unit in parens)

    SF TMKs are always clean 8-digit strings.

    Returns "" when no plausible TMK can be extracted.
    """
    if not tmk:
        return ""
    tmk = tmk.strip()
    if not tmk:
        return ""

    # Dashed Hawaiian formats need group-aware handling: the dash/colon
    # separators delimit zone-section-plat-parcel (optionally with a leading
    # division digit and/or a trailing 4-digit condo unit).  Blindly taking
    # the first 8 digits would keep the division digit and drop a parcel
    # digit, and splitting on ':' first would lose the parcel entirely in
    # forms like '9-4-007:085'.
    if '-' in tmk:
        head = re.split(r'[(\s]', tmk)[0]    # drop " (0022)" style suffixes
        groups = re.findall(r'\d+', head)
        if len(groups) > 4 and len(groups[-1]) == 4:
            groups = groups[:-1]             # trailing 4-digit condo unit
        if len(groups) > 4:
            groups = groups[1:]              # leading division digit
        if len(groups) == 4:
            # zone(1) + section(1) + plat(3) + parcel(3) = 8 digits
            candidate = (groups[0] + groups[1]
                         + groups[2].zfill(3) + groups[3].zfill(3))
            if len(candidate) == 8:
                return candidate
        # Otherwise fall through to the generic digit handling below.

    # Strip everything after colon (condo unit), parentheses, spaces
    tmk = re.split(r'[:\s(]', tmk)[0].strip()

    # Keep digits only (also removes any leftover dashes)
    digits_only = re.sub(r'[^0-9]', '', tmk)

    if not digits_only:
        return ""

    # Exactly 8 digits: already the core TMK
    if len(digits_only) == 8:
        return digits_only

    # More than 8 digits: the first 8 are the TMK, the rest is a unit/condo
    if len(digits_only) > 8:
        return digits_only[:8]

    # 6-7 digits: salvage by right-padding with zeros; fewer is junk
    if len(digits_only) >= 6:
        return digits_only.ljust(8, '0')

    return ""


# ---------------------------------------------------------------------------
# Address normalization
# ---------------------------------------------------------------------------

# Abbreviation map: normalize everything to the short form.
# Identity entries (WAY, LOOP, PATH) exist so those words are still
# recognized as street types by consumers of this table.
STREET_ABBREVS = {
    'STREET': 'ST', 'AVENUE': 'AVE', 'DRIVE': 'DR', 'ROAD': 'RD',
    'LANE': 'LN', 'COURT': 'CT', 'PLACE': 'PL', 'BOULEVARD': 'BLVD',
    'CIRCLE': 'CIR', 'PARKWAY': 'PKWY', 'HIGHWAY': 'HWY', 'TERRACE': 'TER',
    'TRAIL': 'TRL', 'WAY': 'WAY', 'LOOP': 'LOOP', 'PATH': 'PATH',
}

# Hawaiian city names commonly appended to SF addresses
HAWAII_CITIES = {
    'HONOLULU', 'KAILUA', 'KANEOHE', 'KAPOLEI', 'EWA BEACH', 'AIEA',
    'PEARL CITY', 'MILILANI', 'WAIPAHU', 'WAHIAWA', 'HALEIWA', 'LAIE',
    'HAUULA', 'KAAAWA', 'WAIANAE', 'MAKAHA', 'NANAKULI', 'WAIMANALO',
    'HAWAII KAI', 'KAHALA', 'MANOA', 'MAKIKI', 'MOILIILI', 'KAPAHULU',
    'WAIKIKI', 'KAHUKU', 'WAIALUA', 'KUNIA', 'WAIPIO', 'KAHALUU',
    'AHUIMANU', 'HEEIA', 'EWA', 'OCEAN POINTE', 'HOOPILI', 'KALAELOA',
    'SCHOFIELD', 'WHEELER', 'IROQUOIS', 'SALT LAKE', 'MOANALUA',
    'FOSTER VILLAGE', 'NEWTOWN', 'VILLAGE PARK', 'WAIKELE', 'CRESTVIEW',
    'PACIFIC PALISADES', 'MAKAKILO', 'ROYAL KUNIA', 'WAIALAE',
    'DOWNTOWN', 'WAIALAE KAHALA', 'KAIMUKI', 'PALOLO', 'ST LOUIS',
}


def normalize_address(addr):
    """Normalize an address for comparison. Returns uppercase string.

    Steps: uppercase; strip SF prefixes like "{PV} (PC)"; drop
    parenthetical suffixes and unit/apt designators; shorten directionals
    (EAST → E); canonicalize street types to their abbreviation
    (STREET → ST); remove punctuation and collapse whitespace.
    """
    if not addr:
        return ""
    addr = addr.strip().upper()

    # Remove SF prefixes like "{PV} (PC)", "{RM}", etc.
    addr = re.sub(r'^\{[^}]*\}\s*(\([^)]*\)\s*)*', '', addr)

    # Remove parenthetical suffixes like "(LOWER HOUSE)"
    addr = re.sub(r'\s*\([^)]*\)\s*', ' ', addr)

    # Remove unit/apt/suite suffixes.  The (?![A-Z]) guard stops a
    # designator from matching into a street name that merely starts with
    # it (e.g. "STEVENS" via STE, "ROOMY" via ROOM) while still allowing
    # glued forms like "APT5"; '#' is handled without a guard.
    addr = re.sub(r'\s+(?:(?:APT|UNIT|STE|SUITE|RM|ROOM)(?![A-Z])|#)\s*\.?\s*\S*', '', addr)

    # Normalize directional prefixes
    addr = re.sub(r'\bEAST\b', 'E', addr)
    addr = re.sub(r'\bWEST\b', 'W', addr)
    addr = re.sub(r'\bNORTH\b', 'N', addr)
    addr = re.sub(r'\bSOUTH\b', 'S', addr)

    # Normalize street type abbreviations (long form → short form)
    for long_form, short_form in STREET_ABBREVS.items():
        addr = re.sub(r'\b' + long_form + r'\b\.?', short_form, addr)
    # Also handle abbreviated forms with trailing periods (e.g., "St." → "ST")
    for short_form in STREET_ABBREVS.values():
        addr = re.sub(r'\b' + short_form + r'\.', short_form, addr)

    # Remove punctuation and collapse whitespace
    addr = re.sub(r'[.,\'#]', '', addr)
    addr = re.sub(r'\s+', ' ', addr).strip()
    return addr


def normalize_street_number(num):
    """Strip leading zeros from each dash-separated part of a street number.

    "45-0248" → "45-248", "91-1501" → "91-1501", "503" → "503"
    """
    if not num:
        return num
    if '-' not in num:
        return num
    # An all-zero segment collapses to a single "0" rather than "".
    segments = [piece.lstrip('0') or '0' for piece in num.split('-')]
    return '-'.join(segments)


def extract_street_number(addr):
    """Extract leading street number, including dash-prefixed Hawaiian addresses.

    Examples:
      "503 ANOLANI ST"      → "503"
      "91-1501 HALAHUA ST"  → "91-1501"
      "2578-C PACIFIC HTS"  → "2578"
      "45-0248A PAHIKAUA"   → "45-248"
    """
    if not addr:
        return ""
    # Leading digits, optionally followed by a dash and more digits.
    match = re.match(r'^(\d+(?:-\d+)?)', addr.strip())
    return normalize_street_number(match.group(1)) if match else ""


def strip_city_from_sf_street(street):
    """Remove a city name appended to an SF street address.

    SF format: "503 ANOLANI ST Honolulu" or "91-1501 HALAHUA ST Kapolei"
    We want: "503 ANOLANI ST" or "91-1501 HALAHUA ST"
    """
    if not street:
        return street
    upper = street.upper()

    # Pass 1: drop a known Hawaiian city from the end.  Longest names are
    # tried first so "EWA BEACH" wins over "EWA".
    for city in sorted(HAWAII_CITIES, key=len, reverse=True):
        if upper.endswith(' ' + city):
            return street[:-(len(city) + 1)].strip()

    # Pass 2 heuristic: in "<num> <name> <street-type> <word>", when the
    # penultimate word is a street type and the final word is not, the
    # final word is presumably a city — drop it.
    words = street.split()
    if len(words) >= 3:
        type_words = set(STREET_ABBREVS) | set(STREET_ABBREVS.values())
        tail = words[-1].upper().rstrip('.')
        before_tail = words[-2].upper().rstrip('.')
        if before_tail in type_words and tail not in type_words:
            return ' '.join(words[:-1])

    return street


def extract_street_words(addr):
    """Extract the street name words (after the number) from a normalized address.

    "91-1501 HALAHUA ST"     → ["HALAHUA", "ST"]
    "503 ANOLANI ST"         → ["ANOLANI", "ST"]
    "2578-C PACIFIC HTS RD"  → ["PACIFIC", "HTS", "RD"]
    "2578C PACIFIC HTS RD"   → ["PACIFIC", "HTS", "RD"]
    "47-496 A AHUIMANU RD"   → ["AHUIMANU", "RD"]  (skip single-letter unit)
    """
    if not addr:
        return []
    # Strip the leading number (with optional "-digits" and/or a letter suffix).
    match = re.match(r'^\d+(?:-\d+)?(?:-?[A-Z])?\s+(.+)', addr.strip().upper())
    if not match:
        return []
    words = match.group(1).split()
    # Drop leading one-letter unit designators ("A", "B", "C", ...).
    drop = 0
    while drop < len(words) and len(words[drop]) == 1 and words[drop].isalpha():
        drop += 1
    return words[drop:]


# ---------------------------------------------------------------------------
# Matching logic
# ---------------------------------------------------------------------------

def run_matching(args):
    """Match unmatched permit_sync rows to AEI jobs by TMK, then by address.

    args: argparse.Namespace with .apply (write matches to the DB),
    .limit (max permits to process) and .verbose (debug logging).
    In dry-run mode (the default) nothing is written — results are only
    logged.  With --apply, permit_sync.job_id is set on each match and
    customers.cust_tmk is backfilled when the customer lacks a usable TMK.
    """
    setup_logging(args.verbose)
    log.info("=" * 60)
    log.info("DPP Permit Match (apply=%s)", args.apply)
    log.info("=" * 60)

    conn = mysql.connector.connect(**DB_CONFIG)
    cursor = conn.cursor(dictionary=True)

    # Get unmatched permits with address/TMK data
    sql = """
        SELECT id, permit_number, address_display, parcel_tmk, job_id
        FROM permit_sync
        WHERE job_id = 0
    """
    if args.limit:
        # args.limit comes from argparse with type=int, so interpolating it
        # into the SQL text is safe (it cannot be attacker-controlled text).
        sql += f" LIMIT {args.limit}"

    cursor.execute(sql)
    unmatched = cursor.fetchall()
    log.info("Found %d unmatched permits", len(unmatched))

    # Pre-load AEI customer/job data (all jobs with addresses)
    cursor.execute("""
        SELECT c.id as customer_id, c.cust_tmk, c.address, c.city, c.zip_code,
               j.id as job_id, j.building_permit
        FROM customers c
        JOIN jobs j ON j.customer_id = c.id
        WHERE c.address IS NOT NULL AND c.address != ''
    """)
    aei_jobs = cursor.fetchall()
    log.info("Loaded %d AEI jobs with addresses", len(aei_jobs))

    # Build TMK index using normalized 8-digit TMKs
    tmk_index = {}
    for row in aei_jobs:
        norm = normalize_tmk(row["cust_tmk"])
        if norm:
            tmk_index.setdefault(norm, []).append(row)

    # Build address index: street_number → list of rows
    addr_index = {}
    for row in aei_jobs:
        num = extract_street_number(row["address"])
        if num:
            addr_index.setdefault(num, []).append(row)

    log.info("TMK index: %d unique TMKs, Address index: %d unique street numbers",
             len(tmk_index), len(addr_index))

    # Stats
    stats = {
        "total": len(unmatched),
        "matched_tmk": 0,
        "matched_addr": 0,
        "tmk_backfilled": 0,
        "no_match": 0,
        "no_data": 0,
        "no_real_addr": 0,  # has description text, not a street address
    }

    for p in unmatched:
        permit_id = p["id"]
        permit_name = p["permit_number"]
        sf_tmk = normalize_tmk(p["parcel_tmk"])
        sf_addr = (p["address_display"] or "").strip()

        if not sf_tmk and not sf_addr:
            stats["no_data"] += 1
            continue

        matched_job_id = 0
        matched_customer_id = 0
        match_method = None

        # Strategy 1: TMK match (normalized 8-digit)
        if sf_tmk and sf_tmk in tmk_index:
            candidates = tmk_index[sf_tmk]
            # Several jobs can share a TMK; tie-break on the highest job id
            # (presumably the most recent job — TODO confirm id ordering).
            best = max(candidates, key=lambda r: r["job_id"])
            matched_job_id = best["job_id"]
            matched_customer_id = best["customer_id"]
            match_method = "TMK"
            stats["matched_tmk"] += 1
            log.debug("  TMK match: %s (TMK %s) -> job %d (%s)",
                      permit_name, sf_tmk, matched_job_id, best["address"])

        # Strategy 2: Address match
        if not matched_job_id and sf_addr:
            # Clean SF address: take street part before comma, strip city
            sf_street = sf_addr.split(",")[0].strip()
            # Remove SF prefixes like "{PV} (PC)"
            sf_street = re.sub(r'^\{[^}]*\}\s*(\([^)]*\)\s*)*', '', sf_street).strip()
            # Strip city name from end
            sf_street = strip_city_from_sf_street(sf_street)

            # Detect description text masquerading as address
            # e.g., "20 PV INVERTER COUNT: 20", "100 AMPS TO 200 AMPS",
            #        "2020-0040 - (1) 4x10 panels", "30 modules WITH..."
            is_description = False
            if re.match(r'^\d+\s*(PV\b|AMPS?\b|module|panel|INVERTER|TO\s+EXISTING|'
                        r'BATTERY|SOLAR|WITH\b|GALLON\b|FT\b|A\s+TO\s+\d)', sf_street, re.I):
                is_description = True
            if re.match(r'^\d+A?\s+TO\s+\d', sf_street, re.I):  # "100A TO 200A..."
                is_description = True
            if re.match(r'^\d+-\d+\s*-\s*\(', sf_street):  # "2020-0040 - (1) 4x10..."
                is_description = True
            if re.match(r'^\d+\s*&[GL]T;', sf_street):  # "35 &GT;"
                is_description = True
            if re.match(r'^\d+\s+ON\s+EXISTING\b', sf_street, re.I):
                is_description = True
            if is_description:
                stats["no_real_addr"] += 1
                continue
            if ']' in sf_street[:20]:  # "172066 ] ADAMS" or "[TMK:..." remnant
                bracket_part = sf_street.split(']', 1)
                if len(bracket_part) == 2 and bracket_part[1].strip():
                    sf_street = bracket_part[1].strip()
                else:
                    stats["no_real_addr"] += 1
                    continue
            if 'JOB CANCELLED' in sf_street.upper() or '***' in sf_street:
                # Cancelled/annotated rows sometimes still carry a usable
                # "...] <address>" tail; salvage it when present.
                tmk_addr = re.search(r'\]\s*(.+)', sf_street)
                if tmk_addr:
                    sf_street = tmk_addr.group(1).strip()
                else:
                    stats["no_real_addr"] += 1
                    continue

            sf_num = extract_street_number(sf_street)
            norm_sf = normalize_address(sf_street)
            sf_words = extract_street_words(norm_sf)

            if sf_num and sf_num in addr_index:
                candidates = addr_index[sf_num]
                best_score = 0
                best_row = None

                for row in candidates:
                    norm_aei = normalize_address(row["address"])
                    if not norm_aei:
                        continue

                    aei_words = extract_street_words(norm_aei)
                    score = 0

                    # Exact normalized match (after city stripping)
                    if norm_sf == norm_aei:
                        score = 10
                    elif sf_words and aei_words:
                        # Count matching street name words (ignoring street type suffix)
                        # e.g., sf=["HALAHUA", "ST"] vs aei=["HALAHUA", "ST"]
                        matching = 0
                        min_len = min(len(sf_words), len(aei_words))
                        for i in range(min_len):
                            if sf_words[i] == aei_words[i]:
                                matching += 1
                            else:
                                break

                        if matching >= 2:
                            # Number + 2+ street words match (e.g., "91-1501 HALAHUA ST")
                            score = 8
                        elif matching == 1:
                            # Number + 1 street word (e.g., "589 KAWAILOA")
                            # This is solid if the word is distinctive (not "N", "S", etc.)
                            word = sf_words[0]
                            if len(word) >= 3 and word not in ('THE', 'AND', 'AVE', 'ST', 'DR', 'RD', 'LN', 'CT', 'PL'):
                                score = 6
                            else:
                                score = 3

                    if score > best_score:
                        best_score = score
                        best_row = row

                # Threshold 5 admits scores 10, 8 and 6 but rejects the
                # weak single-common-word case (score 3).
                if best_row and best_score >= 5:
                    matched_job_id = best_row["job_id"]
                    matched_customer_id = best_row["customer_id"]
                    match_method = f"ADDRESS(score={best_score})"
                    stats["matched_addr"] += 1
                    log.debug("  Address match: %s (%s) -> job %d (%s) [score=%d]",
                              permit_name, sf_street[:40], matched_job_id,
                              best_row["address"][:40], best_score)

        if not matched_job_id:
            stats["no_match"] += 1
            if args.verbose:
                log.debug("  No match: %s (TMK=%s, addr=%s)",
                          permit_name, sf_tmk, sf_addr[:50] if sf_addr else "")
            continue

        # Apply changes if --apply
        if args.apply:
            cursor.execute(
                "UPDATE permit_sync SET job_id = %s WHERE id = %s",
                (matched_job_id, permit_id)
            )

            # Backfill TMK to customer if missing
            if sf_tmk and matched_customer_id:
                cursor.execute(
                    "SELECT cust_tmk FROM customers WHERE id = %s",
                    (matched_customer_id,)
                )
                cust = cursor.fetchone()
                # Only backfill when the stored TMK normalizes to nothing
                # (NULL, empty, or junk) — never overwrite a usable TMK.
                if cust and not normalize_tmk(cust["cust_tmk"]):
                    cursor.execute(
                        "UPDATE customers SET cust_tmk = %s WHERE id = %s",
                        (sf_tmk, matched_customer_id)
                    )
                    stats["tmk_backfilled"] += 1
                    log.info("    Backfilled TMK %s -> customer %d",
                             sf_tmk, matched_customer_id)

            # Commit per permit so earlier matches survive an interruption.
            conn.commit()

    # Summary
    total_matched = stats["matched_tmk"] + stats["matched_addr"]
    log.info("=" * 60)
    log.info("Matching complete!")
    log.info("  Total unmatched:     %d", stats["total"])
    log.info("  Matched by TMK:      %d", stats["matched_tmk"])
    log.info("  Matched by address:  %d", stats["matched_addr"])
    log.info("  TMK backfilled:      %d", stats["tmk_backfilled"])
    log.info("  No match found:      %d", stats["no_match"])
    log.info("  No real address:     %d (description text, TMK-only)", stats["no_real_addr"])
    log.info("  No address/TMK data: %d", stats["no_data"])
    log.info("  Total matched:       %d (%.1f%%)", total_matched,
             100.0 * total_matched / stats["total"] if stats["total"] else 0)
    if not args.apply:
        log.info("  (DRY RUN - no changes written. Use --apply to write)")
    log.info("=" * 60)

    cursor.close()
    conn.close()


def main():
    """CLI entry point: parse command-line options and run the matcher."""
    parser = argparse.ArgumentParser(
        description="DPP Permit Match - Link SF permits to AEI jobs")
    parser.add_argument("--apply", action="store_true",
                        help="Apply matches to DB (default: dry run)")
    parser.add_argument("--limit", type=int, default=None,
                        help="Limit number of permits to process")
    parser.add_argument("--verbose", action="store_true", help="Debug logging")
    run_matching(parser.parse_args())


if __name__ == "__main__":
    main()
