"""
DiploAI Knowledge Graph - dig.watch Pipeline (Phase 2)

Usage:
    python run_dw.py                   # Full pipeline
    python run_dw.py --csv-only        # Only extract and export CSV
    python run_dw.py --neo4j-only      # Only load from existing CSV into Neo4j (not working yet: main() has no CSV reload step)
    python run_dw.py --discover        # Discover post_types and taxonomies in DW database
"""

import sys
from datetime import datetime

import pandas as pd

from config import DW_CONFIG
from wp_extractor import WPExtractor
from node_builder import build_all_nodes
from link_builder import build_all_links
from neo4j_loader import get_driver, load_all_nodes, load_relationships, ensure_indexes, clear_database


def discover(extractor: WPExtractor):
    """Print a three-part discovery report for the database behind *extractor*.

    Sections are written to stdout in this order: all post types, all
    taxonomies, and the taxonomies matching a fixed list of keyword
    fragments. Each extractor result is rendered with ``to_string`` (row
    index hidden); the keyword section prints a fallback message when the
    result is empty.
    """
    keyword_fragments = ['trend', 'newsletter', 'process', 'approach', 'technolog', 'value']

    # Each header is printed before its query runs, so anything the
    # extractor itself prints lands under the matching heading.
    print("\n=== POST TYPES ===")
    post_types = extractor.discover_post_types()
    print(post_types.to_string(index=False))

    print("\n=== TAXONOMIES ===")
    taxonomies = extractor.discover_taxonomies()
    print(taxonomies.to_string(index=False))

    print("\n=== SEARCHING: trend, newsletter, process, approach, technolog, value ===")
    matches = extractor.discover_specific_taxonomies(keyword_fragments)
    if matches.empty:
        print("No matching taxonomies found.")
    else:
        print(matches.to_string(index=False))


def main():
    """Run the dig.watch knowledge-graph pipeline in the requested mode.

    The mode is the first command-line argument and defaults to ``--full``:

    * ``--discover``   -- call ``discover()`` on the extractor and return.
    * ``--csv-only``   -- extract posts, build nodes and links, and export
      ``dw_nodes_<ddmmyyyy>.csv`` / ``dw_links_<ddmmyyyy>.csv``.
    * ``--full``       -- everything ``--csv-only`` does, then clear the
      configured Neo4j database and load the freshly built nodes and
      relationships into it.
    * ``--neo4j-only`` -- rejected with an error; see the note in the body.

    Raises:
        SystemExit: if the mode is not recognised, or is ``--neo4j-only``.
    """
    valid_modes = ('--full', '--csv-only', '--neo4j-only', '--discover')
    mode = sys.argv[1] if len(sys.argv) > 1 else '--full'

    # Validate before building the extractor so a typo fails immediately
    # instead of silently doing nothing.
    if mode not in valid_modes:
        sys.exit(f"Unknown option {mode!r}. Expected one of: {', '.join(valid_modes)}")

    if mode == '--neo4j-only':
        # The load step needs `nodes_dict` and `df_links`, which only exist
        # after the extraction step of this same run. Previously this mode
        # crashed with an unbound-variable error after the Neo4j driver had
        # been opened and indexes touched. The exported node CSV is a flat
        # concatenation of the nodes_dict values (keys are not written), so
        # the dict cannot be rebuilt here without knowing the node schema.
        # TODO: implement the CSV reload (read dw_links_*.csv / dw_nodes_*.csv
        # and regroup nodes into the dict shape load_all_nodes expects).
        sys.exit(
            "--neo4j-only is not available: reloading nodes/links from the "
            "exported CSV files is not implemented. Run without arguments "
            "for the full pipeline."
        )

    cfg = DW_CONFIG
    today = datetime.now().strftime('%d%m%Y')

    extractor = WPExtractor(
        host=cfg['mysql_host'],
        database=cfg['mysql_db'],
        site_prefix=cfg['site_prefix'],
        include_post_types=cfg['include_post_types'],
    )

    if mode == '--discover':
        discover(extractor)
        return

    # From here on the mode is '--full' or '--csv-only': both extract and export.
    print("=" * 60)
    print(f"DIG.WATCH KG PIPELINE  ({today})")
    print("=" * 60)

    print("\n--- Extracting posts ---")
    extractor.get_all_posts()
    extractor.get_all_posts_taxonomies()

    print("\n--- Building nodes ---")
    nodes_dict, df_topics_raw = build_all_nodes(extractor)

    print("\n--- Building links ---")
    df_links = build_all_links(extractor, nodes_dict, df_topics_raw)

    # Empty frames are skipped when flattening the per-key node frames
    # into the single export table.
    all_nodes = pd.concat([df for df in nodes_dict.values() if not df.empty], ignore_index=True)
    all_nodes.to_csv(f'dw_nodes_{today}.csv', index=False)
    df_links.to_csv(f'dw_links_{today}.csv', index=False)
    print(f"\nExported: dw_nodes_{today}.csv ({len(all_nodes)} nodes)")
    print(f"Exported: dw_links_{today}.csv ({len(df_links)} links)")

    if mode == '--csv-only':
        return

    print(f"\n--- Loading into Neo4j ({cfg['neo4j_database']}) ---")
    driver = get_driver()
    try:
        # A full run replaces the graph: wipe first, then rebuild.
        clear_database(driver, cfg['neo4j_database'])
        ensure_indexes(driver, cfg['neo4j_database'])
        load_all_nodes(driver, cfg['neo4j_database'], nodes_dict)
        load_relationships(driver, cfg['neo4j_database'], df_links)
    finally:
        # Release the driver even when a load step raises.
        driver.close()
    print("\nDone.")


# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
