"""
DiploAI Knowledge Graph - WordPress Data Extractor
Unified Post class for extracting data from both diplomacy.edu and dig.watch MySQL databases.
"""

import mysql.connector
import pandas as pd
from bs4 import BeautifulSoup

from config import MYSQL_USER, MYSQL_PASS, WPCF_META_KEYS


class WPExtractor:
    """Extracts posts, taxonomies, meta, and relationships from a WordPress MySQL database."""

    def __init__(self, host: str, database: str, site_prefix: str,
                 include_post_types: list[str]):
        self.host = host
        self.database = database
        self.site_prefix = site_prefix
        self.include_post_types = include_post_types

        self.all_posts: pd.DataFrame | None = None
        self.all_posts_taxonomies: pd.DataFrame | None = None

    # ------------------------------------------------------------------
    # Core SQL helper
    # ------------------------------------------------------------------

    def _connect(self):
        return mysql.connector.connect(
            host=self.host, database=self.database,
            user=MYSQL_USER, password=MYSQL_PASS,
        )

    def query(self, sql: str) -> pd.DataFrame:
        conn = self._connect()
        cursor = conn.cursor()
        cursor.execute(sql)
        columns = [col[0] for col in cursor.description]
        rows = cursor.fetchall()
        conn.close()
        return pd.DataFrame(rows, columns=columns)

    # ------------------------------------------------------------------
    # Bulk extractions
    # ------------------------------------------------------------------

    def get_all_posts(self) -> pd.DataFrame:
        """Fetch all published posts with wpcf-* meta aggregated into additional_text."""
        q = ("SELECT p.ID, p.post_title, p.post_name, p.post_content, "
             "p.post_type, p.post_date_gmt, p.post_modified_gmt, p.guid, "
             "pm.meta_id, pm.meta_key, pm.meta_value "
             "FROM wp_posts p "
             "INNER JOIN wp_postmeta pm ON p.ID = pm.post_id "
             "WHERE p.post_status = 'publish' AND pm.meta_key LIKE 'wpcf-%%'")

        if self.include_post_types:
            types_str = ', '.join(f"'{t}'" for t in self.include_post_types)
            q += f" AND p.post_type IN ({types_str})"

        raw = self.query(q)

        records = []
        for _, group in raw.groupby('ID'):
            post = group.iloc[0, :-3].to_dict()
            long_meta = group.loc[group['meta_value'].str.len() > 10, 'meta_value'].tolist()
            post['additional_text'] = '\n'.join(
                BeautifulSoup(m, features="lxml").text for m in long_meta
            )
            records.append(post)

        self.all_posts = pd.DataFrame(records)
        print(f"[{self.site_prefix}] Extracted {len(self.all_posts)} posts.")
        return self.all_posts

    def get_all_posts_taxonomies(self) -> pd.DataFrame:
        """Fetch taxonomy assignments for all extracted posts."""
        ids = ', '.join(str(i) for i in self.all_posts['ID'].tolist())
        q = f"""
            SELECT tr.object_id AS post_id, t.term_id, t.name, t.slug,
                   t.term_group, tt.taxonomy, tt.description, tt.parent,
                   tt.count, t2.name AS parent_name
            FROM wp_terms AS t
            INNER JOIN wp_term_taxonomy AS tt ON tt.term_id = t.term_id
            INNER JOIN wp_term_relationships AS tr ON tr.term_taxonomy_id = tt.term_taxonomy_id
            LEFT JOIN wp_terms AS t2 ON t2.term_id = tt.parent
            WHERE tr.object_id IN ({ids})
            ORDER BY tr.object_id ASC"""
        self.all_posts_taxonomies = self.query(q)
        print(f"[{self.site_prefix}] Extracted {len(self.all_posts_taxonomies)} taxonomy records.")
        return self.all_posts_taxonomies

    # ------------------------------------------------------------------
    # Topic extraction
    # ------------------------------------------------------------------

    def get_topics(self) -> pd.DataFrame:
        """Fetch all terms in the 'topics' taxonomy."""
        q = """
            SELECT t.term_id, taxs.description, taxs.parent, t.name, t.slug
            FROM wp_term_taxonomy AS taxs
            LEFT JOIN wp_terms AS t ON taxs.term_taxonomy_id = t.term_id
            WHERE taxs.taxonomy = 'topics'"""
        df = self.query(q)
        df.dropna(inplace=True)
        df['term_id'] = df['term_id'].astype(int)

        df['additional_text'] = df['term_id'].apply(self._get_term_meta_text)
        df['text'] = df['description'].apply(lambda x: BeautifulSoup(x, "lxml").text)
        df['text'] += df['additional_text']
        return df

    def _get_term_meta_text(self, term_id: int) -> str:
        q = f"""
            SELECT tm.meta_value FROM wp_termmeta AS tm
            WHERE tm.term_id = {term_id}
              AND tm.meta_key LIKE 'wpcf-%%' AND tm.meta_value != ''"""
        df = self.query(q)
        if df.empty:
            return ''
        return BeautifulSoup('\n '.join(df['meta_value'].tolist()), "lxml").text

    # ------------------------------------------------------------------
    # Tag extraction
    # ------------------------------------------------------------------

    def get_tags(self) -> pd.DataFrame:
        """Fetch all terms in the 'post_tag' taxonomy."""
        q = """
            SELECT t.term_id, t.name, t.slug
            FROM wp_term_taxonomy AS tt
            JOIN wp_terms AS t ON tt.term_id = t.term_id
            WHERE tt.taxonomy = 'post_tag'"""
        return self.query(q)

    def get_post_tag_links(self, post_ids: list[int]) -> pd.DataFrame:
        """Fetch post -> tag assignments for the given post IDs."""
        ids_str = ', '.join(str(i) for i in post_ids)
        q = f"""
            SELECT tr.object_id AS post_id, t.term_id, t.name AS tag_name
            FROM wp_term_relationships AS tr
            JOIN wp_term_taxonomy AS tt ON tr.term_taxonomy_id = tt.term_taxonomy_id
            JOIN wp_terms AS t ON tt.term_id = t.term_id
            WHERE tt.taxonomy = 'post_tag'
              AND tr.object_id IN ({ids_str})"""
        return self.query(q)

    # ------------------------------------------------------------------
    # Wpcf-* meta extraction (dedicated query with specific keys)
    # ------------------------------------------------------------------

    def get_post_meta(self, post_ids: list[int]) -> pd.DataFrame:
        """Fetch specific wpcf-* meta fields and pivot them per post."""
        ids_str = ', '.join(str(i) for i in post_ids)
        keys_str = ', '.join(f"'{k}'" for k in WPCF_META_KEYS)
        q = f"""
            SELECT post_id, meta_key, meta_value
            FROM wp_postmeta
            WHERE post_id IN ({ids_str}) AND meta_key IN ({keys_str})"""
        raw = self.query(q)
        raw = raw[raw['meta_value'] != '']
        if raw.empty:
            return pd.DataFrame(columns=['post_id', 'new_meta'])
        grouped = raw.groupby('post_id').apply(
            lambda g: [{k: v} for k, v in zip(g['meta_key'], g['meta_value'])]
        )
        df = pd.DataFrame(grouped, columns=['new_meta'])
        df.reset_index(inplace=True)
        return df

    # ------------------------------------------------------------------
    # Toolset relationships
    # ------------------------------------------------------------------

    def _toolset_query(self, where_clause: str) -> pd.DataFrame:
        q = f"""
            SELECT relationships.display_name_singular AS relationship_name,
                   associations.parent_id AS parent_toolset_id,
                   connected_elements_par.element_id AS parent_wp_id,
                   wp_posts_post_par.post_title AS parent_title,
                   wp_posts_post_par.post_type AS parent_type,
                   associations.child_id AS child_toolset_id,
                   connected_elements_child.element_id AS child_wp_id,
                   wp_post_child.post_title AS child_title,
                   wp_post_child.post_type AS child_type
            FROM wp_toolset_associations AS associations
            LEFT JOIN wp_toolset_relationships AS relationships
                ON associations.relationship_id = relationships.id
            LEFT JOIN wp_toolset_connected_elements AS connected_elements_par
                ON associations.parent_id = connected_elements_par.group_id
            LEFT JOIN wp_posts AS wp_posts_post_par
                ON connected_elements_par.element_id = wp_posts_post_par.ID
            LEFT JOIN wp_toolset_connected_elements AS connected_elements_child
                ON associations.child_id = connected_elements_child.group_id
            LEFT JOIN wp_posts AS wp_post_child
                ON connected_elements_child.element_id = wp_post_child.ID
            WHERE {where_clause}"""
        return self.query(q)

    def get_toolset_relations(self, wp_ids: list[int]) -> tuple[pd.DataFrame, pd.DataFrame]:
        """Fetch Toolset relationships where posts appear as child or parent."""
        ids_str = ', '.join(str(i) for i in wp_ids)
        df_child = self._toolset_query(f"connected_elements_child.element_id IN ({ids_str})")
        df_parent = self._toolset_query(f"connected_elements_par.element_id IN ({ids_str})")
        return df_child, df_parent

    # ------------------------------------------------------------------
    # Post-to-topic taxonomy links
    # ------------------------------------------------------------------

    def get_post_topic_links(self, post_type: str) -> pd.DataFrame:
        """For a given post_type, find which topics are assigned."""
        q = f"""
            SELECT posts.ID, posts.post_title,
                   terms.name AS taxonomy_name,
                   taxonomy.taxonomy AS taxonomy_type
            FROM wp_posts AS posts
            INNER JOIN wp_term_relationships AS relationships
                ON posts.ID = relationships.object_id
            INNER JOIN wp_term_taxonomy AS taxonomy
                ON relationships.term_taxonomy_id = taxonomy.term_taxonomy_id
            INNER JOIN wp_terms AS terms
                ON terms.term_id = taxonomy.term_id
            WHERE posts.post_type = '{post_type}'
              AND taxonomy.taxonomy = 'topics'
            ORDER BY posts.ID ASC"""
        return self.query(q)

    # ------------------------------------------------------------------
    # Discovery helpers (for unknown DW content types)
    # ------------------------------------------------------------------

    def discover_post_types(self) -> pd.DataFrame:
        """List all post_types with counts (published posts only)."""
        q = """
            SELECT post_type, COUNT(*) AS cnt
            FROM wp_posts WHERE post_status = 'publish'
            GROUP BY post_type ORDER BY cnt DESC"""
        return self.query(q)

    def discover_taxonomies(self) -> pd.DataFrame:
        """List all taxonomy types with counts."""
        q = """
            SELECT taxonomy, COUNT(*) AS cnt
            FROM wp_term_taxonomy
            GROUP BY taxonomy ORDER BY cnt DESC"""
        return self.query(q)

    def discover_specific_taxonomies(self, patterns: list[str]) -> pd.DataFrame:
        """Search for taxonomy names matching LIKE patterns."""
        where = ' OR '.join(f"taxonomy LIKE '%{p}%'" for p in patterns)
        q = f"SELECT DISTINCT taxonomy FROM wp_term_taxonomy WHERE {where}"
        return self.query(q)
