
    *i                     n   d Z ddlZddlZddlmZ ddlmZ ddlm	Z	 de
de
fdZd	e
de
fd
Zde	dej                  fdZde	deej                  ej                  f   fdZde	dej                  fdZdej                  de
dej                  fdZde	deee
ej                  f   ej                  f   fdZy)aD  
DiploAI Knowledge Graph - Node Builder (Phase 2b)

Dual label model:
  - ALL WP posts -> :Document + specific label, with document_hash = md5(link)
  - Topics from taxonomy -> :Document:Topic (+ :TopicBasket if parent)
  - Tags -> :Tag only (no Document label, no hash)
  - Dates -> :Date only (no Document label, no hash)
    N)BeautifulSoup)POST_TYPE_LABELS)WPExtractortextreturnc                 d    t        j                  | j                               j                         S N)hashlibmd5encode	hexdigest)r   s    node_builder.py_md5r      s     ;;t{{}%//11    htmlc                 4    | syt        | d      j                  S )N lxml)r   r   )r   s    r   _clean_textr      s    v&+++r   	extractorc                    | j                   }| j                  }d|v rd| nd| }g }|j                         D ]  \  }}|d   }t        j                  ||j                               }| d| d|d    d}	|j                  | d| d|d    d	|gt        |	      |d
   ||t        |d         |d   |	t        |d         t        |j	                  dd            d        t        j                  |      }
|
j                  sO|
d   j                  d       j                         }t        d| dt!        |
       d|j#                          d       |
S )z
    Build nodes for ALL WP posts. Each gets:
      - labels: ['Document', specific_label] (e.g. ['Document', 'Blog'])
      - document_hash: md5(link_url)
    	diplomacyhttps://www.https://	post_type/	post_name_IDDocument
post_titlepost_contentpost_date_gmtr   )node_idlabelsdocument_hashnamer   sitewp_idslugurlr   dater%   c                     | d   S )N    )xs    r   <lambda>z"build_post_nodes.<locals>.<lambda>B   s    AaDr   [z] Post nodes: z  ())	all_postssite_prefixiterrowsr   get
capitalizeappendr   strr   pd	DataFrameemptyapplyvalue_countsprintlento_dict)r   postsprefixsite_urlrecordsr   rowptspecific_labellinkdflabel_countss               r   build_post_nodesrM   "   sq    E""F*5*?fX&xPVxEXHG.."3)--b"--/B1RD#k"2!315 2$aD	{3!>2!$Z%T^$N 3445
 	 #& 
g	B88(|)).9FFH&ByL4H4H4J3K1MNIr   c                    | j                   }d|v rd| nd| }| j                         }t        ||d   dk7     d   j                  t              j                               }g }|j                         D ]  \  }}t	        |d         }||v }	| d|d    d	}
d
dg}|	r|j                  d       |j                  | d| |t        |
      |d   d|t        |      |d   |
t        |j                  dd            d
        t        j                  |      }t        d |D              }t        d| dt!        |       d| dt!        |      |z
   d	       ||fS )z
    Build :Document:Topic nodes from taxonomy.
    Parent topics also get :TopicBasket label.
    Returns (df_topic_nodes, df_topics_raw).
    r   r   r   parentr   term_idz/topics/r*   r   r    TopicTopicBasket_topic_r'   topicr   r   )
r$   r%   r&   r'   r   r(   r)   r*   r+   r   c              3   0   K   | ]  }d |d   v sd  yw)rR   r%   r.   Nr/   ).0rs     r   	<genexpr>z$build_topic_nodes.<locals>.<genexpr>o   s     EW8(D!Ws   r2   z] Topic nodes: z  (TopicBasket=z, Topic=r3   )r5   
get_topicssetastypeinttolistr6   r9   r   r:   r   r7   r;   r<   sumr@   rA   )r   rD   rE   	df_topics
parent_idsrF   r   rG   tid	is_basketrJ   r%   rK   basketss                 r   build_topic_nodesrd   K   s    ""F*5*?fX&xPVxEXH$$&IYy2a78BII#NUUWXJG$$&3#i.!:%	8CK=2g&MM-( .!$ZK XK 34
 	 ', 
g	BEWEEG	AfX_SWI_WIXcRTgV]oM^^_
`ay=r   c           
      t   | j                   }| j                         }|j                  rt        j                         S g }|j                         D ]7  \  }}|j                  | d|d    dg|d   t        |d         |d   d       9 t        j                  |      }t        d| dt        |              |S )	N_tag_rP   Tagr'   r*   )r$   r%   r'   r)   r*   r2   z] Tag nodes: )
r5   get_tagsr=   r;   r<   r6   r9   r:   r@   rA   )r   rD   tagsrF   r   rG   rK   s          r   build_tag_nodesrj   x   s    ""FDzz||~G--/3 s9~&67gKY(K
 	 " 
g	B	AfX]3r7)
,-Ir   df_postsr5   c           
      z   | j                   sd| j                  vrt        j                         S t        j                  | d   d      j                         }|j                   rt        j                         S |j                  j                  d      j                         }g }t        |      D ]\  }|j                         }|j                  | d| dg|j                  d      t        |      |j                  |j                  d       ^ t        j                  |      }t!        d	| d
t#        |              |S )Nr,   coerce)errorsM_date_Datez%B %Y)r$   r%   r'   r)   yearmonthr2   z] Date nodes: )r=   columnsr;   r<   to_datetimedropnadt	to_perioduniquesortedto_timestampr9   strftimer:   rr   rs   r@   rA   )rk   r5   datesyear_monthsrF   ymrw   rK   s           r   build_date_nodesr      s   ~~x'7'77||~NN8F+H=DDFE{{||~(($$S)002KG[!__%fRD1hKK(WGGXX
 	 " 
g	B	Ak].R	
23Ir   c                    t        |       }t        |       \  }}t        |       }t        || j                        }||||d}t        d |j                         D              }t        d| j                   d|        ||fS )zq
    Returns:
      - nodes_dict: {'posts': df, 'topics': df, 'tags': df, 'dates': df}
      - df_topics_raw
    )rC   topicsri   r}   c              3   2   K   | ]  }t        |        y wr	   )rA   )rV   rK   s     r   rX   z"build_all_nodes.<locals>.<genexpr>   s     6"5BB"5s   r2   z] TOTAL nodes: )rM   rd   rj   r   r5   r^   valuesr@   )r   
post_nodestopic_nodesdf_topics_raw	tag_nodes
date_nodes
nodes_dicttotals           r   build_all_nodesr      s     "),J!29!=K	*I!*i.C.CDJ 	J 6*"3"3"566E	Ai##$OE7
;<}$$r   )__doc__r
   pandasr;   bs4r   configr   wp_extractorr   r:   r   r   r<   rM   tuplerd   rj   r   dictr   r/   r   r   <module>r      s       # $2s 2s 2,c ,c ," " "R& &r||R\\7Q1R &Z{ r|| 2r|| # ",, >%{ %uT#r||:K5Lbll5Z/[ %r   