
    3fiH                         d Z ddlZddlmZmZmZmZ ddlmZ ddl	m
Z
 ddlmZ  ej                  e      Z G d de
      Zy)	zLLoader that uses bs4 to load HTML files, enriching metadata with page title.    N)AnyDictIteratorUnion)Document)BaseBlobParser)Blobc            	       F    e Zd ZdZddddedededd	fd
Zdedee	   fdZ
y	)BS4HTMLParserz(Parse HTML files using `Beautiful Soup`.lxml )featuresget_text_separatorr   r   kwargsreturnNc                b    	 ddl }d|i|| _        || _        y# t        $ r t        d      w xY w)z#Initialize a bs4 based HTML parser.r   NzUbeautifulsoup4 package not found, please install it with `pip install beautifulsoup4`r   )bs4ImportError	bs_kwargsr   )selfr   r   r   r   s        s/var/www/auto_recruiter/arenv/lib/python3.12/site-packages/langchain_community/document_loaders/parsers/html/bs4.py__init__zBS4HTMLParser.__init__   sH    	 %h9&9"4  	/ 	s    .blobc              #   V  K   ddl m} |j                         5 } ||fi | j                  }ddd       j	                  | j
                        }|j                  r t        |j                  j                        }nd}|j                  |d}t        ||       y# 1 sw Y   qxY ww)z)Load HTML document into document objects.r   )BeautifulSoupNr   )sourcetitle)page_contentmetadata)r   r   as_bytes_ior   get_textr   r   strstringr   r   )r   r   r   fsouptextr   r   s           r   
lazy_parsezBS4HTMLParser.lazy_parse$   s     % 	61 5dnn5D	6 }}T445::

))*EE kk1
 D8<<	6 	6s   B)BA0B)B&"B))__name__
__module____qualname____doc__r"   r   r   r	   r   r   r'        r   r   r      sR    2
 "$	5 5  	5
 5 
5&=t =(: =r-   r   )r+   loggingtypingr   r   r   r   langchain_core.documentsr   )langchain_community.document_loaders.baser   1langchain_community.document_loaders.blob_loadersr	   	getLoggerr(   loggerr   r,   r-   r   <module>r5      s8    R  - - - D B			8	$(=N (=r-   