
    g3fi                        d Z ddlZddlZddlmZ ddlmZmZ  ej                  e	      Z
dZdZddj                  eD  cg c]  }  ej                  |       d	z    c}       z   d
z   Zddj                  eD  cg c]  }  ej                  |        c}       z   d
z   Zde de dZdddedeej&                  z  dz  dee   fdZdddddddedededz  deej&                  z  dz  dedee   dedee   fdZyc c} w c c} w )z Utilities for working with HTML.    N)Sequence)urljoinurlparse)zjavascript:zmailto:#)z.cssz.jsz.icoz.pngz.jpgz.jpegz.gifz.svgz.csvz.bz2z.zipz.epubz.webpz.pdfz.docxz.xlsxz.pptxz.pptmz(?!|z[\#'\"])z
href=[\"']z((?:z.)*?)[\#'\"]patternraw_htmlr
   returnc                f    |xs t         }t        t        t        j                  ||                   S )zExtract all links from a raw HTML string.

    Args:
        raw_html: original HTML.
        pattern: Regex to use for extracting links from raw HTML.

    Returns:
        A list of all links found in the HTML.
    )DEFAULT_LINK_REGEXlistsetrefindall)r   r
   s     W/var/www/auto_recruiter/arenv/lib/python3.12/site-packages/langchain_core/utils/html.pyfind_all_linksr   *   s*     ++GBJJw1233    T F)base_urlr
   prevent_outsideexclude_prefixescontinue_on_failureurlr   r   r   r   c                   ||n|}t        |      }t        |      }	t        | |      }
t               }|
D ]  }	 t        |      }|j                  dv r|}nW|j	                  d      r|	j                   d| }n4t        ||j                        }|j                  r|d|j                   z  }|j                  |        g }|D ]a  t        fd|D              r|r7t              }|j                  |j                  k7  r?j	                  |      sQ|j                         c |S # t        $ r%}|rt        j                  d||       Y d}~ d}~ww xY w)	a  Extract all links from a raw HTML string and convert into absolute paths.

    Args:
        raw_html: original HTML.
        url: the url of the HTML.
        base_url: the base URL to check for outside links against.
        pattern: Regex to use for extracting links from raw HTML.
        prevent_outside: If `True`, ignore external links which are not children
            of the base URL.
        exclude_prefixes: Exclude any URLs that start with one of these prefixes.
        continue_on_failure: If `True`, continue if parsing a specific link raises an
            exception. Otherwise, raise the exception.

    Returns:
        A list of absolute paths to sub links.
    Nr	   >   httphttpsz//:?z-Unable to load link %s. Raised exception:

%sc              3   @   K   | ]  }j                  |        y w)N)
startswith).0exclude_prefixpaths     r   	<genexpr>z$extract_sub_links.<locals>.<genexpr>q   s     V>t~.Vs   )r   r   r   schemer"   r   r%   queryadd	Exceptionloggerwarninganynetlocappend)r   r   r   r
   r   r   r   base_url_to_useparsed_base_url
parsed_url	all_linksabsolute_pathslinkparsed_linkabsolute_patheresultsparsed_pathr%   s                     @r   extract_sub_linksr;   :   sk   4 #+"6hCO/O#Jx9IUN 	"4.K!!%66 $&#-#4#4"5Qtf = '[-=-= >$$!q):):(;%<<M}-, G VEUVV"4.K%%););; ???3t  N3  	"EtQ 	s   BD++	E4EEE)__doc__loggingr   collections.abcr   urllib.parser   r   	getLogger__name__r+   PREFIXES_TO_IGNORESUFFIXES_TO_IGNOREjoinescapeSUFFIXES_TO_IGNORE_REGEXPREFIXES_TO_IGNORE_REGEXr   strPatternr   r   boolr;   )ss   0r   <module>rL      s   &  	 $ *			8	$4  * 
CHH9KLAibiilZ/LMMPSS  
CHH,>?qibiil?@@3F  *+40H/IV  :>44"RZZ/$64	#Y4(  '+ &( %FF	F Dj	F
 2::$F F smF F 
#YF5 M @s   C:
7C?
