
    i*                     \    d dl Z  G d d      Z G d d      Z G d d      Z G d d	      Zy)
    Nc                   $    e Zd ZdZddZd Zd Zy)RegexBuildera  Builds regex using arguments passed into a pattern template.

    Builds a regex object for which the pattern is made from an argument
    passed into a template. If more than one argument is passed (iterable),
    each pattern is joined by "|" (regex alternation 'or') to create a
    single pattern.

    Args:
        pattern_args (iterable): String element(s) to be each passed to
            ``pattern_func`` to create a regex pattern. Each element is
            ``re.escape``'d before being passed.
        pattern_func (callable): A 'template' function that should take a
            string and return a string. It should take an element of
            ``pattern_args`` and return a valid regex pattern group string.
        flags: ``re`` flag(s) to compile with the regex.

    Example:
        To create a simple regex that matches on the characters "a", "b",
        or "c", followed by a period::

            >>> rb = RegexBuilder('abc', lambda x: "{}\.".format(x))

        Looking at ``rb.regex`` we get the following compiled regex::

            >>> print(rb.regex)
            'a\.|b\.|c\.'

        The above is fairly simple, but this class can help in writing more
        complex repetitive regex, making them more readable and easier to
        create by using existing data structures.

    Example:
        To match the character following the words "lorem", "ipsum", "meili"
        or "koda"::

            >>> words = ['lorem', 'ipsum', 'meili', 'koda']
            >>> rb = RegexBuilder(words, lambda x: "(?<={}).".format(x))

        Looking at ``rb.regex`` we get the following compiled regex::

            >>> print(rb.regex)
            '(?<=lorem).|(?<=ipsum).|(?<=meili).|(?<=koda).'

    c                 X    || _         || _        || _        | j                         | _        y N)pattern_argspattern_funcflags_compileregex)selfr   r   r	   s       Q/var/www/auto_recruiter/arenv/lib/python3.12/site-packages/gtts/tokenizer/core.py__init__zRegexBuilder.__init__3   s(    ((
 ]]_
    c                     g }| j                   D ]9  }t        j                  |      }| j                  |      }|j	                  |       ; dj                  |      }t        j                  || j                        S )N|)r   reescaper   appendjoincompiler	   )r   altsargaltpatterns        r   r
   zRegexBuilder._compile;   si    $$ 	C))C.C##C(CKK	
 ((4.zz'4::..r   c                 ,    t        | j                        S r   )strr   r   s    r   __repr__zRegexBuilder.__repr__E   s    4::r   Nr   )__name__
__module____qualname____doc__r   r
   r    r   r   r   r      s    +Z%/r   r   c                   $    e Zd ZdZddZd Zd Zy)PreProcessorRegexa  Regex-based substitution text pre-processor.

    Runs a series of regex substitutions (``re.sub``) from each ``regex`` of a
    :class:`gtts.tokenizer.core.RegexBuilder` with an extra ``repl``
    replacement parameter.

    Args:
        search_args (iterable): String element(s) to be each passed to
            ``search_func`` to create a regex pattern. Each element is
            ``re.escape``'d before being passed.
        search_func (callable): A 'template' function that should take a
            string and return a string. It should take an element of
            ``search_args`` and return a valid regex search pattern string.
        repl (string): The common replacement passed to the ``sub`` method for
            each ``regex``. Can be a raw string (the case of a regex
            backreference, for example)
        flags: ``re`` flag(s) to compile with each `regex`.

    Example:
        Add "!" after the words "lorem" or "ipsum", while ignoring case::

            >>> import re
            >>> words = ['lorem', 'ipsum']
            >>> pp = PreProcessorRegex(words,
            ...                        lambda x: "({})".format(x), r'\\1!',
            ...                        re.IGNORECASE)

        In this case, the regex is a group and the replacement uses its
        backreference ``\\1`` (as a raw string). Looking at ``pp`` we get the
        following list of search/replacement pairs::

            >>> print(pp)
            (re.compile('(lorem)', re.IGNORECASE), repl='\1!'),
            (re.compile('(ipsum)', re.IGNORECASE), repl='\1!')

        It can then be run on any string of text::

            >>> pp.run("LOREM ipSuM")
            "LOREM! ipSuM!"

    See :mod:`gtts.tokenizer.pre_processors` for more examples.

    c                     || _         g | _        |D ]5  }t        |g||      }| j                  j                  |j                         7 y r   )replregexesr   r   r   )r   search_argssearch_funcr(   r	   r   rbs          r   r   zPreProcessorRegex.__init__v   sG    	  	*Cse[%8BLL)	*r   c                 `    | j                   D ]  }|j                  | j                  |      }  |S )zRun each regex substitution on ``text``.

        Args:
            text (string): the input text.

        Returns:
            string: text after all substitutions have been sequentially
            applied.

        )r)   subr(   )r   textr   s      r   runzPreProcessorRegex.run   s0     \\ 	.E99TYY-D	.r   c                     g }| j                   D ]-  }|j                  dj                  || j                               / dj	                  |      S )Nz({}, repl='{}'), )r)   r   formatr(   r   )r   	subs_strsrs      r   r   zPreProcessorRegex.__repr__   sK    	 	EA.55aCD	Eyy##r   Nr   r    r!   r"   r#   r   r0   r   r$   r   r   r&   r&   I   s    *X*$r   r&   c                   $    e Zd ZdZddZd Zd Zy)PreProcessorSuba@  Simple substitution text preprocessor.

    Performs string-for-string substitution from list a find/replace pairs.
    It abstracts :class:`gtts.tokenizer.core.PreProcessorRegex` with a default
    simple substitution regex.

    Args:
        sub_pairs (list): A list of tuples of the style
            ``(<search str>, <replace str>)``
        ignore_case (bool): Ignore case during search. Defaults to ``True``.

    Example:
        Replace all occurrences of "Mac" to "PC" and "Firefox" to "Chrome"::

            >>> sub_pairs = [('Mac', 'PC'), ('Firefox', 'Chrome')]
            >>> pp = PreProcessorSub(sub_pairs)

        Looking at the ``pp``, we get the following list of
        search (regex)/replacement pairs::

            >>> print(pp)
            (re.compile('Mac', re.IGNORECASE), repl='PC'),
            (re.compile('Firefox', re.IGNORECASE), repl='Chrome')

        It can then be run on any string of text::

            >>> pp.run("I use firefox on my mac")
            "I use Chrome on my PC"

    See :mod:`gtts.tokenizer.pre_processors` for more examples.

    c                     d }|rt         j                  nd}g | _        |D ]1  }|\  }}t        |g|||      }| j                  j	                  |       3 y )Nc                 $    dj                  |       S )Nz{})r3   )xs    r   r+   z-PreProcessorSub.__init__.<locals>.search_func   s    <<?"r   r   )r   Ipre_processorsr&   r   )	r   	sub_pairsignore_caser+   r	   sub_pairr   r(   pps	            r   r   zPreProcessorSub.__init__   s^    	# $ !! 	+H$MGT"G9k4GB&&r*	+r   c                 J    | j                   D ]  }|j                  |      } |S )zRun each substitution on ``text``.

        Args:
            text (string): the input text.

        Returns:
            string: text after all substitutions have been sequentially
            applied.

        )r=   r0   )r   r/   rA   s      r   r0   zPreProcessorSub.run   s+     %% 	 B66$<D	 r   c                 p    dj                  | j                  D cg c]  }t        |       c}      S c c}w )Nr2   )r   r=   r   )r   rA   s     r   r   zPreProcessorSub.__repr__   s)    yyD,?,?@b#b'@AA@s   3N)Tr6   r$   r   r   r8   r8      s    B+Br   r8   c                   @    e Zd ZdZej
                  fdZd Zd Zd Z	y)	Tokenizera
  An extensible but simple generic rule-based tokenizer.

    A generic and simple string tokenizer that takes a list of functions
    (called `tokenizer cases`) returning ``regex`` objects and joins them by
    "|" (regex alternation 'or') to create a single regex to use with the
    standard ``regex.split()`` function.

    ``regex_funcs`` is a list of any function that can return a ``regex``
    (from ``re.compile()``) object, such as a
    :class:`gtts.tokenizer.core.RegexBuilder` instance (and its ``regex``
    attribute).

    See the :mod:`gtts.tokenizer.tokenizer_cases` module for examples.

    Args:
        regex_funcs (list): List of compiled ``regex`` objects. Each
            function's pattern will be joined into a single pattern and
            compiled.
        flags: ``re`` flag(s) to compile with the final regex. Defaults to
            ``re.IGNORECASE``

    Note:
        When the ``regex`` objects obtained from ``regex_funcs`` are joined,
        their individual ``re`` flags are ignored in favour of ``flags``.

    Raises:
        TypeError: When an element of ``regex_funcs`` is not a function, or
            a function that does not return a compiled ``regex`` object.

    Warning:
        Joined ``regex`` patterns can easily interfere with one another in
        unexpected ways. It is recommended that each tokenizer case operate
        on distinct or non-overlapping characters/sets of characters
        (For example, a tokenizer case for the period (".") should also
        handle not matching/cutting on decimals, instead of making that
        a separate tokenizer case).

    Example:
        A tokenizer with a two simple case (*Note: these are bad cases to
        tokenize on, this is simply a usage example*)::

            >>> import re, RegexBuilder
            >>>
            >>> def case1():
            ...     return re.compile("\,")
            >>>
            >>> def case2():
            ...     return RegexBuilder('abc', lambda x: "{}\.".format(x)).regex
            >>>
            >>> t = Tokenizer([case1, case2])

        Looking at ``case1().pattern``, we get::

            >>> print(case1().pattern)
            '\\,'

        Looking at ``case2().pattern``, we get::

            >>> print(case2().pattern)
            'a\\.|b\\.|c\\.'

        Finally, looking at ``t``, we get them combined::

            >>> print(t)
            're.compile('\\,|a\\.|b\\.|c\\.', re.IGNORECASE)
             from: [<function case1 at 0x10bbcdd08>, <function case2 at 0x10b5c5e18>]'

        It can then be run on any string of text::

            >>> t.run("Hello, my name is Linda a. Call me Lin, b. I'm your friend")
            ['Hello', ' my name is Linda ', ' Call me Lin', ' ', " I'm your friend"]

    c                     || _         || _        	 | j                         | _        y # t        t
        f$ r}t	        dt        |      z         d }~ww xY w)Nz`Tokenizer() expects a list of functions returning regular expression objects (i.e. re.compile). )regex_funcsr	   _combine_regextotal_regex	TypeErrorAttributeErrorr   )r   rG   r	   es       r   r   zTokenizer.__init__"  s\    &
	#224D>* 	ACFq6J 	s   & AAAc                     g }| j                   D ]  }|j                   |               dj                  d |D              }t        j                  || j
                        S )Nr   c              3   4   K   | ]  }|j                     y wr   )r   ).0r   s     r   	<genexpr>z+Tokenizer._combine_regex.<locals>.<genexpr>4  s     733;;7s   )rG   r   r   r   r   r	   )r   r   funcr   s       r   rH   zTokenizer._combine_regex/  sU    $$ 	 DKK	  ((7$77zz'4::..r   c                 8    | j                   j                  |      S )zTokenize `text`.

        Args:
            text (string): the input text to tokenize.

        Returns:
            list: A list of strings (token) split according to the tokenizer cases.

        )rI   split)r   r/   s     r   r0   zTokenizer.run7  s     %%d++r   c                 ^    t        | j                        dz   t        | j                        z   S )Nz from: )r   rI   rG   r   s    r   r   zTokenizer.__repr__C  s(    4##$y03t7G7G3HHHr   N)
r    r!   r"   r#   r   
IGNORECASEr   rH   r0   r   r$   r   r   rE   rE      s'    HT +--- /
,Ir   rE   )r   r   r&   r8   rE   r$   r   r   <module>rV      s=    	A AHI$ I$X?B ?BDmI mIr   