
    3fiK                        d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	m
Z
mZmZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ erdd
lmZ ddlmZ ddlmZ  ej:                  e      Ze G d d             Z  G d de      Z!y)zModule contains a PDF parser based on Document AI from Google Cloud.

You need to install two libraries to use this parser:
pip install google-cloud-documentai
pip install google-cloud-documentai-toolbox
    N)	dataclass)TYPE_CHECKINGAnyIteratorListOptionalSequence)BaseBlobParser)Blob)Document)batch_iterate)get_client_info)	OperationDocumentProcessorServiceClient)ProcessOptionsc                   &    e Zd ZU dZeed<   eed<   y)DocAIParsingResultsz1A dataclass to store Document AI parsing results.source_pathparsed_pathN)__name__
__module____qualname____doc__str__annotations__     ^/var/www/auto_recruiter/arenv/lib/python3.12/site-packages/langchain_google_community/docai.pyr   r      s    ;r   r   c                      e Zd ZdZddddddded   dee   dee   dee   d	ee   f
d
Zdedee	   fdZ
	 	 	 	 d'dee   deee      dee   dee   ddf
dZ	 d(dedee   dedee	   fdZ	 	 	 d)dee   dee   dedededee	   fdZdee   dee	   fdZdee   ded   fdZd ed   defd!Zddd"dd#dee   dee   d	ee   d$edee   deded   fd%Zd ed   dee   fd&Zy)*DocAIParsera  Google Cloud Document AI parser.

    Inherits from [`BaseBlobParser`][langchain_core.document_loaders.BaseBlobParser].

    Parses documents using Google Cloud Document AI for text extraction and
    layout analysis.

    See [Document AI documentation](https://cloud.google.com/document-ai/docs/overview)
    for detailed information.

    !!! note "Installation"

        Requires additional dependencies:

        ```bash
        pip install langchain-google-community[docai]
        ```
    N)client
project_idlocationgcs_output_pathprocessor_namer"   r   r#   r$   r%   r&   c                   t        |      t        |      k(  rt        d      d}|r%t        j                  ||      st        d| d      || _        || _        |r|| _        y		 ddlm} ddl	m
}  ||| d
      }
 ||
t        d            | _        | j                  j                  |      j                  | _        | j                  dk(  rd| _        y	d| _        y	# t        $ r}	t        d      |	d	}	~	ww xY w)a  Initializes the parser.

        Args:
            client: A `DocumentProcessorServiceClient` to use
            location: A Google Cloud location where a Document AI processor is located
            gcs_output_path: A path on Google Cloud Storage to store parsing results
            processor_name: Full resource name of a Document AI processor or processor
                version

        You should provide either a client or location (and then a client would be
        instantiated).
        zGYou must specify either a client or a location to instantiate a client.z?projects\/[0-9]+\/locations\/[a-z\-0-9]+\/processors\/[a-z0-9]+zProcessor name z has the wrong format. If your prediction endpoint looks like https://us-documentai.googleapis.com/v1/projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID:process, use only projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID part.r   )ClientOptionsr   Could not import google-cloud-documentai python package. Please, install docai dependency group: `pip install langchain-google-community[docai]`Nz-documentai.googleapis.com)quota_project_idapi_endpointzdocument-ai)module)client_optionsclient_infonameLAYOUT_PARSER_PROCESSORTF)bool
ValueErrorre	fullmatch_gcs_output_path_processor_name_clientgoogle.api_core.client_optionsr(   google.cloud.documentair   ImportErrorr   get_processortype_processor_type_use_layout_parser)selfr"   r#   r$   r%   r&   patternr(   r   excoptionss              r   __init__zDocAIParser.__init__<   s"   , <4>) 
 U",,w"G!.!1 2   !0-!DLHR $!+ (z)CDG :&+=ADL
 $(<<#=#=>#=#R#W#WD ##'@@*.'*/''  !F 	s   %C! !	C;*C66C;blobreturnc              #   \   K   | j                  |g| j                        E d{    y7 w)zParses a blob lazily.

        Args:
            blob: a `Blob` to parse

        This is a long-running operation. A recommended way is to batch documents
        together and use the `batch_parse` method.
        )r%   N)batch_parser6   )r@   rE   s     r   
lazy_parsezDocAIParser.lazy_parse   s)      ##TFD<Q<Q#RRRs   ",*,enable_native_pdf_parsing
page_range
chunk_sizeinclude_ancestor_headingsr   c                 h   	 ddl m}m} | j                  rQ |j
                  |j
                  j                  ||            }|r |j                  |      nd}	 |||	      }
|
S |r	 ||	      nd}|r |j                  |      nd}	 |||	
      }
|
S # t        $ r}t        d      |d}~ww xY w)a=  Prepare process options for DocAI process request

        Args:
            enable_native_pdf_parsing: Enable PDF embedded text extraction
            page_range: list of page numbers to parse. If `None`, entire document will
                be parsed.
            chunk_size: Maximum number of characters per chunk (supported only with
                Document AI Layout Parser processor).
            include_ancestor_headings: Whether or not to include ancestor headings when
                splitting (supported only with Document AI Layout Parser processor).
        r   )	OcrConfigr   ddocumentai package not found, please install it with `pip install langchain-google-community[docai]`N)rL   rM   )chunking_config)pages)layout_configindividual_page_selector)rJ   )
ocr_configrT   ) google.cloud.documentai_v1.typesrO   r   r;   r?   LayoutConfigChunkingConfigIndividualPageSelector)r@   rJ   rK   rL   rM   rO   r   rB   rS   rT   process_optionsrU   s               r   _prepare_process_optionsz$DocAIParser._prepare_process_options   s    $	R ""7N77 . ; ; J J).G !K !M  655JG %
 -+)AO&  - 4MN   655JG %
 -%@XO I  	B 	s   B 	B1 B,,B1
field_maskprocess_options_kwargsc           
   +   `  K   	 ddl m} 	 ddlm  | j
                  di |}| j                  j                  |j                  | j                  |j                  j                  j                  xs d      |d	|
            | j                  r2fdj                  j                  j                   D        E d{    yfdj                  j"                  D        E d{    y# t        $ r}t        d      |d}~ww xY w# t        $ r}t        d      |d}~ww xY w7 i7 Aw)a  Parses a `Blob` lazily using online processing.

        Args:
            blob: `Blob` to parse.
            field_mask: Comma-separated list of which fields to include in the
                Document AI response. suggested: `'text,pages.pageNumber,pages.layout'`
            process_options_kwargs: Optional parameters to pass to the Document AI
                processors
        r   
documentair)   N_text_from_layoutldocumentai_toolbox package not found, please install it with `pip install langchain-google-community[docai]`application/pdfgcs_uri	mime_typeT)r0   gcs_documentrZ   skip_human_reviewr\   c              3   z   K   | ]2  }t        |j                  |j                  j                  d        4 yw)chunk_idsourcepage_contentmetadataN)r   contentrl   path).0chunkrE   s     r   	<genexpr>z-DocAIParser.online_process.<locals>.<genexpr>   s?      	  !&$)NN"&)) 	s   8;c              3      K   | ]M  }t         |j                  j                  j                        |j                  j
                  d        O yw)pagerm   rn   N)r   layoutdocumenttextpage_numberrr   )rs   rx   rb   rE   responses     r   ru   z-DocAIParser.online_process.<locals>.<genexpr>  sT      	  !24;;@Q@Q@V@V!W $ 0 0"&)) 	   AAr   )google.cloudr`   r;   -google.cloud.documentai_toolbox.wrappers.pagerb   r[   r8   process_documentProcessRequestr7   GcsDocumentrr   mimetyper?   rz   chunked_documentchunksrR   )	r@   rE   r\   r]   r`   rB   rZ   rb   r}   s	    `     @@r   online_processzDocAIParser.online_process   sM    	/	 8$77Q:PQ<<00%%))'33 II"mm@/@ 4  !0"&% & 	
 ""	 &..??FF	 	 		 %--33	 	 	Y  	B 		  	B 	.		sc   D.C0 D B,D. D*)D.*D,+D.0	D
9DD

D.	D'D""D''D.,D.blobstimeout_seccheck_in_interval_secc              +     K   |xs | j                   }|st        d       | j                  |fd|i|}|D cg c]  }|j                  j                   }	}t
        j                  d|	       d}
| j                  |      rUt        j                  |       |
|z  }
|
|kD  rt        d|	 d      t
        j                  d       | j                  |      rU| j                  |      }| j                  |      E d	{    y	c c}w 7 
w)
ah  Parses a list of `Blob` lazily.

        Args:
            blobs: A list of `Blob` to parse.
            gcs_output_path: Path on Google Cloud Storage to store parsing results.
            timeout_sec: Timeout to wait for Document AI to complete, in seconds.
            check_in_interval_sec: Interval to wait until next check whether parsing
                operations have been completed, in seconds.
            process_options_kwargs: Optional parameters to pass to the Document AI
                processors

        ??? example "Long-running operation"

            ```python
            # Submit async jobs
            operations = parser.docai_parse(blobs, gcs_path)

            # Optionally poll until finished
            parser.is_running(operations)

            # Save operation names
            operation_names = [op.operation.name for op in operations]

            # Later, load results and build Documents
            operations = parser.operations_from_names(operation_names)
            results = parser.get_results(operations)
            docs = parser.parse_from_results(results)
            ```
        :An output path on Google Cloud Storage should be provided.r%   z9Started parsing with Document AI, submitted operations %sr   z#Timeout exceeded! Check operations z later!.)
operationsN)r6   r3   docai_parse	operationr0   loggerdebug
is_runningtimesleepTimeoutErrorget_resultsparse_from_results)r@   r   r%   r   r   r]   output_pathr   opoperation_namestime_elapsedresultss               r   rH   zDocAIParser.batch_parse  s!    J &>)>)>L  &T%%
#.
2H

 8BB2<<,,BBG	
 ooj)JJ,-11Lk)"9/9J'R  LL ooj) ""j"9**7333 C 	4s#   7D	DA?D	&D	<D=D	r   c              #   0  	K   	 ddl m} ddlm} ddlm |D ]W  	 |	j                        \  }} |||dz         }| j                  r	fd|D        E d {    D	fd|D        E d {    Y y # t        $ r}t        d      |d }~ww xY w7 97 &w)	Nr   )split_gcs_uri)_get_shardsra   rc   /c              3      K   | ]M  }|j                   j                  D ]2  }t        |j                  |j                  j
                  d        4 O ywrk   )r   r   r   rq   rl   r   )rs   shardrt   results      r   ru   z1DocAIParser.parse_from_results.<locals>.<genexpr>^  sd      
 !&!7!7!>!>
  %*]](-&,&8&8" 

r~   c              3      K   | ]T  }|j                   D ]C  }t         |j                  |j                        |j                  j
                  d        E V ywrw   )rR   r   ry   r{   r|   r   )rs   r   rx   rb   r   s      r   ru   z1DocAIParser.parse_from_results.<locals>.<genexpr>j  si      
  %
  %6t{{EJJ%O$($4$4&,&8&8" 

s   AA)	7google.cloud.documentai_toolbox.utilities.gcs_utilitiesr   1google.cloud.documentai_toolbox.wrappers.documentr   r   rb   r;   r   r?   )
r@   r   r   r   rB   gcs_bucket_name
gcs_prefixshardsrb   r   s
           @@r   r   zDocAIParser.parse_from_resultsJ  s     	 X  	F*78J8J*K'OZ *s2BCF&&
 "(
 
 

 "(
 
 
!	  	B 	

sF   BA5 A BBB-B.B5	B>B

BBBr   r   c                     	 ddl m} |D cg c]%  }| j                  j	                   ||            ' c}S # t        $ r}t        d      |d}~ww xY wc c}w )z5Initializes Long-Running Operations from their names.r   )GetOperationRequestzplong running operations package not found, please install it with`pip install langchain-google-community[docai]`Nr/   )request)!google.longrunning.operations_pb2r   r;   r8   get_operation)r@   r   r   rB   r0   s        r   operations_from_namesz!DocAIParser.operations_from_namesv  so    	 (
 LL&&/B/M&N
 	
  	B 	
s   9 *A	AAAr   c                 &    t        d |D              S )Nc              3   >   K   | ]  }|j                            y wN)done)rs   r   s     r   ru   z)DocAIParser.is_running.<locals>.<genexpr>  s     6Rrwwy=6s   )any)r@   r   s     r   r   zDocAIParser.is_running  s    6:666r   i  )r%   r&   
batch_sizer\   r   c                   	 ddl m} |xs | j                  }	|	t	        d      |xs | j
                  }|t	        d      g }
t        ||      D ]  }|j                  |j                  |D cg c]-  }|j                  |j                  |j                  xs d	      / c}
            }|j                  |j                  j                  |	|            } | j                  di |}|
j                  | j                   j#                  |j%                  ||||d                    |
S # t        $ r}t        d      |d}~ww xY wc c}w )aJ  Runs Google Document AI PDF Batch Processing on a list of `Blob`.

        Args:
            blobs: List of `Blob` to be parsed
            gcs_output_path: Path (folder) on GCS to store results
            processor_name: Name of a Document AI processor.
            batch_size: Amount of documents per batch
            field_mask: Comma-separated list of which fields to include in the Document
                AI response. Suggested: `'text,pages.pageNumber,pages.layout'`
            process_options_kwargs: Optional parameters to pass to the Document AI
                processors

        Document AI has a 1000 file limit per batch, so batches larger than that need
        to be split into multiple requests.

        Batch processing is an async long-running operation and results are stored in a
        output GCS bucket.
        r   r_   rP   Nr   z0A Document AI processor name should be provided.)sizeiterablerd   re   )	documents)gcs_documents)rf   r\   )gcs_output_configT)r0   input_documentsdocument_output_configrZ   ri   r   )r   r`   r;   r6   r3   r7   r   BatchDocumentsInputConfigGcsDocumentsr   rr   r   DocumentOutputConfigGcsOutputConfigr[   appendr8   batch_process_documentsBatchProcessRequest)r@   r   r%   r&   r   r\   r]   r`   rB   r   r   batchrE   input_configoutput_configrZ   s                   r   r   zDocAIParser.docai_parse  s   8	/ &>)>)>L  (?4+?+?!OPP
"
UC 	E%??(55 %*
 !	 #..$(II&*mm&H7H /  6  @ 
L ';;","A"A"Q"Q'J #R # < M <d;;U>TUO4422+(4/<(7*. 3 
+	@ a  	B 	&s   D. .2E.	E7EEc           	      x   	 ddl m} |D cg c]  }t        |j                  |      r|j                  j
                  n.|j                  |j                  j                        j
                  D ]#  }t        |j                  |j                        %  c}}S # t        $ r}t        d      |d }~ww xY wc c}}w )Nr   )BatchProcessMetadatarP   )r   r   )google.cloud.documentai_v1r   r;   
isinstancerp   individual_process_statusesdeserializevaluer   input_gcs_sourceoutput_gcs_destination)r@   r   r   rB   r   statuss         r   r   zDocAIParser.get_results  s    	 !

  bkk+?@ 77)55KK%%--
   "33"99

 	
  	B 	
s   B BB6	B3"B..B3)TNi  Tr   )Ni  <   )r   r   r   r   r   r   rD   r   r   r   rI   r2   r   intr[   r   r   r	   rH   r   r   r   r   r   r   r   r   r   r!   r!   (   s`   , >B$("&)-(,A0 9:A0 SM	A0
 3-A0 "#A0 !A0F	St 	S(: 	S 59*.$'488#+D>8 T#Y'8 SM	8
 $,D>8 
8z %)FF SMF #&	F
 
(	FV *.%'<4~<4 "#<4 	<4
  #<4 #&<4 
(	<4|*/0*	(	*X
T#Y 
4CT 
"7T+%6 74 7 *.(,$(N~N "#	N
 !N N SMN #&N 
k	N`
d;&7 
DAT<U 
r   r!   )"r   loggingr4   r   dataclassesr   typingr   r   r   r   r   r	   langchain_core.document_loadersr
   ,langchain_core.document_loaders.blob_loadersr   langchain_core.documentsr   langchain_core.utils.iterr   !langchain_google_community._utilsr   google.api_core.operationr   r:   r   rV   r   	getLoggerr   r   r   r!   r   r   r   <module>r      sw     	  ! I I : = - 3 =3 @			8	$   J
. J
r   