Skip to content

Commit d8aadc4

Browse files
authored
fix: Changed client_info import and added new quickstart samples (#268)
* fix: Changed the `client_info` import from `google.api_core.client_info` to `google.api_core.gapic_v1.client_info`
* Add examples for all document initialization options (with tests)

Fixes #266
1 parent 01843e4 commit d8aadc4

File tree

5 files changed

+191
-25
lines changed

5 files changed

+191
-25
lines changed

packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/utilities/gcs_utilities.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
import re
1919
from typing import Dict, List, Optional, Tuple
2020

21-
from google.api_core import client_info
21+
from google.api_core.gapic_v1 import client_info
2222

2323
from google.cloud import documentai, documentai_toolbox, storage
2424
from google.cloud.documentai_toolbox import constants

packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/wrappers/document.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import re
2323
from typing import Dict, List, Optional, Type, Union
2424

25+
from google.api_core.client_options import ClientOptions
2526
from google.api_core.operation import from_gapic as operation_from_gapic
2627
from google.cloud.vision import AnnotateFileResponse
2728
from google.longrunning.operations_pb2 import GetOperationRequest
@@ -138,6 +139,7 @@ def _get_shards(gcs_bucket_name: str, gcs_prefix: str) -> List[documentai.Docume
138139

139140
def _get_batch_process_metadata(
140141
operation_name: str,
142+
location: Optional[str] = None,
141143
timeout: Optional[float] = None,
142144
) -> documentai.BatchProcessMetadata:
143145
r"""Get `BatchProcessMetadata` from a `batch_process_documents()` long-running operation.
@@ -146,22 +148,41 @@ def _get_batch_process_metadata(
146148
operation_name (str):
147149
Required. The fully qualified operation name for a `batch_process_documents()` operation.
148150
151+
location (str):
152+
Optional. The location of the processor used for `batch_process_documents()`.
153+
Deprecated. Maintained for backwards compatibility.
154+
149155
timeout (float):
150156
Optional. Default None. Time in seconds to wait for operation to complete.
151157
If None, will wait indefinitely.
152158
Returns:
153159
documentai.BatchProcessMetadata:
154160
Metadata from batch process.
155161
"""
162+
# Validate Operation Name
163+
match = re.search(
164+
r"projects\/\w+\/locations\/(\w+)\/operations\/\w+", operation_name
165+
)
166+
167+
if not match:
168+
raise ValueError(
169+
f"Invalid Operation Name: {operation_name}\n"
170+
"Expected operation name in the format `projects/<project>/locations/<location>/operations/<operation>`"
171+
)
172+
173+
location = location or match.group(1)
174+
156175
client = documentai.DocumentProcessorServiceClient(
157176
client_info=gcs_utilities._get_client_info(module="get_batch_process_metadata"),
177+
client_options=ClientOptions(
178+
api_endpoint=f"{location}-documentai.googleapis.com"
179+
),
158180
)
159181

160182
# Poll Operation until complete.
161183
operation = operation_from_gapic(
162184
operation=client.get_operation(
163185
request=GetOperationRequest(name=operation_name),
164-
metadata=documentai.BatchProcessMetadata(),
165186
),
166187
operations_client=client,
167188
result_type=documentai.BatchProcessResponse,
@@ -599,6 +620,7 @@ def from_batch_process_operation(
599620
return cls.from_batch_process_metadata(
600621
metadata=_get_batch_process_metadata(
601622
operation_name=operation_name,
623+
location=location,
602624
timeout=timeout,
603625
)
604626
)

packages/google-cloud-documentai-toolbox/samples/snippets/quickstart_sample.py

Lines changed: 66 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -15,41 +15,88 @@
1515

1616

1717
# [START documentai_toolbox_quickstart]
18+
from typing import Optional
1819

20+
from google.cloud import documentai
1921
from google.cloud.documentai_toolbox import document
2022
from google.cloud.documentai_toolbox import gcs_utilities
2123

2224
# TODO(developer): Uncomment these variables before running the sample.
23-
# Given a document.proto or sharded document.proto in path gs://bucket/path/to/folder
25+
# Given a Document JSON or sharded Document JSON in path gs://bucket/path/to/folder
2426
# gcs_bucket_name = "bucket"
2527
# gcs_prefix = "path/to/folder"
2628

29+
# Or, given a Document JSON in path gs://bucket/path/to/folder/document.json
30+
# gcs_uri = "gs://bucket/path/to/folder/document.json"
31+
32+
# Or, given a Document JSON in path local/path/to/folder/document.json
33+
# document_path = "local/path/to/folder/document.json"
34+
35+
# Or, given a Document object from Document AI
36+
# documentai_document = documentai.Document()
37+
38+
# Or, given a BatchProcessMetadata object from Document AI
39+
# operation = client.batch_process_documents(request)
40+
# operation.result(timeout=timeout)
41+
# batch_process_metadata = documentai.BatchProcessMetadata(operation.metadata)
42+
43+
# Or, given a BatchProcessOperation name from Document AI
44+
# batch_process_operation = "projects/project_id/locations/location/operations/operation_id"
45+
46+
47+
def quickstart_sample(
48+
gcs_bucket_name: Optional[str] = None,
49+
gcs_prefix: Optional[str] = None,
50+
gcs_uri: Optional[str] = None,
51+
document_path: Optional[str] = None,
52+
documentai_document: Optional[documentai.Document] = None,
53+
batch_process_metadata: Optional[documentai.BatchProcessMetadata] = None,
54+
batch_process_operation: Optional[str] = None,
55+
) -> None:
56+
if gcs_bucket_name and gcs_prefix:
57+
# Load from Google Cloud Storage Directory
58+
print("Document structure in Cloud Storage")
59+
gcs_utilities.print_gcs_document_tree(
60+
gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
61+
)
62+
63+
wrapped_document = document.Document.from_gcs(
64+
gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
65+
)
66+
elif gcs_uri:
67+
# Load a single Document from a Google Cloud Storage URI
68+
wrapped_document = document.Document.from_gcs_uri(gcs_uri=gcs_uri)
69+
elif document_path:
70+
# Load from local `Document` JSON file
71+
wrapped_document = document.Document.from_document_path(document_path)
72+
elif documentai_document:
73+
# Load from `documentai.Document` object
74+
wrapped_document = document.Document.from_documentai_document(
75+
documentai_document
76+
)
77+
elif batch_process_metadata:
78+
# Load Documents from `BatchProcessMetadata` object
79+
wrapped_documents = document.Document.from_batch_process_metadata(
80+
metadata=batch_process_metadata
81+
)
82+
wrapped_document = wrapped_documents[0]
83+
elif batch_process_operation:
84+
wrapped_documents = document.Document.from_batch_process_operation(
85+
location="us", operation_name=batch_process_operation
86+
)
87+
wrapped_document = wrapped_documents[0]
88+
else:
89+
raise ValueError("No document source provided.")
2790

28-
def quickstart_sample(gcs_bucket_name: str, gcs_prefix: str) -> None:
29-
print("Document structure in Cloud Storage")
30-
gcs_utilities.print_gcs_document_tree(
31-
gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
32-
)
33-
34-
wrapped_document = document.Document.from_gcs(
35-
gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
36-
)
3791
# For all properties and methods, refer to:
3892
# https://cloud.google.com/python/docs/reference/documentai-toolbox/latest/google.cloud.documentai_toolbox.wrappers.document.Document
3993

40-
# Alternatively, create wrapped document from:
41-
#
42-
# - Local `Document` JSON file: `document.Document.from_document_path()`
43-
# - `Document` object: `document.Document.from_documentai_document()`
44-
# - `BatchProcessMetadata`: `document.Document.from_batch_process_metadata()`
45-
# - Batch Processing Operation: `document.Document.from_batch_process_operation()`
46-
4794
print("Document Successfully Loaded!")
4895
print(f"\t Number of Pages: {len(wrapped_document.pages)}")
4996
print(f"\t Number of Entities: {len(wrapped_document.entities)}")
5097

51-
for idx, page in enumerate(wrapped_document.pages):
52-
print(f"Page {idx}")
98+
for page in wrapped_document.pages:
99+
print(f"Page {page.page_number}")
53100
for block in page.blocks:
54101
print(block.text)
55102
for paragraph in page.paragraphs:

packages/google-cloud-documentai-toolbox/samples/snippets/test_quickstart_sample.py

Lines changed: 91 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,105 @@
1818
import pytest
1919
from samples.snippets import quickstart_sample
2020

21+
from google.cloud import documentai
22+
from google.longrunning.operations_pb2 import ListOperationsRequest # type: ignore
23+
2124
location = "us"
2225
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
23-
gcs_bucket_name = "documentai_toolbox_samples"
24-
gcs_input_uri = "output/123456789/0"
2526

2627

27-
def test_quickstart_sample(capsys: pytest.CaptureFixture) -> None:
28+
def test_quickstart_sample_gcs_bucket_prefix(capsys: pytest.CaptureFixture) -> None:
29+
gcs_bucket_name = "documentai_toolbox_samples"
30+
gcs_prefix = "output/123456789/0"
2831
quickstart_sample.quickstart_sample(
29-
gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_input_uri
32+
gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
3033
)
3134
out, _ = capsys.readouterr()
3235

3336
assert "Document structure in Cloud Storage" in out
3437
assert "Number of Pages: 1" in out
3538
assert "Number of Entities: 35" in out
39+
40+
41+
def test_quickstart_sample_gcs_uri(capsys: pytest.CaptureFixture) -> None:
42+
gcs_uri = (
43+
"gs://documentai_toolbox_samples/output/123456789/0/toolbox_invoice_test-0.json"
44+
)
45+
quickstart_sample.quickstart_sample(gcs_uri=gcs_uri)
46+
out, _ = capsys.readouterr()
47+
48+
assert "Number of Pages: 1" in out
49+
assert "Number of Entities: 35" in out
50+
51+
52+
def test_quickstart_sample_document_path(capsys: pytest.CaptureFixture) -> None:
53+
document_path = "resources/form_with_tables.json"
54+
quickstart_sample.quickstart_sample(document_path=document_path)
55+
out, _ = capsys.readouterr()
56+
57+
assert "Number of Pages: 1" in out
58+
assert "Number of Entities: 0" in out
59+
assert "Form Date" in out
60+
61+
62+
def test_quickstart_sample_documentai_document(capsys: pytest.CaptureFixture) -> None:
63+
with open("resources/form_with_tables.json", encoding="utf-8") as f:
64+
documentai_document = documentai.Document.from_json(
65+
f.read(), ignore_unknown_fields=True
66+
)
67+
68+
quickstart_sample.quickstart_sample(documentai_document=documentai_document)
69+
out, _ = capsys.readouterr()
70+
71+
assert "Number of Pages: 1" in out
72+
assert "Number of Entities: 0" in out
73+
assert "Form Date" in out
74+
75+
76+
def test_quickstart_sample_batch_process_metadata(
77+
capsys: pytest.CaptureFixture,
78+
) -> None:
79+
client = documentai.DocumentProcessorServiceClient()
80+
name = f"{client.common_location_path(project=project_id, location=location)}/operations"
81+
response = client.list_operations(
82+
request=ListOperationsRequest(
83+
name=name,
84+
filter="TYPE=BATCH_PROCESS_DOCUMENTS AND STATE=DONE",
85+
page_size=1,
86+
)
87+
)
88+
batch_process_metadata = documentai.BatchProcessMetadata.deserialize(
89+
response.operations[0].metadata.value
90+
)
91+
92+
quickstart_sample.quickstart_sample(batch_process_metadata=batch_process_metadata)
93+
94+
out, _ = capsys.readouterr()
95+
96+
assert "Document Successfully Loaded!" in out
97+
98+
99+
def test_quickstart_sample_batch_process_operation(
100+
capsys: pytest.CaptureFixture,
101+
) -> None:
102+
client = documentai.DocumentProcessorServiceClient()
103+
name = f"{client.common_location_path(project=project_id, location=location)}/operations"
104+
response = client.list_operations(
105+
request=ListOperationsRequest(
106+
name=name,
107+
filter="TYPE=BATCH_PROCESS_DOCUMENTS AND STATE=DONE",
108+
page_size=1,
109+
)
110+
)
111+
batch_process_operation = response.operations[0].name
112+
113+
quickstart_sample.quickstart_sample(batch_process_operation=batch_process_operation)
114+
115+
out, _ = capsys.readouterr()
116+
117+
assert "Document Successfully Loaded!" in out
118+
119+
120+
def test_quickstart_sample_no_input() -> None:
121+
with pytest.raises(ValueError, match="No document source provided."):
122+
quickstart_sample.quickstart_sample()

packages/google-cloud-documentai-toolbox/tests/unit/test_document.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,16 @@ def test_get_batch_process_metadata_with_invalid_metadata_type(mock_docai):
317317
document._get_batch_process_metadata(operation_name)
318318

319319

320+
def test_get_batch_process_metadata_with_invalid_operation_name():
321+
with pytest.raises(
322+
ValueError,
323+
match="Invalid Operation Name",
324+
):
325+
document._get_batch_process_metadata(
326+
"projects//locations/us/operations/7890123"
327+
)
328+
329+
320330
def test_bigquery_column_name():
321331
string_map = {
322332
"Phone #:": "phone_num",

0 commit comments

Comments
 (0)