|
1 | | -import json |
2 | | -from pyexpat import model |
3 | | - |
4 | 1 | import requests |
5 | 2 | from typing import Any, Dict, Optional |
6 | 3 |
|
|
10 | 7 | class Documents: |
11 | 8 | """Documents allows you to extract text from various document file types. |
12 | 9 |
|
13 | | - Usage:: |
| 10 | + Usage: |
14 | 11 |
|
15 | 12 | from predictionguard import PredictionGuard |
16 | 13 |
|
@@ -39,34 +36,71 @@ def __init__(self, api_key, url): |
39 | 36 |
|
40 | 37 | def create( |
41 | 38 | self, |
42 | | - file: str |
| 39 | + file: str, |
| 40 | + embed_images: Optional[bool] = False, |
| 41 | + output_format: Optional[str] = None, |
| 42 | + chunk_document: Optional[bool] = False, |
| 43 | + chunk_size: Optional[int] = None, |
| 44 | + toxicity: Optional[bool] = False, |
| 45 | + pii: Optional[str] = "", |
| 46 | + replace_method: Optional[str] = "", |
| 47 | + injection: Optional[bool] = False, |
43 | 48 | ) -> Dict[str, Any]: |
44 | 49 | """ |
45 | 50 | Creates a documents request to the Prediction Guard /documents/extract API |
46 | 51 |
|
47 | 52 | :param file: Document to be parsed |
| 53 | + :param embed_images: Whether to embed images into documents |
| 54 | + :param output_format: Output format |
| 56 | + :param chunk_document: Whether to split the document into chunks |
| 56 | + :param chunk_size: Chunk size |
| 57 | + :param toxicity: Whether to check for output toxicity |
| 58 | + :param pii: Whether to check for or replace PII |
| 59 | + :param replace_method: Replace method for any PII that is present. |
| 60 | + :param injection: Whether to check for prompt injection |
48 | 61 | :return: A dictionary containing the title, content, and length of the document. |
49 | 62 | """ |
50 | 63 |
|
51 | 64 | # Run _extract_documents |
52 | | - choices = self._extract_documents(file) |
| 65 | + choices = self._extract_documents( |
| 66 | + file, embed_images, output_format, |
| 67 | + chunk_document, chunk_size, toxicity, |
| 68 | + pii, replace_method, injection |
| 69 | + ) |
53 | 70 | return choices |
54 | 71 |
|
55 | | - def _extract_documents(self, file): |
| 72 | + def _extract_documents( |
| 73 | + self, file, embed_images, |
| 74 | + output_format, chunk_document, |
| 75 | + chunk_size, toxicity, pii, |
| 76 | + replace_method, injection |
| 77 | + ): |
56 | 78 | """ |
57 | 79 | Function to extract a document. |
58 | 80 | """ |
59 | 81 |
|
60 | 82 | headers = { |
61 | 83 | "Authorization": "Bearer " + self.api_key, |
62 | 84 | "User-Agent": "Prediction Guard Python Client: " + __version__, |
| 85 | + "Toxicity": str(toxicity), |
| 86 | + "Pii": pii, |
| 87 | + "Replace-Method": replace_method, |
| 88 | + "Injection": str(injection) |
| 89 | + } |
| 90 | + |
| 91 | + data = { |
| 92 | + "embedImages": embed_images, |
| 93 | + "outputFormat": output_format, |
| 94 | + "chunkDocument": chunk_document, |
| 95 | + "chunkSize": chunk_size, |
63 | 96 | } |
64 | 97 |
|
65 | 98 | with open(file, "rb") as doc_file: |
66 | 99 | files = {"file": (file, doc_file)} |
67 | 100 |
|
68 | 101 | response = requests.request( |
69 | | - "POST", self.url + "/documents/extract", headers=headers, files=files |
| 102 | + "POST", self.url + "/documents/extract", |
| 103 | + headers=headers, files=files, data=data |
70 | 104 | ) |
71 | 105 |
|
72 | 106 | # If the request was successful, print the proxies. |
|
0 commit comments