-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathllmwhisperer_cli.py
More file actions
executable file
·119 lines (102 loc) · 4.39 KB
/
llmwhisperer_cli.py
File metadata and controls
executable file
·119 lines (102 loc) · 4.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/env python3
import sys
import os
import argparse
import json
from pathlib import Path
from dotenv import load_dotenv
from unstract.llmwhisperer import LLMWhispererClientV2
def _fail(message, exit_code=1):
    """Print *message* to stderr and terminate the process with *exit_code*.

    Diagnostics go to stderr so they never mix with extracted text that is
    written to stdout when no output file is given.
    """
    print(message, file=sys.stderr)
    sys.exit(exit_code)


def _build_parser():
    """Return the argument parser describing the command-line interface."""
    parser = argparse.ArgumentParser(
        description='LLMWhisperer Client - Extract text from documents')
    parser.add_argument('file_path', help='Path to the file to process')
    parser.add_argument('-o', '--output', dest='output_file',
                        help='Output file to save the extracted text')
    parser.add_argument('-m', '--mode', dest='mode',
                        choices=['native_text', 'low_cost', 'high_quality',
                                 'form', 'table'],
                        default='high_quality',
                        help='Extraction mode (default: high_quality)')
    parser.add_argument('-p', '--pages', dest='pages',
                        help='Pages to extract (e.g., "1-5", "7", "1-5,7,21-")')
    parser.add_argument('--vert', action='store_true',
                        help='Recreate vertical table borders')
    parser.add_argument('--horiz', action='store_true',
                        help='Recreate horizontal table borders (requires --vert)')
    return parser


def main():
    """Extract text from a document with LLMWhisperer and print or save it.

    Reads its arguments from ``sys.argv``. The API key is taken from the
    ``LLMWHISPERER_API_KEY`` environment variable, optionally populated from a
    ``.env`` file located next to this script; the endpoint can be overridden
    with ``LLMWHISPERER_BASE_URL_V2``.

    The extracted text is written to the ``--output`` file when given,
    otherwise it is printed to stdout between marker lines.

    Raises:
        SystemExit: with code 1 on invalid arguments, a missing or non-file
            input path, a missing API key, a failure writing the output file,
            any client error, or a result whose status is not ``processed``.
            (argparse itself exits with code 2 on malformed command lines.)
    """
    args = _build_parser().parse_args()

    file_path = args.file_path
    output_file = args.output_file
    mode = args.mode
    pages = args.pages if args.pages else ''
    mark_vertical = args.vert
    mark_horizontal = args.horiz

    if mark_horizontal and not mark_vertical:
        _fail("Error: --horiz requires --vert to be enabled")

    if not os.path.exists(file_path):
        _fail(f"Error: File '{file_path}' does not exist")
    # A directory (or other non-regular path) exists but cannot be uploaded;
    # reject it here instead of failing later with an opaque client error.
    if not os.path.isfile(file_path):
        _fail(f"Error: '{file_path}' is not a file")

    # Only a .env file sitting next to this script is consulted.
    env_file = Path(__file__).parent / '.env'
    if env_file.exists():
        load_dotenv(env_file)

    api_key = os.getenv('LLMWHISPERER_API_KEY')
    if not api_key:
        _fail("Error: LLMWHISPERER_API_KEY not found in environment or .env file")

    base_url = os.getenv(
        'LLMWHISPERER_BASE_URL_V2',
        'https://llmwhisperer-api.us-central.unstract.com/api/v2')

    # Top-level boundary: any client failure is reported and turned into a
    # non-zero exit. SystemExit raised by _fail() is not an Exception subclass
    # and therefore passes through this handler untouched.
    try:
        client = LLMWhispererClientV2(
            base_url=base_url,
            api_key=api_key
        )

        print(f"Processing file: {file_path}")
        print(f"Mode: {mode}")
        if pages:
            print(f"Pages: {pages}")
        if mark_vertical or mark_horizontal:
            borders = []
            if mark_vertical:
                borders.append("vertical")
            if mark_horizontal:
                borders.append("horizontal")
            print(f"Table borders: {', '.join(borders)}")
        print("Please wait...")

        whisper_result = client.whisper(
            file_path=file_path,
            wait_for_completion=True,
            mode=mode,
            pages_to_extract=pages,
            mark_vertical_lines=mark_vertical,
            mark_horizontal_lines=mark_horizontal,
            wait_timeout=200  # presumably seconds; confirm against the client docs
        )

        if whisper_result.get('status') != 'processed':
            # Anything other than "processed" means no text was produced, so
            # the process must not report success to the calling shell.
            print(f"Processing status: {whisper_result.get('status')}",
                  file=sys.stderr)
            _fail(f"Message: {whisper_result.get('message')}")

        extraction = whisper_result.get('extraction', {})
        extracted_text = extraction.get('result_text', '')
        metadata = extraction.get('metadata', {})

        if output_file:
            try:
                with open(output_file, 'w', encoding='utf-8') as f:
                    f.write(extracted_text)
                print(f"Extracted text successfully saved to: {output_file}")
            except OSError as e:
                _fail(f"Error writing to output file: {e}")
        else:
            print("\n--- Extracted Text ---")
            print(extracted_text)
            print("--- End of Extracted Text ---")

        if metadata:
            # NOTE(review): assumes metadata holds one entry per processed
            # page -- confirm against the API response format.
            page_count = len(metadata)
            print(f"\nTotal pages processed: {page_count}")
    except Exception as e:
        if hasattr(e, 'message'):
            print(f"LLMWhisperer Error: {e.message}", file=sys.stderr)
        else:
            print(f"Error: {str(e)}", file=sys.stderr)
        if hasattr(e, 'status_code'):
            print(f"Status Code: {e.status_code}", file=sys.stderr)
        sys.exit(1)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()