-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathrun_hlejson.py
More file actions
2152 lines (1794 loc) · 94.4 KB
/
run_hlejson.py
File metadata and controls
2152 lines (1794 loc) · 94.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import argparse
import json
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Dict, Any
import time
import sys
import logging
from scripts.reverse_image import GoogleLensSearchTool
import re
import traceback
import numpy as np
import pandas as pd
import json
from pydantic import BaseModel, ValidationError
from typing import Literal
import os
import json
import copy
import math
import argparse
import asyncio
import numpy as np
from typing import Literal
from pydantic import BaseModel
from openai import AsyncOpenAI
from tqdm.asyncio import tqdm_asyncio
from datasets import load_dataset
from rich.console import Console
import smolagents.monitoring
print(smolagents.monitoring.AgentLogger)
from smolagents.monitoring import AgentLogger
from smolagents.monitoring import LogLevel
import datasets
from datasets import Dataset
from dotenv import load_dotenv
from huggingface_hub import login
from smolagents import CodeAgent
from smolagents.agents import ToolCallingAgent
from scripts.reformulator import prepare_response
from scripts.run_agents import (
get_single_file_description,
get_zip_description,
)
from scripts.text_inspector_tool import TextInspectorTool
from scripts.text_web_browser import (
ArchiveSearchTool,
FinderTool,
FindNextTool,
PageDownTool,
PageUpTool,
SearchInformationTool,
SimpleTextBrowser,
VisitTool,
)
from scripts.image_web_browser import (
SimpleImageBrowser,
SearchInformationTool_Image,
VisitTool_Image,
ArchiveSearchTool_Image,
PageUpTool_Image,
PageDownTool_Image,
FinderTool_Image,
FindNextTool_Image,
SaveHTMLTool,
)
from scripts.file_processing import (
FileProcessor,
OCRTool,
PDFTool,
DOCXTool,
XLSXTool,
PPTXTool,
ImageAnalysisTool,
)
from scripts.web_tools import (
LiteratureSearchingTool,
GeneralBrowserTool,
RelevantLiteratureFinderTool,
BookMatchExtractorTool,
DirectGoogleBooksCrawlerTool,
SpringerSearchTool,
SpringerStructuredSearchTool,
SpringerDownloadAndParseTool,
)
from scripts.LocalGoogleSearchTool import LocalGoogleSearchTool
from scripts.visual_qa import visualizer
from tqdm import tqdm
from smolagents import (
LiteLLMModel,
Model,
)
import openai
from smolagents.models import MessageRole
from dataset_loader import load_custom_dataset
from scripts.translator import TranslatorTool
from scripts.speech_recognition import SpeechRecognitionTool
from scripts.ocr import OCRTool
from scripts.frame_extract import VideoFrameExtractorTool
import os
import json
import copy
import math
import argparse
import asyncio
import numpy as np
from typing import Literal
from pydantic import BaseModel
from openai import AsyncOpenAI
from tqdm.asyncio import tqdm_asyncio
from datasets import load_dataset
# Module-level AsyncOpenAI client used for LLM-as-judge calls; long 300 s
# timeout for slow judging requests, and a single retry so failures surface
# quickly instead of stacking retries.
client = AsyncOpenAI(timeout=300.0, max_retries=1, api_key=os.getenv("OPENAI_API_KEY"))
# Prompt template for the LLM judge. Placeholders {question}, {response} and
# {correct_answer} are filled via str.format before the call.
# Fix: the original contained the sentence "Tolerate substantive deviations
# from the correct answer.", which directly contradicted the preceding
# instruction to treat any substantive error or omission as incorrect.
JUDGE_PROMPT = """You are a fair evaluator. Judge whether the following [response] to [question] is semantically consistent with the [correct_answer] below.
[question]: {question}
[response]: {response}
[correct_answer]: {correct_answer}
When you judge, consider only whether the core meaning and all necessary key points in the response match the correct answer. Even if wording or format differs, treat equivalent semantics as correct. Treat missing key points or any substantive error or omission as incorrect. For numerical answers, a small rounding difference is acceptable. Do not tolerate substantive deviations from the correct answer. If the extracted_final_answer is a more specific instance of the correct_answer (for example, "Pieter Schenk II" vs "Pieter Schenk"), and it still contains the core string of the correct_answer, treat it as correct.
Please output exactly in the format and criteria specified below:
extracted_final_answer: The final exact answer extracted from the [response]. Put the extracted answer as 'None' if there is no exact, final answer to extract from the response.
reasoning: Explain why the extracted_final_answer is correct or incorrect based on [correct_answer], focusing only on if there are meaningful differences between [correct_answer] and the extracted_final_answer. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers match.
correct: Answer 'yes' if extracted_final_answer matches the [correct_answer] given above, or is within a small margin of error for numerical problems. Answer 'no' otherwise, i.e. if there is any inconsistency, ambiguity, non-equivalency, or if the extracted answer is incorrect.
confidence: The extracted confidence score between 0|\%| and 100|\%| from [response]. Put 100 if there is no confidence score available."""
class ExtractedAnswer(BaseModel):
    """Structured verdict returned by the LLM judge (parsed via pydantic).

    Field names mirror the sections the JUDGE_PROMPT asks the model to emit.
    """

    # Exact final answer extracted from the response ('None' if absent).
    extracted_final_answer: str
    # Judge's explanation of why the answer does or does not match.
    reasoning: str
    # Binary verdict: 'yes' if semantically consistent with the reference.
    correct: Literal["yes", "no"]
    # Confidence score (0-100) extracted from the response; 100 when missing.
    confidence: int
    strict: Literal[True] # 100% reliability
# Module names the CodeAgent is permitted to import in generated code.
AUTHORIZED_IMPORTS = [
    # Networking / web scraping
    "requests", "zipfile", "os", "pandas", "numpy", "sympy", "json", "bs4",
    # Domain-specific and scientific libraries
    "pubchempy", "xml", "yahoo_finance", "Bio", "sklearn", "scipy", "pydub",
    # Media, documents, and misc utilities
    "io", "PIL", "chess", "PyPDF2", "pptx", "torch", "datetime", "fractions",
    "csv",
]
# Load variables from .env, overriding any already-set environment values.
load_dotenv(override=True)
# Authenticate with the Hugging Face Hub (needed for dataset/model access).
login(os.getenv("HF_TOKEN"))
# Lock guarding concurrent appends to the shared answers output file.
append_answer_lock = threading.Lock()
def parse_args():
    """Define and parse the command-line options for an evaluation run.

    Returns:
        argparse.Namespace with run settings, agent toggles, question
        selection, and external API keys (keys default to env vars).
    """
    cli = argparse.ArgumentParser()
    # Core run configuration.
    cli.add_argument("--concurrency", type=int, default=1)
    cli.add_argument("--model-id", type=str, default="gpt-4o")
    cli.add_argument("--run-name", type=str, required=True)
    cli.add_argument("--api-key", type=str, help="OpenAI API key", default=os.getenv("OPENAI_API_KEY"))
    # Optional specialist agents (off unless flagged, except the text browser).
    cli.add_argument("--use-image-agent", action="store_true", help="Enable image information agent")
    cli.add_argument("--use-file-agent", action="store_true", help="Enable file processor agent")
    cli.add_argument("--use-literature-agent", action="store_true", help="Enable literature search agent")
    cli.add_argument("--no-text-webbrowser-agent", action="store_true", help="Disable text webbrowser agent (enabled by default)")
    # Springer tools are on by default; --no-springer flips the same dest off.
    cli.add_argument("--use-springer", action="store_true", default=True, help="Enable Springer tools (enabled by default)")
    cli.add_argument("--no-springer", action="store_false", dest="use_springer", help="Disable Springer tools")
    cli.add_argument("--use-browser", action="store_true", help="Enable interactive browser functionality for literature search tools")
    cli.add_argument("--use-ocr-agent", action="store_true", help="Enable OCR agent")
    cli.add_argument("--use-translator-agent", action="store_true", help="Enable translator agent")
    cli.add_argument("--use-speech-recognition-agent", action="store_true", help="Enable speech recognition agent")
    # Result filtering, baseline mode, and output location.
    cli.add_argument("--results-json-path", type=str, default=None, help="Path to previous results JSON file for filtering already correct answers")
    cli.add_argument("--baseline", action="store_true", help="Use baseline agent instead of agent hierarchy")
    cli.add_argument("--output-dir", type=str, default="output", help="Output directory for results")
    # Question selection: difficulty level, explicit IDs, or an ID range.
    cli.add_argument("--level", type=str, default="level2", choices=["level1", "level2", "level3"], help="Specify which level of questions to test")
    cli.add_argument("--question-ids", type=str, help="Comma-separated list of specific question IDs to run (e.g., '16,24,35')")
    cli.add_argument("--start-id", type=int, help="Starting question ID for a range of questions to run")
    cli.add_argument("--end-id", type=int, help="Ending question ID for a range of questions to run")
    # Third-party API keys, defaulting to environment variables.
    cli.add_argument("--springer-api-key", type=str, help="Springer Nature API key", default=os.getenv("SPRINGER_API_KEY"))
    cli.add_argument("--llama-api-key", type=str, help="LlamaParse API key", default=os.getenv("LLAMA_API_KEY"))
    cli.add_argument("--use-Chinese-agent", action="store_true", default=False, help="Enable Chinese agent")
    return cli.parse_args()
# Startup reminder: some target URLs are unreachable when routed via Tailscale.
print("Make sure you deactivated Tailscale VPN, else some URLs will be blocked!")
USE_OPEN_MODELS = False  # open-weight model toggle; not referenced in this visible section
SET = None  # GAIA split selector; only used by the commented-out GAIA loader below
# Map smolagents message roles onto OpenAI-compatible chat roles.
custom_role_conversions = {"tool-call": "assistant", "tool-response": "user"}
### LOAD EVALUATION DATASET
# eval_ds = datasets.load_dataset("gaia-benchmark/GAIA", "2023_all")[SET]
# eval_ds = eval_ds.rename_columns({"Question": "question", "Final answer": "true_answer", "Level": "task"})
# RELATIVE_EXCEL_PATH = "Historical/HistBench/HistBench.xlsx"
# RELATIVE_EXCEL_PATH = "Historical/Historical/Historical Q&A collections(100).xlsx"
# NOTE(review): despite the *_EXCEL_PATH names, the active dataset is a JSON
# file (HLE.json); the names are leftovers from the earlier .xlsx datasets.
RELATIVE_EXCEL_PATH = "Historical/HLEjson/HLE.json"
EXCEL_PATH = os.path.abspath(RELATIVE_EXCEL_PATH)
# Desktop Chrome/Edge user agent so sites serve regular (non-bot) pages.
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
# Settings shared across several tools (OCR, speech, translation, image APIs).
SHARED_CONFIG = {
    "downloads_folder": "downloads",
    "ocr_languages": ["en", "ch_sim"],
    "speech_model": "google",
    "translation_url": "http://127.0.0.1:5000/translate",
    "imgbb_api_key": os.getenv("IMGBB_API_KEY"),
    "serpapi_api_key": os.getenv("SERPAPI_API_KEY")
}
# Text web browser settings. NOTE(review): the download directory here is
# literally named "downloads_folder" (not "downloads" as in SHARED_CONFIG) —
# presumably intentional, but worth confirming against the tools that read it.
BROWSER_CONFIG = {
    "viewport_size": 1024 * 5,
    "downloads_folder": "downloads_folder",
    "request_kwargs": {
        "headers": {"User-Agent": user_agent},
        "timeout": 300,
    },
    "serpapi_key": os.getenv("SERPAPI_API_KEY"),
}
# Image browser settings; mirrors BROWSER_CONFIG but with its own download dir.
BROWSER_CONFIG_IMAGE = {
    "viewport_size": 1024 * 5,
    "downloads_folder": "image_downloads_folder",
    "request_kwargs": {
        "headers": {"User-Agent": user_agent},
        "timeout": 300,
    },
    "serpapi_key": os.getenv("SERPAPI_API_KEY"),
}
# API keys for reverse image search (Google Lens via imgbb upload + SerpAPI).
GOOGLE_LENS_CONFIG = {
    "imgbb_api_key": os.getenv("IMGBB_API_KEY"),
    "serpapi_api_key": os.getenv("SERPAPI_API_KEY"),
    "search_api_key": os.getenv("SEARCH_API_KEY")
}
# API keys for the OCR tool (image hosting + OpenRouter model access).
OCR_CONFIG = {
    "imgbb_api_key": os.getenv("IMGBB_API_KEY"),
    "openrouter_api_key": os.getenv("OPENROUTER_API_KEY")
}
# Springer Nature search/download settings (LlamaParse used for PDF parsing).
SPRINGER_CONFIG = {
    "springer_api_key": os.getenv("SPRINGER_API_KEY"),
    "llama_api_key": os.getenv("LLAMA_API_KEY"),
    "downloads_folder": "springer_downloads"
}
# Ensure the download directories exist before any tool writes into them.
os.makedirs(f"./{BROWSER_CONFIG['downloads_folder']}", exist_ok=True)
os.makedirs(f"./{BROWSER_CONFIG_IMAGE['downloads_folder']}", exist_ok=True)
os.makedirs(f"./{SPRINGER_CONFIG['downloads_folder']}", exist_ok=True)
def create_agent_hierarchy(model: Model, use_image_agent=False, use_file_agent=False, use_literature_agent=False, use_text_webbrowser_agent=True, baseline=False, springer_api_key=None, llama_api_key=None, use_springer=True, use_browser=False, use_ocr_agent=False, use_translator_agent=False, use_speech_recognition_agent=False, use_Chinese_agent=False, logger=None):
"""
Create agent hierarchy or baseline agent
Parameters:
model: Language model used
use_image_agent: Whether to use image agent
use_file_agent: Whether to use file processing agent
use_literature_agent: Whether to use literature search agent
use_text_webbrowser_agent: Whether to use text browser agent (enabled by default)
baseline: Whether to use baseline agent instead of agent hierarchy
springer_api_key: Springer Nature API key
llama_api_key: LlamaParse API key
use_springer: Whether to use Springer related tools (enabled by default)
use_browser: Whether to enable browser functionality for literature search tools
logger: Agent logger
Returns:
Agent: Created agent instance
"""
text_limit = 100000
ti_tool = TextInspectorTool(model, text_limit)
browser = SimpleTextBrowser(**BROWSER_CONFIG)
browser_image = SimpleImageBrowser(**BROWSER_CONFIG_IMAGE)
Image_Reverse_Search_Tool = GoogleLensSearchTool(
imgbb_api_key=GOOGLE_LENS_CONFIG["imgbb_api_key"],
serpapi_api_key=GOOGLE_LENS_CONFIG["serpapi_api_key"],
search_api_key= GOOGLE_LENS_CONFIG["search_api_key"]
)
Image_Reverse_Search_Tool.name = "Image_Reverse_Search_Tool"
file_processor = FileProcessor(
ocr_languages=SHARED_CONFIG["ocr_languages"],
model=model
)
pdf_tool = PDFTool(file_processor)
xlsx_tool = XLSXTool(file_processor)
docx_tool = DOCXTool(file_processor)
pptx_tool = PPTXTool(file_processor)
WEB_TOOLS = [
SearchInformationTool(browser),
VisitTool(browser),
PageUpTool(browser),
PageDownTool(browser),
FinderTool(browser),
FindNextTool(browser),
ArchiveSearchTool(browser),
TextInspectorTool(model, text_limit),
]
if baseline:
text_webbrowser_agent = ToolCallingAgent(
model=model,
tools=WEB_TOOLS,
max_steps=20,
verbosity_level=2,
planning_interval=4,
name="search_agent",
description="""A team member that will search the internet to answer your question.
Ask him for all your questions that require browsing the web.
Provide him as much context as possible, in particular if you need to search on a specific timeframe!
And don't hesitate to provide him with a complex search task, like finding a difference between two webpages.
Your request must be a real sentence, not a google search! Like "Find me this information (...)" rather than a few keywords.
""",
provide_run_summary=True,
logger=logger
)
text_webbrowser_agent.prompt_templates["managed_agent"]["task"] += """You can navigate to .txt online files.
If a non-html page is in another format, especially .pdf or a Youtube video, use tool 'inspect_file_as_text' to inspect it.
Additionally, if after some searching you find out that you need more information to answer the question, you can use `final_answer` with your request for clarification as argument to request for more information."""
manager_agent = CodeAgent(
model=model,
tools=[visualizer, ti_tool],
max_steps=12,
verbosity_level=2,
additional_authorized_imports=AUTHORIZED_IMPORTS,
planning_interval=4,
managed_agents=[text_webbrowser_agent],
logger=logger
)
return manager_agent
LITERATURE_SEARCH_TOOLS = [
LiteratureSearchingTool(api_key=os.getenv("OPENAI_API_KEY"), download_path="downloads_folder", use_browser=use_browser),
GeneralBrowserTool(api_key=os.getenv("OPENAI_API_KEY"), download_path="downloads_folder", use_browser=use_browser),
RelevantLiteratureFinderTool(api_key=os.getenv("OPENAI_API_KEY"), download_path="downloads_folder", use_browser=use_browser),
BookMatchExtractorTool(api_key=os.getenv("OPENAI_API_KEY"), download_path="downloads_folder", use_browser=use_browser),
DirectGoogleBooksCrawlerTool(api_key=os.getenv("OPENAI_API_KEY"), download_path="downloads_folder", use_browser=use_browser),
]
if use_springer:
springer_tools = [
SpringerSearchTool(springer_api_key=springer_api_key, llama_api_key=llama_api_key, download_path="springer_downloads"),
SpringerStructuredSearchTool(springer_api_key=springer_api_key, llama_api_key=llama_api_key, download_path="springer_downloads"),
SpringerDownloadAndParseTool(springer_api_key=springer_api_key, llama_api_key=llama_api_key, download_path="springer_downloads"),
]
LITERATURE_SEARCH_TOOLS.extend(springer_tools)
print(LITERATURE_SEARCH_TOOLS)
IMAGE_SEARCH_TOOLS = [
SearchInformationTool_Image(browser_image),
VisitTool_Image(browser_image),
PageUpTool_Image(browser_image),
PageDownTool_Image(browser_image),
FinderTool_Image(browser_image),
FindNextTool_Image(browser_image),
ArchiveSearchTool_Image(browser_image),
TextInspectorTool(model, text_limit),
SaveHTMLTool(browser_image),
]
FILE_TOOLS = [
ImageAnalysisTool(file_processor, model),
pdf_tool,
docx_tool,
xlsx_tool,
pptx_tool
]
ocr_tool = OCRTool(
imgbb_api_key=OCR_CONFIG["imgbb_api_key"],
openrouter_api_key=OCR_CONFIG["openrouter_api_key"],
model=model
)
ocr_agent = ToolCallingAgent(
model=model,
tools=[ocr_tool],
max_steps=5,
verbosity_level=2,
planning_interval=2,
name="ocr_agent",
description="""Agent specialized in image text recognition.
Features:
1. Extract text content from images
2. Automatically detect languages in images
3. Support multi-language OCR processing
4. Provide image content description when OCR fails
Use cases:
- Extract text from screenshots, scanned documents, or photos
- Process charts, images, or documents containing text
- Recognize mixed multi-language content in images
""",
provide_run_summary=True,
logger=logger
)
speech_tool = SpeechRecognitionTool(model)
speech_recognition_agent = ToolCallingAgent(
model=model,
tools=[speech_tool],
max_steps=3,
verbosity_level=2,
planning_interval=1,
name="speech_recognition_agent",
description="""Agent specialized in speech recognition.
Features:
1. Convert speech in audio files to text
2. Support processing of multiple audio formats
3. Use Google Speech Recognition API for transcription
Use cases:
- Transcribe recordings, voice notes, or audio meetings
- Process voice commands or voice messages
- Analyze audio content
""",
provide_run_summary=True,
logger=logger
)
frame_extractor_tool = VideoFrameExtractorTool()
frame_extractor_agent = ToolCallingAgent(
model=model,
tools=[frame_extractor_tool, visualizer],
max_steps=5,
verbosity_level=2,
planning_interval=1,
name="frame_extractor_agent",
description="""Agent specialized in video frame extraction.
Features:
1. Extract frames from videos
2. Support processing of multiple video formats
""",
provide_run_summary=True,
logger=logger
)
translator_tool = TranslatorTool()
translator_agent = ToolCallingAgent(
model=model,
tools=[translator_tool],
max_steps=3,
verbosity_level=2,
planning_interval=1,
name="translator_agent",
description="""Agent specialized in text translation.
Features:
1. Translate text to different languages
2. Support conversion between multiple languages
3. Use specialized translation methods for special languages
Use cases:
- Translate foreign language text
- Process multilingual content
- Cross-language communication and understanding
""",
provide_run_summary=True,
logger=logger
)
WEB_TOOLS.append(LocalGoogleSearchTool(model, browser))
text_webbrowser_agent = ToolCallingAgent(
model=model,
tools=WEB_TOOLS,
max_steps=20,
verbosity_level=2,
planning_interval=4,
name="search_agent",
description="""A team member that will search the internet to answer your question.
Ask him for all your questions that require browsing the web or searching for academic literature.
A general-purpose web search agent capable of retrieving both academic and non-academic information quickly across web pages, files, and multilingual content.
Use this agent when:
- Fast or broad coverage is needed
- The task involves web-based sources (news, blogs, Wikipedia, videos, PDFs, etc.)
- You want preliminary academic content from sources like Google Scholar or publisher pages
- The language of the query is not English (e.g., Chinese, German)
This agent supports academic search, but it does not specialize in scholarly database crawling.
""",
provide_run_summary=True,
logger=logger
)
text_webbrowser_agent.prompt_templates["managed_agent"]["task"] += """You can navigate to .txt online files.
If a non-html page is in another format, especially .pdf or a Youtube video, use tool 'inspect_file_as_text' to inspect it.
Additionally, if after some searching you find out that you need more information to answer the question, you can use `final_answer` with your request for clarification as argument to request for more information.
You are the `text_webbrowser_agent`, a fast and flexible search agent for retrieving both academic and non-academic information from the open web.
**Your Strengths**:
- Fast search with broad coverage
- Access to web pages, PDFs, videos, and multilingual content
- Can return preliminary academic sources from open-access sites (e.g., Google Scholar, publisher homepages)
**Use Cases**:
- When a quick answer is needed
- When the query is not strictly academic (e.g., involves media, practical info, non-peer-reviewed knowledge)
- When the query is in Chinese, German, or another language requiring multilingual search
- When scholarly precision is not the top priority (e.g., exploring relevant context or background first)
**Important Functions**:
1. Start with `LocalGoogleSearchTool` to gather search results.
2. Use `VisitTool` to read high-potential pages.
3. Use `TextInspectorTool` to analyze special files (e.g., .pdf, .docx, .pptx).
4. Use `final_answer()` to return clarification requests if needed.
**Fallback Recommendation**:
If the query clearly requires peer-reviewed precision (e.g., academic definition, citation, exact phrase matching), consider passing the task to `literature_search_agent`.
**!!!Attention!!!**
ALL Numbers in the task (such as year, quantity, etc.) and the corresponding context of the numbers MUST be retained as the input, including background information.
Additionally, if after some searching you find out that you need more information to answer the question, you can use `final_answer` with your request for clarification as argument to request for more information.
Start now.
---
Task:
{{task}}
"""
image_information_agent = ToolCallingAgent(
model=model,
tools=[*IMAGE_SEARCH_TOOLS, Image_Reverse_Search_Tool, visualizer],
max_steps=15,
verbosity_level=2,
planning_interval=4,
name="image_information_agent",
description="""You are the image_information_agent. You are always the first to process any task that involves an image.
Your responsibility is to extract, search, and summarize all relevant information from images and their online appearances. You perform reverse image search and follow up with targeted page visits to uncover the origin, meaning, and associated context of any image.
Use this agent when:
- The task contains an image file path (e.g., .jpg, .png, .webp)
- The user asks what is shown, written, or represented in an image
- The goal is to identify the image's source, creator, or related knowledge online
Your core capabilities:
- Perform reverse image search using `Image_Reverse_Search_Tool`
- Visit related pages using `VisitTool` to find detailed metadata
- Separate information found **in the image** (e.g., symbols, people, writing) from what exists **about the image online**
Execution rules:
- You must run even if the image contains non-English text (e.g., Chinese calligraphy)
- Only process images that are explicitly mentioned in the task — ignore all illustrative examples
You are the default entry point for all image-related tasks.
""",
provide_run_summary=True,
logger=logger
)
image_information_agent.prompt_templates["managed_agent"]["task"] += """
You are the `image_information_agent`, responsible for extracting and analyzing information from images. You process both the **visual content** and its **online context** by using reverse image search and web tools.
You should give the highest importance and priority to the first result of the `Image_Reverse_Search_Tool`, which includes the website title, the image source link, and the image url.
1. **Image_Reverse_Search_Tool**:
- Purpose: Find where an image appears online and discover related information.
- When to use: This should be your first step when analyzing any image.
- Output: Provides links to web pages where the image or similar images appear.
- You should give the first result of the `Image_Reverse_Search_Tool`, which includes the website title, the image source link, and the image url, the highest importance and priority.
2. **VisitTool**:
- Purpose: Visit a specific web page to gather detailed information.
- When to use: When you need to examine a particular web page in detail.
- What to look for: Detailed information such as:
* Product descriptions and specifications
* Historical context and background information
* Auction details and provenance information
* Artist or creator information
* Dating and authentication details
* Any other relevant contextual information
- Advantage: Allows focused analysis of a single important page.
**Recommended Functions**:
1. Start with `Image_Reverse_Search_Tool` to find where the image appears online.
2. Use `VisitTool` to visit all the pages you found in the `Image_Reverse_Search_Tool` including the "link" and "image URL".
3. Use `visualizer` to visualize the image by the "image URL" returned by `Image_Reverse_Search_Tool`.
3. Integrate all findings into a comprehensive report about the image.
**IMPORTANT: DISTINGUISHING EXAMPLES FROM ACTUAL TASKS**
The following is just an EXAMPLE to illustrate the workflow. DO NOT process 'historical_document.png' unless it's specifically mentioned in the actual task:
- *Example Task*: Analyze 'historical_document.png'.
- *Example Process*:
- Use `Image_Reverse_Search_Tool: historical_document.png` to find online sources
- Use `VisitTool: https://specific-page.com` for any specific page that needs detailed examination
- Integrate findings into a report
Your objective is to process only the actual images mentioned in the current task, not any examples used for illustration.
Your task is:
{{task}}
Begin by identifying any image file paths in this task and using Image_Reverse_Search_Tool. You'd better visit the top five results returned by `Image_Reverse_Search_Tool` first.
"""
literature_search_agent = ToolCallingAgent(
model=model,
tools=LITERATURE_SEARCH_TOOLS,
max_steps=8,
verbosity_level=2,
planning_interval=4,
name="literature_search_agent",
description="""A specialized literature research agent skilled in finding authoritative academic sources for historical questions:
Use this agent when:
- The task demands peer-reviewed articles, citations, or scholarly books
- Precision and source quality are more important than response speed
- You need to locate exact phrases, match historical facts, or verify academic claims
This agent is slower than general search but significantly more reliable for formal academic tasks.
1. LiteratureSearchingTool: Search for scholarly articles and books on a specific topic
2. RelevantLiteratureFinderTool: Find and filter the most relevant literature sources
3. GeneralBrowserTool: Perform general web searches for academic information
4. BookMatchExtractorTool: Extract book match snippets from Google Books search
5. DirectGoogleBooksCrawlerTool: Directly analyze Google Books search results
6. SpringerSearchTool: Search academic papers on Springer Nature's open access platform
7. SpringerStructuredSearchTool: Perform structured searches using categorized research concepts
8. SpringerDownloadParseTool: Download and parse PDFs from Springer Nature using LlamaParse
For "exactMatch" questions, search for the exact original wording that exists in scholarly literature.
For other history questions, locate and verify facts using credible academic sources.
""",
provide_run_summary=True,
logger=logger
)
# Override the default managed-agent task template so the literature agent
# receives a full manual of its five core tools, with a dedicated fast path
# for "exactMatch" questions (search the ENTIRE question text, blanks removed).
# The string below is runtime prompt text — do not reword casually.
literature_search_agent.prompt_templates["managed_agent"]["task"] = """You are the `literature_search_agent`, a specialized agent for high-quality academic literature retrieval.
**Primary Role**:
- Handle academic and historical questions where **source credibility**, **precision**, and **citation** are critical.
**Use Cases**:
- "exactMatch" type questions where answers must appear verbatim in books or papers
- Verification of scientific, medical, or historical facts
- Retrieval of scholarly articles, citations, or excerpts from books
For 'exactMatch' type questions:
- The EXACT original wording can be found in scholarly literature
- Your primary task is to locate this exact text
- The answer exists verbatim in academic sources
- CRITICAL REQUIREMENT: You MUST input the ENTIRE question text as your search query
- IMPORTANT: If the question contains blanks (like "____", "___", or "[BLANK]"), remove these blanks before searching
- Example: "The Battle of _____ was fought in 1815" → search for "The Battle of was fought in 1815"
- Do NOT break down the question into keywords - use the complete text
For all other question types:
- Relevant supporting content must be found in academic sources
- Prioritize high-quality, well-cited scholarly papers
You have five powerful tools at your disposal:
1. **LiteratureSearchingTool**:
- Purpose: Search for high-impact, recent scholarly articles on a specific topic
- Usage: `LiteratureSearchingTool: [research topic/query]`
- Output: Returns 5 relevant scholarly articles with citation counts, publication years, and key findings
- When to use: For initial broad search of authoritative academic sources
2. **RelevantLiteratureFinderTool**:
- Purpose: Filter and rank the most relevant literature sources for a specific query
- Usage: `RelevantLiteratureFinderTool: [specific research question]`
- Output: Returns the 3 most relevant sources with relevance scores and key information
- When to use: To pinpoint the most directly relevant sources for your question
- For exactMatch questions, use this to find the exact original wording
3. **GeneralBrowserTool**:
- Purpose: Perform general web searches beyond academic databases
- Usage: `GeneralBrowserTool: [search query]`
- Output: Returns general web search results
- When to use: Only after exhausting academic sources, for supplementary information
4. **BookMatchExtractorTool**:
- Purpose: Extract exact book match snippets from Google Books with highlighted matching terms
- Usage: `BookMatchExtractorTool: [exact phrase to search]`
- Output: Returns book match snippets with highlighted terms that match the query
- When to use: BEST TOOL for exactMatch questions - use this FIRST with the entire question (blanks removed)
- Example: For "The Battle of _____ was fought in 1815"
- Do this: `BookMatchExtractorTool: The Battle of was fought in 1815`
5. **DirectGoogleBooksCrawlerTool**:
- Purpose: Extract book match snippets directly from a Google Books search URL
- Usage: `DirectGoogleBooksCrawlerTool: [google books search URL]`
- Output: Returns book match snippets from the URL with highlighted terms
- When to use: When you already have a Google Books search URL and need to extract match snippets"""
# Advertise the optional Springer Nature tools only when enabled; numbering
# (6-8) continues the five-tool list appended above.
# NOTE(review): tools 7 and 8 reuse tool 6's "Output"/"When to use" text
# verbatim — a PDF download/parse tool presumably does not "return 5 relevant
# scholarly articles". Looks like copy-paste; confirm and correct.
if use_springer:
    literature_search_agent.prompt_templates["managed_agent"]["task"] += """
6. SpringerSearchTool: Search academic papers on Springer Nature's open access platform
- Usage: `SpringerSearchTool: [research topic/query]`
- Output: Returns 5 relevant scholarly articles with citation counts, publication years, and key findings
- When to use: For initial broad search of authoritative academic sources
7. SpringerStructuredSearchTool: Perform structured searches using categorized research concepts
- Usage: `SpringerStructuredSearchTool: [research topic/query]`
- Output: Returns 5 relevant scholarly articles with citation counts, publication years, and key findings
- When to use: For initial broad search of authoritative academic sources
8. SpringerDownloadParseTool: Download and parse PDFs from Springer Nature using LlamaParse
- Usage: `SpringerDownloadParseTool: [url]`
- Output: Returns 5 relevant scholarly articles with citation counts, publication years, and key findings
- When to use: For initial broad search of authoritative academic sources
"""
# Append the mandatory workflow (BookMatchExtractorTool first for exactMatch,
# then the standard pipeline) and the task slot; `{{task}}` is substituted by
# the framework when the manager delegates a task to this agent.
literature_search_agent.prompt_templates["managed_agent"]["task"] += """
**Mandatory Functions for exactMatch questions**:
1. FIRST use `BookMatchExtractorTool` with the ENTIRE question text (with blanks removed)
- Example: For "The Battle of _____ was fought in 1815"
- Do this: `BookMatchExtractorTool: The Battle of was fought in 1815`
2. If no exact match is found, use `RelevantLiteratureFinderTool` with the same query
- Example: `RelevantLiteratureFinderTool: The Battle of was fought in 1815`
3. If still no exact match, use traditional literature search tools
For all other questions:
- Start with `LiteratureSearchingTool` to get a broad overview of scholarly articles
- Then use `RelevantLiteratureFinderTool` with precise query terms to find the most relevant sources
- Only after exhausting academic sources, use `GeneralBrowserTool` if needed
Always integrate findings into a comprehensive answer with proper academic citations
You have been submitted this task by your manager.
---
Task:
{{task}}
---
Begin by determining if this is an exactMatch question. If it is, use BookMatchExtractorTool with the entire question text (blanks removed) FIRST. If not, proceed with the standard workflow starting with LiteratureSearchingTool.
"""
# Multimodal file handler: OCR, image analysis, speech recognition, and
# translation, backed by the FILE_TOOLS collection defined elsewhere.
file_processor_agent = ToolCallingAgent(
    model=model,
    tools=FILE_TOOLS,
    max_steps=20,          # hard cap on tool-calling iterations
    verbosity_level=2,
    planning_interval=4,   # re-plan every 4 steps
    name="file_processor",
    description="""A specialized team member for processing various types of files:
1. Automatic File Type Detection:
- Files are automatically analyzed to determine their type
- No need to specify file type in your requests
- Just provide the file path and the appropriate tool will be selected
2. OCR: Extract ONLY the plain text from images using EasyOCR
- Returns EXACTLY the text content with no analysis or additional information
- Input: image file path
- Output: extracted text only
3. Image Analysis: Analyze and describe image content in detail
- Provides detailed descriptions of what appears in the image
- Input: image file path
- Output: comprehensive description of the image content
4. Speech Recognition: Convert speech to text
- Input: audio file path (.wav, .mp3, etc.)
- Output: transcribed text
5. Translation: Translate text between languages
- Input: text and target language code (e.g., 'en', 'zh')
- Output: translated text
""",
    provide_run_summary=True,
    logger=logger
)
# Extend (not replace) the default managed-agent task template with
# per-file-type usage guidance; runtime prompt text — keep wording intact.
file_processor_agent.prompt_templates["managed_agent"]["task"] += """
File Type Detection:
- The system automatically detects file types based on file extension or content analysis
- Simply provide the file path without specifying the file type
- Example: "Extract content from this file: /path/to/file.ext" instead of "Extract text from this image: /path/to/image.png"
For image files (detected automatically):
- Supported formats: .png, .jpg, .jpeg, .bmp
- Two processing options:
1. Text extraction using OCR - when you need to extract text from the image
2. Image analysis - when you need to understand the image content and get a detailed description
- Example: "Extract text from this image: /path/to/image.jpg" for OCR
- Example: "Analyze this image: /path/to/image.jpg" for visual description
For audio files (detected automatically):
- Supported formats: .wav, .mp3, .m4a
- Speech recognition is applied automatically
- For non-English audio, transcribe first then translate
For document files (detected automatically):
- Supported formats: .pdf, .docx, .xlsx, .pptx
- Text extraction is applied based on document type
For text translation:
- Use TranslatorTool with appropriate language codes
- Common codes: 'en' (English), 'zh' (Chinese), 'ja' (Japanese), 'ko' (Korean)
If you encounter any issues:
- Check if file exists
- Verify file path is correct
- Use `final_answer` with error description if file is inaccessible or format unsupported
"""
# Dedicated model for the Chinese-language agent below: DeepSeek R1 served
# through OpenRouter via LiteLLM.
# NOTE(review): the OpenRouter API key is read from OCR_CONFIG — presumably a
# shared key stored under the OCR config; confirm this is intentional.
cn_model = LiteLLMModel(
    model_id="openrouter/deepseek/deepseek-r1",
    api_key=OCR_CONFIG["openrouter_api_key"],
    api_base="https://openrouter.ai/api/v1",
    max_completion_tokens=8192,
    drop_params=True,  # LiteLLM: drop provider-unsupported kwargs instead of erroring
)
# Chinese history specialist driven by cn_model; instructed to analyze
# multiple-choice history questions option by option. Only visualizer and
# ti_tool are exposed to it.
# NOTE(review): unlike the sibling agents, no max_steps is set here — the
# framework default applies; confirm that is acceptable.
Chinese_agent = ToolCallingAgent(
    model=cn_model,
    tools=[visualizer, ti_tool],
    verbosity_level=2,
    planning_interval=4,
    name="Chinese_agent",
    description="""You are an experienced history expert, familiar with important events, figures, and intellectual trends across all historical periods. Please complete the following history multiple-choice question with a rigorous approach:
Please analyze all options one by one, pointing out the basis for or errors in each option;
Finally, clearly indicate the correct answer that best aligns with historical facts or the intent of the question, and briefly explain the reason;
Your answer should demonstrate logical thinking and professionalism, avoiding intuition-based responses.""",
    logger=logger
)
# Assemble the manager's sub-agent roster; each use_* flag gates whether the
# corresponding agent is handed to the manager.
managed_agents = []
if use_text_webbrowser_agent:
    managed_agents.append(text_webbrowser_agent)
if use_image_agent:
    managed_agents.append(image_information_agent)
if use_literature_agent:
    managed_agents.append(literature_search_agent)
if use_file_agent:
    managed_agents.append(file_processor_agent)
if use_ocr_agent:
    managed_agents.append(ocr_agent)
if use_translator_agent:
    managed_agents.append(translator_agent)
if use_speech_recognition_agent:
    managed_agents.append(speech_recognition_agent)
if use_Chinese_agent:
    managed_agents.append(Chinese_agent)
# NOTE(review): frame_extractor_agent is always registered — there is no
# gating flag for it; confirm this is intentional.
managed_agents.append(frame_extractor_agent)
# Top-level coordinator: a CodeAgent that writes Python to orchestrate the
# managed sub-agents, and can use visualizer/ti_tool directly.
# NOTE(review): the description enumerates only 7 agents and omits
# file_processor and frame_extractor, which may also be registered in
# managed_agents — confirm/update the roster text.
manager_agent = CodeAgent(
    model=model,
    tools=[visualizer,ti_tool],
    max_steps=20,
    verbosity_level=2,
    additional_authorized_imports=AUTHORIZED_IMPORTS,  # modules its generated code may import
    planning_interval=4,
    managed_agents=managed_agents,
    name="manager",
    description="""Team manager responsible for coordinating work between different agents.
You can use following agents to solve the task:
1. text_webbrowser_agent - For web searches and browsing
2. image_information_agent - For image analysis
3. literature_search_agent - Specialized agent for academic literature searches
4. ocr_agent - For OCR text extraction from images
5. translator_agent - For text translation
6. speech_recognition_agent - For speech recognition
7. Chinese_agent - For Chinese text analysis
Remember:
- For any image file, you must use image_information_agent and visualizer to analyze the image!
- For image with text, you must use ocr_agent to extract the text first!
- For any question without image file, you must use text_webbrowser_agent to obtain factual information!
""",
    logger=logger
)
# Replace the manager's system prompt with an eight-agent roster.
# NOTE(review): recent smolagents versions key this template as
# "system_prompt", not "system" — confirm which key the installed framework
# version actually reads, otherwise this override is silently ignored.
manager_agent.prompt_templates["system"] = """You are a team manager, responsible for coordinating the work of specialized agents to solve complex tasks.
You have access to the following agents:
1. text_webbrowser_agent - For web searches and browsing
2. image_information_agent - For image analysis
3. literature_search_agent - Specialized agent for academic literature searches
4. ocr_agent - For OCR text extraction from images
5. translator_agent - For text translation
6. speech_recognition_agent - For speech recognition
7. Chinese_agent - For Chinese text analysis
8. frame_extractor_agent - For video frame extraction
"""
# Task template for the manager: terse routing directives (web search first,
# OCR before reasoning about text in images).
manager_agent.prompt_templates["task"] = """You are the manager of a team of specialized agents. Your job is to coordinate their work to solve complex tasks.
You MUST use the text_webbrowser_agent to search for the information you need.
Remember, image_information_agent is not the visualizer.
you must use the ocr_agent to extract the text first.
"""
# The fully wired manager is the entry point used to answer each question.
return manager_agent
def append_answer(entry: dict, jsonl_file: str) -> None:
    """Append one result entry to the JSONL answers file and mirror it to Excel.

    Both the JSONL append and the Excel read-modify-write are performed while
    holding ``append_answer_lock``: the original code only guarded the JSONL
    write, so concurrent workers could interleave Excel updates and silently
    drop rows.

    Args:
        entry: Result record for one task; ``entry["task_id"]`` (falling back
            to ``"unknown"``) selects the per-task logger.
        jsonl_file: Path to the answers ``.jsonl`` file; parent directories
            are created if needed. The Excel mirror shares the stem with an
            ``.xlsx`` suffix.

    Raises:
        RuntimeError: If either output file does not exist after writing.
    """
    jsonl_file = Path(jsonl_file)
    jsonl_file.parent.mkdir(parents=True, exist_ok=True)
    # Main logger plus a per-task logger so each task's trace is self-contained.
    logger = logging.getLogger("main")
    task_id = str(entry.get("task_id", "unknown"))
    task_logger = get_task_logger(LOG_DIR, task_id)
    excel_file = jsonl_file.with_suffix('.xlsx')
    entry_df = pd.DataFrame([entry])
    # Hold the lock across BOTH sinks — the Excel update is a non-atomic
    # read/concat/write and must not run concurrently.
    with append_answer_lock:
        with open(jsonl_file, "a", encoding="utf-8") as fp:
            fp.write(json.dumps(entry) + "\n")
        if os.path.exists(excel_file):
            try:
                existing_df = pd.read_excel(excel_file)
                combined_df = pd.concat([existing_df, entry_df], ignore_index=True)
                combined_df.to_excel(excel_file, index=False)
            except Exception as e:
                # Corrupt or locked workbook: keep the entry by starting a
                # fresh file rather than crashing the run.
                task_logger.error(f"Error updating Excel file: {e}, creating new file")
                entry_df.to_excel(excel_file, index=False)
        else:
            entry_df.to_excel(excel_file, index=False)
    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    if not os.path.exists(jsonl_file):
        raise RuntimeError("JSONL file doesn't exist!")
    if not os.path.exists(excel_file):
        raise RuntimeError("Excel file doesn't exist!")
    task_logger.info(f"Answer exported to file: {jsonl_file.resolve()}")
    task_logger.info(f"Answer exported to Excel file: {excel_file.resolve()}")
    logger.info(f"Answer exported to file: {jsonl_file.resolve()} and Excel file: {excel_file.resolve()}")
def answer_single_question(example, model_id, answers_file, visualizer, args):
"""Answer a single question and save the result, including answer evaluation and summary generation"""
# Get task ID, ensure it's a string
task_id = str(example["task_id"])
# Create task-specific logger
task_logger = logging.getLogger(f"task_{task_id}")
task_logger.setLevel(logging.INFO)
# Clear existing handlers
if task_logger.handlers:
task_logger.handlers.clear()
# Create log directory
log_dir = os.path.join(os.path.dirname(answers_file), "output_logs", args.run_name)
os.makedirs(log_dir, exist_ok=True)
# Create file handler
file_handler = logging.FileHandler(os.path.join(log_dir, f"{task_id}.log"), encoding='utf-8')
file_handler.setLevel(logging.INFO)
# Create console handler