Merged
Changes from all commits
22 commits
449b400
zhishu completion function
TablewareBox Dec 27, 2023
25e80ac
trial implementation of table_extract tasks
TablewareBox Dec 27, 2023
4d2e240
bugfixes and add retrieve_native completion_fn
TablewareBox Dec 27, 2023
686ecd4
add fuzzy_compare for table content
TablewareBox Dec 27, 2023
19cd84a
add fuzzy_normalize for table headers
TablewareBox Dec 28, 2023
035de6e
add uni-finder completion_fn and separated format tests (json/csv)
TablewareBox Dec 29, 2023
7d91b0a
basic mlops loggers
TablewareBox Dec 29, 2023
d4ad1fd
bugfixes on example showcase
TablewareBox Dec 29, 2023
30efde8
add rag to openai native completion_fns
TablewareBox Jan 9, 2024
474d63b
add RAG for match, modelgraded_classify, table_extract evals
TablewareBox Jan 10, 2024
d7213e0
add scipaper_tag2mol, scipaper_hasmol, scipaper_targets and markush2m…
TablewareBox Jan 10, 2024
3f30772
add Chemistry evalset
TablewareBox Jan 10, 2024
b861b7d
bugfixes
TablewareBox Jan 15, 2024
e52d776
table comparison with self-defined index
TablewareBox Jan 18, 2024
f1b82bc
fix table extraction with detailed csv text processing and edit-dista…
TablewareBox Jan 18, 2024
decf0f3
fix match_field compare logic to edit-distance
TablewareBox Jan 19, 2024
86f90c9
fixes on data and details for good scipaper_affinity performance
TablewareBox Jan 19, 2024
9860058
update uni_finder api with pdf_parse_mode
TablewareBox Jan 19, 2024
f846a1a
update Zhishu completion_fn with common chat (no file_link) support
TablewareBox Jan 23, 2024
3a4a643
split test sets into general_chemistry and drug_discovery
TablewareBox Jan 23, 2024
e6dece5
fix Zhishu for mocked GPT-4
TablewareBox Jan 25, 2024
21cef0c
move --mlops option into llmreport entrypoint
TablewareBox Jan 26, 2024
96 changes: 96 additions & 0 deletions evals/cli/llmreport.py
@@ -0,0 +1,96 @@
import argparse
import json
import pickle
import re
import glob
from io import StringIO
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt


def main() -> None:
parser = argparse.ArgumentParser(description="Report evals results")
parser.add_argument("run_id", type=str, nargs="+", help="Eval Run id")
parser.add_argument("--mlops", type=str, default=None)
parser.add_argument("--name", type=str, default="LLM_Eval")

args = parser.parse_args()

logfiles = []
for run_id in args.run_id:
logfiles += glob.glob(f"/tmp/evallogs/{run_id}*/**", recursive=True)
logfiles = sorted([f for f in logfiles if Path(f).suffix == ".jsonl"])
logger_data = {}
table_collection = []
qa_collection = []

for logfile in logfiles:
with open(logfile, "r") as f:
events_df = pd.read_json(f, lines=True)
if not "final_report" in events_df.columns:
continue
final_report = events_df["final_report"].dropna().iloc[0]

print(events_df)
run_config = events_df.loc[0, "spec"]
evalname = run_config["base_eval"]
model = run_config["completion_fns"][0].replace("/", ".")
matches_df = events_df[events_df["type"] == "match"].reset_index(drop=True)
matches_df = matches_df.join(pd.json_normalize(matches_df.data))

qa_collection.append({"eval": evalname, "model": model, **final_report})

if "file_name" in matches_df.columns:
matches_df["doi"] = [re.sub("__([0-9]+)__", r"(\1)", Path(f).stem).replace("_", "/") for f in matches_df["file_name"]]

# TODO: compare on different completion_functions
if "jobtype" in matches_df.columns:
# Table extract tasks
accuracy_by_type_and_file = matches_df.groupby(["jobtype", "doi"])['correct'].mean().reset_index()
accuracy_by_type_and_file["model"] = model
table_collection.append(accuracy_by_type_and_file)

accuracy_by_type = matches_df.groupby(["jobtype"])['correct'].mean().to_dict()
print(accuracy_by_type_and_file)

logger_data = {**logger_data, **{f"Accuracy_{key}/model:{model}": value for key, value in accuracy_by_type.items()}}

for doi, df in matches_df.groupby("doi"):
print(df)
logger_data[f"{doi.replace('/', '_')}/model:{model},context:match"] = df[df["jobtype"] != "match_all"][["correct", "expected", "picked", "jobtype"]]
match_all_data = df[df["jobtype"] == "match_all"].iloc[0, :]
logger_data[f"{doi.replace('/', '_')}/context:truth"] = pd.read_csv(StringIO(match_all_data["expected"]), header=[0, 1])
logger_data[f"{doi.replace('/', '_')}/model:{model},context:extract"] = pd.read_csv(StringIO(match_all_data["picked"]), header=[0, 1]) \
if df["jobtype"].iloc[0] != "match_all" else match_all_data["picked"]
else:
# Regular tasks
pass

if len(table_collection) > 0:
accuracy_by_model_type_and_file = pd.concat(table_collection)
metrics_by_eval = pd.DataFrame(qa_collection)
accuracies = metrics_by_eval[metrics_by_eval["accuracy"] >= 0]
scores = metrics_by_eval[metrics_by_eval["score"] >= 0]

if args.mlops:
import plotly.express as px
logger_data["TableExtraction"] = px.box(accuracy_by_model_type_and_file,
x="jobtype", y="correct", color="model",
title="Accuracy by jobtype and model")
logger_data["QA_accuracy"] = px.bar(accuracies, x="eval", y="accuracy", color="model",
title="Accuracy by eval and model")
logger_data["QA_score"] = px.bar(scores, x="eval", y="accuracy", color="model",
title="Accuracy by eval and model")
if args.mlops:
config_logger = json.load(open(args.mlops, 'r'))
if "name" not in config_logger.keys():
config_logger["name"] = args.name
if "dp_mlops" in config_logger:
from evals.reporters.DPTracking import DPTrackingReporter
DPTrackingReporter.report_run(config_logger, {}, logger_data, step=0)


if __name__ == "__main__":
main()
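For orientation, a rough driver sketch (not part of this diff): the run ids and the MLOps config values below are illustrative assumptions; llmreport itself only checks the "name" and "dp_mlops" keys and passes the whole config dict on to the reporter.

import json
import subprocess

# Hypothetical MLOps config; the "project" field is a guessed value.
config = {"name": "Chemistry_Evals", "dp_mlops": {"project": "llm-eval"}}
with open("mlops.json", "w") as f:
    json.dump(config, f)

# Run the reporter over two eval runs and push the collected figures/tables.
subprocess.run(
    ["python", "-m", "evals.cli.llmreport", "240119RUNID1", "240123RUNID2",
     "--mlops", "mlops.json", "--name", "Chemistry_Evals"],
    check=True,
)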
6 changes: 6 additions & 0 deletions evals/cli/oaieval.py
@@ -2,9 +2,14 @@
This file defines the `oaieval` CLI for running evals.
"""
import argparse
import json
import logging
import pickle
import re
import shlex
import sys
from io import StringIO
from pathlib import Path
from typing import Any, Mapping, Optional, Union, cast

import openai
@@ -229,6 +234,7 @@ def to_number(x: str) -> Union[int, float, str]:
logger.info("Final report:")
for key, value in result.items():
logger.info(f"{key}: {value}")

return run_spec.run_id


58 changes: 44 additions & 14 deletions evals/completion_fns/openai.py
@@ -15,6 +15,7 @@
from evals.utils.api_utils import (
openai_chat_completion_create_retrying,
openai_completion_create_retrying,
openai_rag_completion_create_retrying
)


@@ -46,6 +47,15 @@ def get_completions(self) -> list[str]:
return completions


class RetrievalCompletionResult(CompletionResult):
def __init__(self, response: str, prompt: Any) -> None:
self.response = response
self.prompt = prompt

def get_completions(self) -> list[str]:
return [self.response.strip()]


class OpenAICompletionFn(CompletionFn):
def __init__(
self,
@@ -81,13 +91,22 @@ def __call__(

openai_create_prompt: OpenAICreatePrompt = prompt.to_formatted_prompt()

result = openai_completion_create_retrying(
OpenAI(api_key=self.api_key, base_url=self.api_base),
model=self.model,
prompt=openai_create_prompt,
**{**kwargs, **self.extra_options},
)
result = OpenAICompletionResult(raw_data=result, prompt=openai_create_prompt)
if "file_name" not in kwargs:
result = openai_completion_create_retrying(
OpenAI(api_key=self.api_key, base_url=self.api_base),
model=self.model,
prompt=openai_create_prompt,
**{**kwargs, **self.extra_options},
)
result = OpenAICompletionResult(raw_data=result, prompt=openai_create_prompt)
else:
answer = openai_rag_completion_create_retrying(
OpenAI(api_key=self.api_key, base_url=self.api_base),
model=self.model,
instructions=kwargs.get("instructions", ""),
file_name=kwargs.get("file_name", ""),
)
result = RetrievalCompletionResult(answer, prompt=openai_create_prompt)
record_sampling(prompt=result.prompt, sampled=result.get_completions())
return result

@@ -126,12 +145,23 @@ def __call__(

openai_create_prompt: OpenAICreateChatPrompt = prompt.to_formatted_prompt()

result = openai_chat_completion_create_retrying(
OpenAI(api_key=self.api_key, base_url=self.api_base),
model=self.model,
messages=openai_create_prompt,
**{**kwargs, **self.extra_options},
)
result = OpenAIChatCompletionResult(raw_data=result, prompt=openai_create_prompt)
if "file_name" not in kwargs:
result = openai_chat_completion_create_retrying(
OpenAI(api_key=self.api_key, base_url=self.api_base),
model=self.model,
messages=openai_create_prompt,
**{**kwargs, **self.extra_options},
)
result = OpenAIChatCompletionResult(raw_data=result, prompt=openai_create_prompt)
else:
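            # Map the configured chat model onto an API snapshot accepted by the retrieval
            # endpoint (gpt-3.5-turbo* -> gpt-3.5-turbo-1106, gpt-4* -> gpt-4-1106-preview).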
chatmodel_to_apimodel = lambda x: "gpt-3.5-turbo-1106" if x.startswith("gpt-3.5-turbo") else "gpt-4-1106-preview" if x.startswith("gpt-4") else ""
answer = openai_rag_completion_create_retrying(
OpenAI(api_key=self.api_key, base_url=self.api_base),
model=chatmodel_to_apimodel(self.model),
instructions=kwargs.get("instructions", ""),
file_name=kwargs.get("file_name", ""),
prompt=CompletionPrompt(raw_prompt=openai_create_prompt).to_formatted_prompt()
)
result = RetrievalCompletionResult(answer, prompt=openai_create_prompt)
record_sampling(prompt=result.prompt, sampled=result.get_completions())
return result
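A short usage sketch (not from this diff) of how the file_name routing above would be exercised from an eval; the registry name, sample content, and file path are placeholders.

from evals.registry import Registry

completion_fn = Registry().make_completion_fn("gpt-4")  # resolves to OpenAIChatCompletionFn
result = completion_fn(
    prompt=[{"role": "user", "content": "List the reported IC50 values for compound 12."}],
    file_name="papers/example_paper.pdf",  # presence of file_name selects the RAG branch
)
print(result.get_completions()[0])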
60 changes: 60 additions & 0 deletions evals/completion_fns/retrieval_native.py
@@ -0,0 +1,60 @@
"""
Extending Completion Functions with Embeddings-based retrieval from a fetched dataset
"""
import os
from ast import literal_eval
import time
from typing import Any, Optional, Union

import numpy as np
from openai import OpenAI

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

from evals.api import CompletionFn, CompletionResult
from evals.completion_fns.openai import RetrievalCompletionResult
from evals.prompt.base import ChatCompletionPrompt, CompletionPrompt
from evals.record import record_sampling
from evals.utils.api_utils import openai_rag_completion_create_retrying


class OpenAIRetrievalCompletionFn(CompletionFn):
"""
This Completion Function uses embeddings to retrieve the top k relevant docs from a dataset to the prompt, then adds them to the context before calling the completion.
"""

def __init__(
self,
model: Optional[str] = None,
instructions: Optional[str] = "You are a helpful assistant on extracting information from files.",
api_base: Optional[str] = None,
api_key: Optional[str] = None,
n_ctx: Optional[int] = None,
extra_options: Optional[dict] = {},
**kwargs
):
self.model = model
self.instructions = instructions
self.api_base = api_base
self.api_key = api_key
self.n_ctx = n_ctx
self.extra_options = extra_options

def __call__(self, prompt: Union[str, list[dict]], **kwargs: Any) -> RetrievalCompletionResult:
"""
Args:
prompt: The prompt to complete, in either text string or Chat format.
kwargs: Additional arguments to pass to the completion function call method.
"""

assert "file_name" in kwargs, "Must provide a file_name to retrieve."

answer = openai_rag_completion_create_retrying(
client,
model=self.model,
instructions=self.instructions,
file_name=kwargs.get("file_name", ""),
prompt=CompletionPrompt(raw_prompt=prompt).to_formatted_prompt(),
)
record_sampling(prompt=prompt, sampled=answer)
return RetrievalCompletionResult(answer, prompt=prompt)
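A minimal usage sketch, assuming OPENAI_API_KEY is set and a local paper.pdf exists; the model name is a placeholder.

fn = OpenAIRetrievalCompletionFn(model="gpt-4-1106-preview")
result = fn(
    "What buffer conditions were used in the binding assay?",
    file_name="paper.pdf",  # required: the document to ground the answer in
)
print(result.get_completions()[0])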
103 changes: 103 additions & 0 deletions evals/completion_fns/uni_finder.py
@@ -0,0 +1,103 @@
"""
Extending Completion Functions with Embeddings-based retrieval from a fetched dataset
"""
import json
import os
import time
from pathlib import Path

import requests
from typing import Any, Optional, Union

from evals.prompt.base import CompletionPrompt
from evals.api import CompletionFn, CompletionResult
from evals.record import record_sampling


class UniFinderCompletionResult(CompletionResult):
def __init__(self, response: str) -> None:
self.response = response

def get_completions(self) -> list[str]:
return [self.response.strip()] if self.response else ["Unknown"]


class UniFinderCompletionFn(CompletionFn):
"""
This Completion Function uses embeddings to retrieve the top k relevant docs from a dataset to the prompt, then adds them to the context before calling the completion.
"""

def __init__(
self,
model: Optional[str] = None,
instructions: Optional[str] = "You are a helpful assistant on extracting information from files.",
api_base: Optional[str] = None,
api_key: Optional[str] = None,
n_ctx: Optional[int] = None,
cache_dir: Optional[str] = str(Path.home() / ".uni_finder/knowledge_base.json"),
        pdf_parse_mode: Optional[str] = 'fast',  # or 'precise'; selects which PDF parsing pipeline to use
extra_options: Optional[dict] = {},
**kwargs
):
self.model = model
self.instructions = instructions
self.api_base = api_base or os.environ.get("UNIFINDER_API_BASE")
self.api_key = api_key or os.environ.get("UNIFINDER_API_KEY")
self.n_ctx = n_ctx
self.extra_options = extra_options
self.cache_dir = cache_dir
self.pdf_parse_mode = pdf_parse_mode
Path(self.cache_dir).parent.mkdir(parents=True, exist_ok=True)
if not Path(self.cache_dir).exists():
json.dump({}, open(self.cache_dir, "w"))

def __call__(self, prompt: Union[str, list[dict]], **kwargs: Any) -> UniFinderCompletionResult:
"""
Args:
prompt: The prompt to complete, in either text string or Chat format.
kwargs: Additional arguments to pass to the completion function call method.
"""

pdf_token = []
if "file_name" in kwargs:
cache = json.load(open(self.cache_dir, 'r+'))

if cache.get(kwargs["file_name"], {}).get(self.pdf_parse_mode, None) is None:
url = f"{self.api_base}/api/external/upload_pdf"
files = {'file': open(kwargs["file_name"], 'rb')}
data = {
'pdf_parse_mode': self.pdf_parse_mode,
'api_key': self.api_key
}
response = requests.post(url, data=data, files=files)
                pdf_id = response.json()['pdf_token']  # id of the uploaded PDF; later requests reference the document by this token

if kwargs["file_name"] not in cache:
cache[kwargs["file_name"]] = {self.pdf_parse_mode: pdf_id}
else:
cache[kwargs["file_name"]][self.pdf_parse_mode] = pdf_id
json.dump(cache, open(self.cache_dir, "w"))
else:
pdf_id = cache[kwargs["file_name"]][self.pdf_parse_mode]
print("############# pdf_id ##############", pdf_id)
pdf_token.append(pdf_id)

url = f"{self.api_base}/api/external/chatpdf"

        if isinstance(prompt, list):
prompt = CompletionPrompt(prompt).to_formatted_prompt()

payload = {
"model_engine": self.model,
"pdf_token": pdf_token,
"query": prompt,
'api_key': self.api_key
}
response = requests.post(url, json=payload, timeout=1200)
try:
answer = response.json()['answer']
        except (ValueError, KeyError):  # response was not JSON or lacked an "answer" field; fall back to raw text
print(response.text)
answer = response.text
record_sampling(prompt=prompt, sampled=answer)
return UniFinderCompletionResult(answer)
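A minimal usage sketch, assuming UNIFINDER_API_BASE and UNIFINDER_API_KEY point at a reachable UniFinder deployment; the model name and file path are placeholders.

fn = UniFinderCompletionFn(model="unifinder-chat", pdf_parse_mode="precise")
result = fn(
    "Extract the binding-affinity table from this paper as CSV.",
    file_name="paper.pdf",  # uploaded (and cached) before querying the chatpdf endpoint
)
print(result.get_completions()[0])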