From c6717c8ea69058b68817594c27cdfa891d36c4e5 Mon Sep 17 00:00:00 2001 From: root <1700011741@pku.edu.cn> Date: Sat, 3 Feb 2024 17:18:08 +0800 Subject: [PATCH 1/4] enzyme benchmark --- .../data/00_scipaper_enzyme/auto_add.sh | 15 +++++++++++ .../data/00_scipaper_enzyme/sample_file | 1 + .../data/00_scipaper_enzyme/samples.jsonl | 3 +++ .../auto_add.sh | 26 +++++++++++++++++++ .../sample_file | 1 + .../samples.jsonl | 3 +++ .../00_scipaper_enzyme_inhibitor/auto_add.sh | 26 +++++++++++++++++++ .../00_scipaper_enzyme_inhibitor/sample_file | 1 + .../samples.jsonl | 3 +++ .../auto_add.sh | 26 +++++++++++++++++++ .../sample_file | 1 + .../samples.jsonl | 3 +++ evals/registry/evals/00_scipaper_enzyme.yaml | 19 ++++++++++++++ .../00_scipaper_enzyme_activate_compound.yaml | 18 +++++++++++++ .../evals/00_scipaper_enzyme_inhibitor.yaml | 18 +++++++++++++ .../00_scipaper_enzyme_localization.yaml | 16 ++++++++++++ 16 files changed, 180 insertions(+) create mode 100644 evals/registry/data/00_scipaper_enzyme/auto_add.sh create mode 100644 evals/registry/data/00_scipaper_enzyme/sample_file create mode 100644 evals/registry/data/00_scipaper_enzyme/samples.jsonl create mode 100644 evals/registry/data/00_scipaper_enzyme_activate_compound/auto_add.sh create mode 100644 evals/registry/data/00_scipaper_enzyme_activate_compound/sample_file create mode 100644 evals/registry/data/00_scipaper_enzyme_activate_compound/samples.jsonl create mode 100644 evals/registry/data/00_scipaper_enzyme_inhibitor/auto_add.sh create mode 100644 evals/registry/data/00_scipaper_enzyme_inhibitor/sample_file create mode 100644 evals/registry/data/00_scipaper_enzyme_inhibitor/samples.jsonl create mode 100644 evals/registry/data/00_scipaper_enzyme_localization/auto_add.sh create mode 100644 evals/registry/data/00_scipaper_enzyme_localization/sample_file create mode 100644 evals/registry/data/00_scipaper_enzyme_localization/samples.jsonl create mode 100644 evals/registry/evals/00_scipaper_enzyme.yaml create mode 
100644 evals/registry/evals/00_scipaper_enzyme_activate_compound.yaml create mode 100644 evals/registry/evals/00_scipaper_enzyme_inhibitor.yaml create mode 100644 evals/registry/evals/00_scipaper_enzyme_localization.yaml diff --git a/evals/registry/data/00_scipaper_enzyme/auto_add.sh b/evals/registry/data/00_scipaper_enzyme/auto_add.sh new file mode 100644 index 0000000000..4537d504c4 --- /dev/null +++ b/evals/registry/data/00_scipaper_enzyme/auto_add.sh @@ -0,0 +1,15 @@ +#!/bin/bash +for paper in /root/uni-finder/enzyme/paper/*.pdf +do + file_name="${paper##*/}" + name=${file_name%.*} + key_word="" + key_word=$(grep "${name}" samples.jsonl) + if [[ ${key_word} == "" ]] + then + echo "add ${name} to jsonl" + sed 's|target_mark|'"${name}"'|g' sample_file >> samples.jsonl + else + echo "${name}: was already in the jsonl" + fi +done diff --git a/evals/registry/data/00_scipaper_enzyme/sample_file b/evals/registry/data/00_scipaper_enzyme/sample_file new file mode 100644 index 0000000000..530f8260c0 --- /dev/null +++ b/evals/registry/data/00_scipaper_enzyme/sample_file @@ -0,0 +1 @@ +{"file_name": "../uni-finder/enzyme/paper/target_mark.pdf", "file_link": "https://dp-filetrans-bj.oss-cn-beijing.aliyuncs.com/changjunhan/target_mark.pdf", "answerfile_name": "../uni-finder/enzyme/answer/target_mark.csv", "answerfile_link": "https://dp-filetrans-bj.oss-cn-beijing.aliyuncs.com/changjunhan/target_mark.csv", "compare_fields": [""], "index": ""} diff --git a/evals/registry/data/00_scipaper_enzyme/samples.jsonl b/evals/registry/data/00_scipaper_enzyme/samples.jsonl new file mode 100644 index 0000000000..01eb9d337f --- /dev/null +++ b/evals/registry/data/00_scipaper_enzyme/samples.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bf58abc2068a46085404efd770a640185ac65d831c49c3b98800a60ff62814b +size 4699 diff --git a/evals/registry/data/00_scipaper_enzyme_activate_compound/auto_add.sh 
b/evals/registry/data/00_scipaper_enzyme_activate_compound/auto_add.sh new file mode 100644 index 0000000000..f699a14a86 --- /dev/null +++ b/evals/registry/data/00_scipaper_enzyme_activate_compound/auto_add.sh @@ -0,0 +1,26 @@ +#!/bin/bash +target_job=$1 +if [[ ${target_job} == "" ]] +then + echo ">>> Error: target_job is not define" + exit +fi +if [[ ! -f samples.jsonl ]] +then + touch samples.jsonl +fi +for paper in /root/uni-finder/enzyme/"${target_job}"/paper/*.pdf +do + echo "find file ${paper}" + file_name="${paper##*/}" + name=${file_name%.*} + key_word="" + key_word=$(grep "${name}" samples.jsonl) + if [[ ${key_word} == "" ]] + then + echo "add ${name} to jsonl" + sed 's|target_mark|'"${name}"'|g' sample_file | sed 's|target_Job|'"${target_job}"'|g' >> samples.jsonl + else + echo "${name}: was already in the jsonl" + fi +done diff --git a/evals/registry/data/00_scipaper_enzyme_activate_compound/sample_file b/evals/registry/data/00_scipaper_enzyme_activate_compound/sample_file new file mode 100644 index 0000000000..95ce86b217 --- /dev/null +++ b/evals/registry/data/00_scipaper_enzyme_activate_compound/sample_file @@ -0,0 +1 @@ +{"file_name": "../uni-finder/enzyme/target_Job/paper/target_mark.pdf", "file_link": "https://dp-filetrans-bj.oss-cn-beijing.aliyuncs.com/changjunhan/target_mark.pdf", "answerfile_name": "../uni-finder/enzyme/target_Job/answer/target_mark.csv", "answerfile_link": "https://dp-filetrans-bj.oss-cn-beijing.aliyuncs.com/changjunhan/target_mark.csv", "compare_fields": ["Activating Compound", "Comment", "Organism"], "index": "Activating Compound"} diff --git a/evals/registry/data/00_scipaper_enzyme_activate_compound/samples.jsonl b/evals/registry/data/00_scipaper_enzyme_activate_compound/samples.jsonl new file mode 100644 index 0000000000..488ee911ff --- /dev/null +++ b/evals/registry/data/00_scipaper_enzyme_activate_compound/samples.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:cfbe6bb20ea161238ab3d5e3404d9df76514d88c7ed9fe528d2984d0d9fb7d07 +size 538 diff --git a/evals/registry/data/00_scipaper_enzyme_inhibitor/auto_add.sh b/evals/registry/data/00_scipaper_enzyme_inhibitor/auto_add.sh new file mode 100644 index 0000000000..f699a14a86 --- /dev/null +++ b/evals/registry/data/00_scipaper_enzyme_inhibitor/auto_add.sh @@ -0,0 +1,26 @@ +#!/bin/bash +target_job=$1 +if [[ ${target_job} == "" ]] +then + echo ">>> Error: target_job is not define" + exit +fi +if [[ ! -f samples.jsonl ]] +then + touch samples.jsonl +fi +for paper in /root/uni-finder/enzyme/"${target_job}"/paper/*.pdf +do + echo "find file ${paper}" + file_name="${paper##*/}" + name=${file_name%.*} + key_word="" + key_word=$(grep "${name}" samples.jsonl) + if [[ ${key_word} == "" ]] + then + echo "add ${name} to jsonl" + sed 's|target_mark|'"${name}"'|g' sample_file | sed 's|target_Job|'"${target_job}"'|g' >> samples.jsonl + else + echo "${name}: was already in the jsonl" + fi +done diff --git a/evals/registry/data/00_scipaper_enzyme_inhibitor/sample_file b/evals/registry/data/00_scipaper_enzyme_inhibitor/sample_file new file mode 100644 index 0000000000..57bad47af5 --- /dev/null +++ b/evals/registry/data/00_scipaper_enzyme_inhibitor/sample_file @@ -0,0 +1 @@ +{"file_name": "../uni-finder/enzyme/target_Job/paper/target_mark.pdf", "file_link": "https://dp-filetrans-bj.oss-cn-beijing.aliyuncs.com/changjunhan/target_mark.pdf", "answerfile_name": "../uni-finder/enzyme/target_Job/answer/target_mark.csv", "answerfile_link": "https://dp-filetrans-bj.oss-cn-beijing.aliyuncs.com/changjunhan/target_mark.csv", "compare_fields": ["Inhibitors", "Comment", "Organism"], "index": "Inhibitors"} diff --git a/evals/registry/data/00_scipaper_enzyme_inhibitor/samples.jsonl b/evals/registry/data/00_scipaper_enzyme_inhibitor/samples.jsonl new file mode 100644 index 0000000000..116a945850 --- /dev/null +++ b/evals/registry/data/00_scipaper_enzyme_inhibitor/samples.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:49193f70b4942b3ad8aaac0497da4b37b6de40ebde79edb2ec3df6bfc0972924 +size 1444 diff --git a/evals/registry/data/00_scipaper_enzyme_localization/auto_add.sh b/evals/registry/data/00_scipaper_enzyme_localization/auto_add.sh new file mode 100644 index 0000000000..f699a14a86 --- /dev/null +++ b/evals/registry/data/00_scipaper_enzyme_localization/auto_add.sh @@ -0,0 +1,26 @@ +#!/bin/bash +target_job=$1 +if [[ ${target_job} == "" ]] +then + echo ">>> Error: target_job is not define" + exit +fi +if [[ ! -f samples.jsonl ]] +then + touch samples.jsonl +fi +for paper in /root/uni-finder/enzyme/"${target_job}"/paper/*.pdf +do + echo "find file ${paper}" + file_name="${paper##*/}" + name=${file_name%.*} + key_word="" + key_word=$(grep "${name}" samples.jsonl) + if [[ ${key_word} == "" ]] + then + echo "add ${name} to jsonl" + sed 's|target_mark|'"${name}"'|g' sample_file | sed 's|target_Job|'"${target_job}"'|g' >> samples.jsonl + else + echo "${name}: was already in the jsonl" + fi +done diff --git a/evals/registry/data/00_scipaper_enzyme_localization/sample_file b/evals/registry/data/00_scipaper_enzyme_localization/sample_file new file mode 100644 index 0000000000..ec5ce35204 --- /dev/null +++ b/evals/registry/data/00_scipaper_enzyme_localization/sample_file @@ -0,0 +1 @@ +{"file_name": "../uni-finder/enzyme/target_Job/paper/target_mark.pdf", "file_link": "https://dp-filetrans-bj.oss-cn-beijing.aliyuncs.com/changjunhan/target_mark.pdf", "answerfile_name": "../uni-finder/enzyme/target_Job/answer/target_mark.csv", "answerfile_link": "https://dp-filetrans-bj.oss-cn-beijing.aliyuncs.com/changjunhan/target_mark.csv", "compare_fields": [ "Localization", "Organism"], "index":"Localization"} diff --git a/evals/registry/data/00_scipaper_enzyme_localization/samples.jsonl b/evals/registry/data/00_scipaper_enzyme_localization/samples.jsonl new file mode 100644 index 0000000000..ecf84e1650 --- /dev/null +++ 
b/evals/registry/data/00_scipaper_enzyme_localization/samples.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2909201b52e0c8f3e71ee569a65a72e4dfbdede7dde2d427c04e3e43185a86ca +size 467 diff --git a/evals/registry/evals/00_scipaper_enzyme.yaml b/evals/registry/evals/00_scipaper_enzyme.yaml new file mode 100644 index 0000000000..1ab8ea7d40 --- /dev/null +++ b/evals/registry/evals/00_scipaper_enzyme.yaml @@ -0,0 +1,19 @@ +scipaper_enzyme: + id: scipaper_enzyme.val.csv + metrics: [accuracy] + +scipaper_enzyme.val.csv: + class: evals.elsuite.rag_table_extract:TableExtract + args: + samples_jsonl: 00_scipaper_enzyme/samples.jsonl + instructions: | + Please give a complete list of SMILES structures, Km values, Vmax values, target info (protein or cell line), and organism of all substrates in the paper. Usually the substrates' tags are numbers or IUPAC names. + 1. Output in csv format, write units not in header but in the value like "10.5 µM". Quote the value if it has comma! For example: + ```csv + Substrate,Inhibitors, Km value,Km max,Comment,organism,Vmax value,SMILES,Target info,Activating Compound, + ATP,Cu2+,0.001 mM,-,-,Homo sapiens,-,-,ATP-linker aldehyde,Carboxybenzaldehyde, + p-xylene,NADH,0.004 mM,-,-,Homo sapiens,-,C1CCCCC1,-,Methylbenzaldehyde + NADPH,benzaldehyde, 0.12 mM,125 mM,enzyme form ATP,Bos taurus,-,-,NH4+ + + ``` + 2. If there are multiple tables, concat them. Don't give me reference or using "...", give me complete table! 
diff --git a/evals/registry/evals/00_scipaper_enzyme_activate_compound.yaml b/evals/registry/evals/00_scipaper_enzyme_activate_compound.yaml new file mode 100644 index 0000000000..7241a1fe5c --- /dev/null +++ b/evals/registry/evals/00_scipaper_enzyme_activate_compound.yaml @@ -0,0 +1,18 @@ +scipaper_enzyme_activate_compound: + id: scipaper_enzyme_activate_compound.val.csv + metrics: [accuracy] + +scipaper_enzyme_activate_compound.val.csv: + class: evals.elsuite.rag_table_extract:TableExtract + args: + samples_jsonl: 00_scipaper_enzyme_activate_compound/samples.jsonl + instructions: | + Please give a complete list of Activating Compound, Comment and Organism of all substrates in the paper. Usually the substrates' tags are numbers or IUPAC names. + 1. Output in csv format, write units not in header but in the value like "10.5 µM". Quote the value if it has comma! For example: + ```csv + Activating Compound,Comment,Organism + Cu2+,at 0.001 mM of the activity without activator,Homo sapiens + p-xylene,"11.4 mM, slight activation",Bos taurus + NH4+, 0.002 mM,Bos taurus + ``` + 2. If there are multiple tables, concat them. Don't give me reference or using "...", give me complete table! diff --git a/evals/registry/evals/00_scipaper_enzyme_inhibitor.yaml b/evals/registry/evals/00_scipaper_enzyme_inhibitor.yaml new file mode 100644 index 0000000000..3c712c4701 --- /dev/null +++ b/evals/registry/evals/00_scipaper_enzyme_inhibitor.yaml @@ -0,0 +1,18 @@ +scipaper_enzyme_inhibitor: + id: scipaper_enzyme_inhibitor.val.csv + metrics: [accuracy] + +scipaper_enzyme_inhibitor.val.csv: + class: evals.elsuite.rag_table_extract:TableExtract + args: + samples_jsonl: 00_scipaper_enzyme_inhibitor/samples.jsonl + instructions: | + Please give a complete list of Inhibitor, Comment and Organism of all substrates in the paper. Usually the substrates' tags are numbers or IUPAC names. + 1. Output in csv format, write units not in header but in the value like "10.5 µM". 
Quote the value if it has comma! For example: + ```csv + Inhibitor,Comment,Organism + ATP,"competitive inhibition of verapamil-dependent ATPase-activity",Homo sapiens + p-xylene,"11.4 mM, slight inhibitor",Bos taurus + NH4+, 0.002 mM,Bos taurus + ``` + 2. If there are multiple tables, concat them. Don't give me reference or using "...", give me complete table! diff --git a/evals/registry/evals/00_scipaper_enzyme_localization.yaml b/evals/registry/evals/00_scipaper_enzyme_localization.yaml new file mode 100644 index 0000000000..3c6ca4e590 --- /dev/null +++ b/evals/registry/evals/00_scipaper_enzyme_localization.yaml @@ -0,0 +1,16 @@ +scipaper_enzyme_localization: + id: scipaper_enzyme_localization.val.csv + metrics: [accuracy] + +scipaper_enzyme_localization.val.csv: + class: evals.elsuite.rag_table_extract:TableExtract + args: + samples_jsonl: 00_scipaper_enzyme_localization/samples.jsonl + instructions: | + Please give a complete list of Localization, Comment and Organism of all substrates in the paper. Usually the substrates' tags are numbers or IUPAC names. + 1. Output in csv format, write units not in header but in the value like "10.5 µM". Quote the value if it has comma! For example: + ```csv + Localization,Organism + periplasm,Bos taurus + ``` + 2. If there are multiple tables, concat them. Don't give me reference or using "...", give me complete table! 
From 0888cfc8ae8a606b5365fb10db949781c3fa6e97 Mon Sep 17 00:00:00 2001 From: root <1700011741@pku.edu.cn> Date: Mon, 5 Feb 2024 22:32:41 +0800 Subject: [PATCH 2/4] add km --- .../data/00_scipaper_enzyme_km/auto_add.sh | 26 +++++++++++++++++++ .../data/00_scipaper_enzyme_km/sample_file | 1 + .../data/00_scipaper_enzyme_km/samples.jsonl | 3 +++ 3 files changed, 30 insertions(+) create mode 100644 evals/registry/data/00_scipaper_enzyme_km/auto_add.sh create mode 100644 evals/registry/data/00_scipaper_enzyme_km/sample_file create mode 100644 evals/registry/data/00_scipaper_enzyme_km/samples.jsonl diff --git a/evals/registry/data/00_scipaper_enzyme_km/auto_add.sh b/evals/registry/data/00_scipaper_enzyme_km/auto_add.sh new file mode 100644 index 0000000000..f699a14a86 --- /dev/null +++ b/evals/registry/data/00_scipaper_enzyme_km/auto_add.sh @@ -0,0 +1,26 @@ +#!/bin/bash +target_job=$1 +if [[ ${target_job} == "" ]] +then + echo ">>> Error: target_job is not define" + exit +fi +if [[ ! -f samples.jsonl ]] +then + touch samples.jsonl +fi +for paper in /root/uni-finder/enzyme/"${target_job}"/paper/*.pdf +do + echo "find file ${paper}" + file_name="${paper##*/}" + name=${file_name%.*} + key_word="" + key_word=$(grep "${name}" samples.jsonl) + if [[ ${key_word} == "" ]] + then + echo "add ${name} to jsonl" + sed 's|target_mark|'"${name}"'|g' sample_file | sed 's|target_Job|'"${target_job}"'|g' >> samples.jsonl + else + echo "${name}: was already in the jsonl" + fi +done diff --git a/evals/registry/data/00_scipaper_enzyme_km/sample_file b/evals/registry/data/00_scipaper_enzyme_km/sample_file new file mode 100644 index 0000000000..a218a4803b --- /dev/null +++ b/evals/registry/data/00_scipaper_enzyme_km/sample_file @@ -0,0 +1 @@ +{"file_name": "../uni-finder/enzyme/target_Job/paper/target_mark.pdf", "file_link": "https://dp-filetrans-bj.oss-cn-beijing.aliyuncs.com/changjunhan/target_mark.pdf", "answerfile_name": "../uni-finder/enzyme/target_Job/answer/target_mark.csv", 
"answerfile_link": "https://dp-filetrans-bj.oss-cn-beijing.aliyuncs.com/changjunhan/target_mark.csv", "compare_fields": ["Substrate", "SMILES", "Km value", "Km vmax", "organism"], "index": "Substrate"} diff --git a/evals/registry/data/00_scipaper_enzyme_km/samples.jsonl b/evals/registry/data/00_scipaper_enzyme_km/samples.jsonl new file mode 100644 index 0000000000..d699617478 --- /dev/null +++ b/evals/registry/data/00_scipaper_enzyme_km/samples.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d526411b82df30527d64c8d35afe4a09152d6669637fd876242f4ae0539c447e +size 510 From a2315db834200c66006b45b28e8ff7f63a5aa216 Mon Sep 17 00:00:00 2001 From: root <1700011741@pku.edu.cn> Date: Mon, 5 Feb 2024 22:33:00 +0800 Subject: [PATCH 3/4] add km --- ...00_scipaper_enzyme.yaml => 00_scipaper_enzyme_km.yaml} | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename evals/registry/evals/{00_scipaper_enzyme.yaml => 00_scipaper_enzyme_km.yaml} (87%) diff --git a/evals/registry/evals/00_scipaper_enzyme.yaml b/evals/registry/evals/00_scipaper_enzyme_km.yaml similarity index 87% rename from evals/registry/evals/00_scipaper_enzyme.yaml rename to evals/registry/evals/00_scipaper_enzyme_km.yaml index 1ab8ea7d40..9e5d4d0734 100644 --- a/evals/registry/evals/00_scipaper_enzyme.yaml +++ b/evals/registry/evals/00_scipaper_enzyme_km.yaml @@ -1,11 +1,11 @@ -scipaper_enzyme: - id: scipaper_enzyme.val.csv +scipaper_enzyme_km: + id: scipaper_enzyme_km.val.csv metrics: [accuracy] -scipaper_enzyme.val.csv: +scipaper_enzyme_km.val.csv: class: evals.elsuite.rag_table_extract:TableExtract args: - samples_jsonl: 00_scipaper_enzyme/samples.jsonl + samples_jsonl: 00_scipaper_enzyme_km/samples.jsonl instructions: | Please give a complete list of SMILES structures, Km values, Vmax values, target info (protein or cell line), and organism of all substrates in the paper. Usually the substrates' tags are numbers or IUPAC names. 1. 
Output in csv format, write units not in header but in the value like "10.5 µM". Quote the value if it has comma! For example: From c32d3fb083058bd3c5679e77e0696c02ed736396 Mon Sep 17 00:00:00 2001 From: TablewareBox <1700011741@pku.edu.cn> Date: Tue, 6 Feb 2024 06:20:12 +0800 Subject: [PATCH 4/4] add enzyme testset and fix data --- .../data/00_scipaper_enzyme/auto_add.sh | 15 ----------- .../data/00_scipaper_enzyme/sample_file | 1 - .../data/00_scipaper_enzyme/samples.jsonl | 3 --- .../auto_add.sh | 26 ------------------- .../sample_file | 1 - .../00_scipaper_enzyme_inhibitor/auto_add.sh | 26 ------------------- .../00_scipaper_enzyme_inhibitor/sample_file | 1 - .../data/00_scipaper_enzyme_km/auto_add.sh | 26 ------------------- .../data/00_scipaper_enzyme_km/sample_file | 1 - .../data/00_scipaper_enzyme_km/samples.jsonl | 3 --- .../auto_add.sh | 26 ------------------- .../sample_file | 1 - .../samples.jsonl | 3 +++ .../registry/eval_sets/chemistry_enzyme.yaml | 6 +++++ ...yaml => 00_scipaper_enzyme_substrate.yaml} | 8 +++--- 15 files changed, 13 insertions(+), 134 deletions(-) delete mode 100644 evals/registry/data/00_scipaper_enzyme/auto_add.sh delete mode 100644 evals/registry/data/00_scipaper_enzyme/sample_file delete mode 100644 evals/registry/data/00_scipaper_enzyme/samples.jsonl delete mode 100644 evals/registry/data/00_scipaper_enzyme_activate_compound/auto_add.sh delete mode 100644 evals/registry/data/00_scipaper_enzyme_activate_compound/sample_file delete mode 100644 evals/registry/data/00_scipaper_enzyme_inhibitor/auto_add.sh delete mode 100644 evals/registry/data/00_scipaper_enzyme_inhibitor/sample_file delete mode 100644 evals/registry/data/00_scipaper_enzyme_km/auto_add.sh delete mode 100644 evals/registry/data/00_scipaper_enzyme_km/sample_file delete mode 100644 evals/registry/data/00_scipaper_enzyme_km/samples.jsonl delete mode 100644 evals/registry/data/00_scipaper_enzyme_localization/auto_add.sh delete mode 100644 
evals/registry/data/00_scipaper_enzyme_localization/sample_file create mode 100644 evals/registry/data/00_scipaper_enzyme_substrate/samples.jsonl create mode 100644 evals/registry/eval_sets/chemistry_enzyme.yaml rename evals/registry/evals/{00_scipaper_enzyme_km.yaml => 00_scipaper_enzyme_substrate.yaml} (85%) diff --git a/evals/registry/data/00_scipaper_enzyme/auto_add.sh b/evals/registry/data/00_scipaper_enzyme/auto_add.sh deleted file mode 100644 index 4537d504c4..0000000000 --- a/evals/registry/data/00_scipaper_enzyme/auto_add.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -for paper in /root/uni-finder/enzyme/paper/*.pdf -do - file_name="${paper##*/}" - name=${file_name%.*} - key_word="" - key_word=$(grep "${name}" samples.jsonl) - if [[ ${key_word} == "" ]] - then - echo "add ${name} to jsonl" - sed 's|target_mark|'"${name}"'|g' sample_file >> samples.jsonl - else - echo "${name}: was already in the jsonl" - fi -done diff --git a/evals/registry/data/00_scipaper_enzyme/sample_file b/evals/registry/data/00_scipaper_enzyme/sample_file deleted file mode 100644 index 530f8260c0..0000000000 --- a/evals/registry/data/00_scipaper_enzyme/sample_file +++ /dev/null @@ -1 +0,0 @@ -{"file_name": "../uni-finder/enzyme/paper/target_mark.pdf", "file_link": "https://dp-filetrans-bj.oss-cn-beijing.aliyuncs.com/changjunhan/target_mark.pdf", "answerfile_name": "../uni-finder/enzyme/answer/target_mark.csv", "answerfile_link": "https://dp-filetrans-bj.oss-cn-beijing.aliyuncs.com/changjunhan/target_mark.csv", "compare_fields": [""], "index": ""} diff --git a/evals/registry/data/00_scipaper_enzyme/samples.jsonl b/evals/registry/data/00_scipaper_enzyme/samples.jsonl deleted file mode 100644 index 01eb9d337f..0000000000 --- a/evals/registry/data/00_scipaper_enzyme/samples.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2bf58abc2068a46085404efd770a640185ac65d831c49c3b98800a60ff62814b -size 4699 diff --git 
a/evals/registry/data/00_scipaper_enzyme_activate_compound/auto_add.sh b/evals/registry/data/00_scipaper_enzyme_activate_compound/auto_add.sh deleted file mode 100644 index f699a14a86..0000000000 --- a/evals/registry/data/00_scipaper_enzyme_activate_compound/auto_add.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -target_job=$1 -if [[ ${target_job} == "" ]] -then - echo ">>> Error: target_job is not define" - exit -fi -if [[ ! -f samples.jsonl ]] -then - touch samples.jsonl -fi -for paper in /root/uni-finder/enzyme/"${target_job}"/paper/*.pdf -do - echo "find file ${paper}" - file_name="${paper##*/}" - name=${file_name%.*} - key_word="" - key_word=$(grep "${name}" samples.jsonl) - if [[ ${key_word} == "" ]] - then - echo "add ${name} to jsonl" - sed 's|target_mark|'"${name}"'|g' sample_file | sed 's|target_Job|'"${target_job}"'|g' >> samples.jsonl - else - echo "${name}: was already in the jsonl" - fi -done diff --git a/evals/registry/data/00_scipaper_enzyme_activate_compound/sample_file b/evals/registry/data/00_scipaper_enzyme_activate_compound/sample_file deleted file mode 100644 index 95ce86b217..0000000000 --- a/evals/registry/data/00_scipaper_enzyme_activate_compound/sample_file +++ /dev/null @@ -1 +0,0 @@ -{"file_name": "../uni-finder/enzyme/target_Job/paper/target_mark.pdf", "file_link": "https://dp-filetrans-bj.oss-cn-beijing.aliyuncs.com/changjunhan/target_mark.pdf", "answerfile_name": "../uni-finder/enzyme/target_Job/answer/target_mark.csv", "answerfile_link": "https://dp-filetrans-bj.oss-cn-beijing.aliyuncs.com/changjunhan/target_mark.csv", "compare_fields": ["Activating Compound", "Comment", "Organism"], "index": "Activating Compound"} diff --git a/evals/registry/data/00_scipaper_enzyme_inhibitor/auto_add.sh b/evals/registry/data/00_scipaper_enzyme_inhibitor/auto_add.sh deleted file mode 100644 index f699a14a86..0000000000 --- a/evals/registry/data/00_scipaper_enzyme_inhibitor/auto_add.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -target_job=$1 -if [[ 
${target_job} == "" ]] -then - echo ">>> Error: target_job is not define" - exit -fi -if [[ ! -f samples.jsonl ]] -then - touch samples.jsonl -fi -for paper in /root/uni-finder/enzyme/"${target_job}"/paper/*.pdf -do - echo "find file ${paper}" - file_name="${paper##*/}" - name=${file_name%.*} - key_word="" - key_word=$(grep "${name}" samples.jsonl) - if [[ ${key_word} == "" ]] - then - echo "add ${name} to jsonl" - sed 's|target_mark|'"${name}"'|g' sample_file | sed 's|target_Job|'"${target_job}"'|g' >> samples.jsonl - else - echo "${name}: was already in the jsonl" - fi -done diff --git a/evals/registry/data/00_scipaper_enzyme_inhibitor/sample_file b/evals/registry/data/00_scipaper_enzyme_inhibitor/sample_file deleted file mode 100644 index 57bad47af5..0000000000 --- a/evals/registry/data/00_scipaper_enzyme_inhibitor/sample_file +++ /dev/null @@ -1 +0,0 @@ -{"file_name": "../uni-finder/enzyme/target_Job/paper/target_mark.pdf", "file_link": "https://dp-filetrans-bj.oss-cn-beijing.aliyuncs.com/changjunhan/target_mark.pdf", "answerfile_name": "../uni-finder/enzyme/target_Job/answer/target_mark.csv", "answerfile_link": "https://dp-filetrans-bj.oss-cn-beijing.aliyuncs.com/changjunhan/target_mark.csv", "compare_fields": ["Inhibitors", "Comment", "Organism"], "index": "Inhibitors"} diff --git a/evals/registry/data/00_scipaper_enzyme_km/auto_add.sh b/evals/registry/data/00_scipaper_enzyme_km/auto_add.sh deleted file mode 100644 index f699a14a86..0000000000 --- a/evals/registry/data/00_scipaper_enzyme_km/auto_add.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -target_job=$1 -if [[ ${target_job} == "" ]] -then - echo ">>> Error: target_job is not define" - exit -fi -if [[ ! 
-f samples.jsonl ]] -then - touch samples.jsonl -fi -for paper in /root/uni-finder/enzyme/"${target_job}"/paper/*.pdf -do - echo "find file ${paper}" - file_name="${paper##*/}" - name=${file_name%.*} - key_word="" - key_word=$(grep "${name}" samples.jsonl) - if [[ ${key_word} == "" ]] - then - echo "add ${name} to jsonl" - sed 's|target_mark|'"${name}"'|g' sample_file | sed 's|target_Job|'"${target_job}"'|g' >> samples.jsonl - else - echo "${name}: was already in the jsonl" - fi -done diff --git a/evals/registry/data/00_scipaper_enzyme_km/sample_file b/evals/registry/data/00_scipaper_enzyme_km/sample_file deleted file mode 100644 index a218a4803b..0000000000 --- a/evals/registry/data/00_scipaper_enzyme_km/sample_file +++ /dev/null @@ -1 +0,0 @@ -{"file_name": "../uni-finder/enzyme/target_Job/paper/target_mark.pdf", "file_link": "https://dp-filetrans-bj.oss-cn-beijing.aliyuncs.com/changjunhan/target_mark.pdf", "answerfile_name": "../uni-finder/enzyme/target_Job/answer/target_mark.csv", "answerfile_link": "https://dp-filetrans-bj.oss-cn-beijing.aliyuncs.com/changjunhan/target_mark.csv", "compare_fields": ["Substrate", "SMILES", "Km value", "Km vmax", "organism"], "index": "Substrate"} diff --git a/evals/registry/data/00_scipaper_enzyme_km/samples.jsonl b/evals/registry/data/00_scipaper_enzyme_km/samples.jsonl deleted file mode 100644 index d699617478..0000000000 --- a/evals/registry/data/00_scipaper_enzyme_km/samples.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d526411b82df30527d64c8d35afe4a09152d6669637fd876242f4ae0539c447e -size 510 diff --git a/evals/registry/data/00_scipaper_enzyme_localization/auto_add.sh b/evals/registry/data/00_scipaper_enzyme_localization/auto_add.sh deleted file mode 100644 index f699a14a86..0000000000 --- a/evals/registry/data/00_scipaper_enzyme_localization/auto_add.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -target_job=$1 -if [[ ${target_job} == "" ]] -then - echo ">>> Error: target_job 
is not define" - exit -fi -if [[ ! -f samples.jsonl ]] -then - touch samples.jsonl -fi -for paper in /root/uni-finder/enzyme/"${target_job}"/paper/*.pdf -do - echo "find file ${paper}" - file_name="${paper##*/}" - name=${file_name%.*} - key_word="" - key_word=$(grep "${name}" samples.jsonl) - if [[ ${key_word} == "" ]] - then - echo "add ${name} to jsonl" - sed 's|target_mark|'"${name}"'|g' sample_file | sed 's|target_Job|'"${target_job}"'|g' >> samples.jsonl - else - echo "${name}: was already in the jsonl" - fi -done diff --git a/evals/registry/data/00_scipaper_enzyme_localization/sample_file b/evals/registry/data/00_scipaper_enzyme_localization/sample_file deleted file mode 100644 index ec5ce35204..0000000000 --- a/evals/registry/data/00_scipaper_enzyme_localization/sample_file +++ /dev/null @@ -1 +0,0 @@ -{"file_name": "../uni-finder/enzyme/target_Job/paper/target_mark.pdf", "file_link": "https://dp-filetrans-bj.oss-cn-beijing.aliyuncs.com/changjunhan/target_mark.pdf", "answerfile_name": "../uni-finder/enzyme/target_Job/answer/target_mark.csv", "answerfile_link": "https://dp-filetrans-bj.oss-cn-beijing.aliyuncs.com/changjunhan/target_mark.csv", "compare_fields": [ "Localization", "Organism"], "index":"Localization"} diff --git a/evals/registry/data/00_scipaper_enzyme_substrate/samples.jsonl b/evals/registry/data/00_scipaper_enzyme_substrate/samples.jsonl new file mode 100644 index 0000000000..5cf918e3d3 --- /dev/null +++ b/evals/registry/data/00_scipaper_enzyme_substrate/samples.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6316846852a855013f98ee678e945582013c1269fcad311c8e933859ade77c68 +size 1919 diff --git a/evals/registry/eval_sets/chemistry_enzyme.yaml b/evals/registry/eval_sets/chemistry_enzyme.yaml new file mode 100644 index 0000000000..7a9b3525df --- /dev/null +++ b/evals/registry/eval_sets/chemistry_enzyme.yaml @@ -0,0 +1,6 @@ +chemistry_enzyme: + evals: + - scipaper_enzyme_substrate + - scipaper_enzyme_activate_compound 
+ - scipaper_enzyme_inhibitor + - scipaper_enzyme_localization \ No newline at end of file diff --git a/evals/registry/evals/00_scipaper_enzyme_km.yaml b/evals/registry/evals/00_scipaper_enzyme_substrate.yaml similarity index 85% rename from evals/registry/evals/00_scipaper_enzyme_km.yaml rename to evals/registry/evals/00_scipaper_enzyme_substrate.yaml index 9e5d4d0734..b266b07e05 100644 --- a/evals/registry/evals/00_scipaper_enzyme_km.yaml +++ b/evals/registry/evals/00_scipaper_enzyme_substrate.yaml @@ -1,11 +1,11 @@ -scipaper_enzyme_km: - id: scipaper_enzyme_km.val.csv +scipaper_enzyme_substrate: + id: scipaper_enzyme_substrate.val.csv metrics: [accuracy] -scipaper_enzyme_km.val.csv: +scipaper_enzyme_substrate.val.csv: class: evals.elsuite.rag_table_extract:TableExtract args: - samples_jsonl: 00_scipaper_enzyme_km/samples.jsonl + samples_jsonl: 00_scipaper_enzyme_substrate/samples.jsonl instructions: | Please give a complete list of SMILES structures, Km values, Vmax values, target info (protein or cell line), and organism of all substrates in the paper. Usually the substrates' tags are numbers or IUPAC names. 1. Output in csv format, write units not in header but in the value like "10.5 µM". Quote the value if it has comma! For example: