1 change: 1 addition & 0 deletions .gitignore
@@ -10,3 +10,4 @@ wheels/
.venv

.vscode/
.env
83 changes: 83 additions & 0 deletions codemeticulous/ai_convert.py
@@ -0,0 +1,83 @@
import litellm
import logging
import re
import json
from dotenv import load_dotenv
from pydantic import BaseModel, ValidationError
from codemeticulous.standards import STANDARDS
from codemeticulous.prompt_strategies import DefaultPrompt
from codemeticulous.summarize_schema import check_schema

logging.basicConfig(level=logging.INFO)
load_dotenv()

# Uncomment the line below for additional LiteLLM debug output
# litellm._turn_on_debug()


def extract_json(llm_output: str) -> dict:
    # Try to extract JSON from a markdown code block, discarding the fence
    json_match = re.search(r'```json\s*(.*?)\s*```', llm_output, re.DOTALL)

    if json_match:
        json_str = json_match.group(1)
    else:
        # No code block found; assume the whole string is a JSON object
        json_str = llm_output
    return json.loads(json_str)


def structured_completion(llm_model: str, messages: list, target_model: type[BaseModel]) -> BaseModel | None:
    try:
        response = litellm.completion(
            model=llm_model,
            messages=messages,
        )
        output = extract_json(response.choices[0].message.content)
        # logging.info(output)
    except Exception as e:
        logging.error(f"ERROR: structured output failed: {e}")
        raise

    try:
        return target_model(**output)
    except ValidationError as e:  # TODO: retry the LLM call when validation errors occur
        logging.error(f"Pydantic validation error: {e}")
        raise


def convert_ai(llm_model: str, source_format: str, target_format: str, source_data):
    """
    Automate metadata standard conversion using an LLM and the standards' Pydantic schemas.

    Args:
    - llm_model: LLM model string (e.g., "openrouter/openai/gpt-4o").
    - source_format: name of the source metadata standard.
    - target_format: name of the target metadata standard.
    - source_data: dict or pydantic.BaseModel instance representing the source metadata.
    """

    # Build prompt messages using pydantic schemas and the source data
    source_model = STANDARDS[source_format]["model"]
    target_model = STANDARDS[target_format]["model"]

    # Create a pydantic model instance of the source data
    if isinstance(source_data, dict):
        source_instance = source_model(**source_data)
    elif isinstance(source_data, source_model):
        source_instance = source_data
    else:
        raise TypeError(f"source_data must be a dict or {source_model.__name__} instance")

    # Create a summarized schema of the source pydantic model, limited to fields present in the instance
    source_schema_dict = check_schema(source_format, llm_model, source_instance)
    source_schema = json.dumps(source_schema_dict, indent=2)

    target_schema = check_schema(target_format, llm_model)
    target_schema = json.dumps(target_schema, indent=2)
    # logging.info(target_schema)

    strategy = DefaultPrompt()
    messages = strategy.generate_system_prompt(source_instance, source_schema, target_schema)
    target_data = structured_completion(llm_model, messages, target_model)

    return target_data
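For reviewers, here is a minimal usage sketch of convert_ai (not part of the diff). The model string follows the docstring example, the CodeMeta record is a made-up placeholder that may need more fields to validate against the CodeMeta model, and a provider API key is assumed to be loaded from .env.

from codemeticulous.ai_convert import convert_ai

# Placeholder CodeMeta record; a real record may require additional fields to validate
codemeta_record = {
    "@context": "https://w3id.org/codemeta/3.0",
    "@type": "SoftwareSourceCode",
    "name": "example-project",
}

# Returns a validated CitationFileFormat instance, or raises on LLM/validation errors
cff_instance = convert_ai(
    llm_model="openrouter/openai/gpt-4o",
    source_format="codemeta",
    target_format="cff",
    source_data=codemeta_record,
)
print(cff_instance.json())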
77 changes: 75 additions & 2 deletions codemeticulous/cli.py
@@ -5,7 +5,7 @@
import yaml

from codemeticulous.convert import STANDARDS, convert as _convert

from codemeticulous.ai_convert import convert_ai as _convert_ai

@click.group()
def cli():
@@ -104,6 +104,79 @@ def validate(format_name, input_file, verbose):
            traceback.print_exc()


@cli.command()
@click.option(
    "-m",
    "--model",
    "llm_model",
    type=str,
    required=True,
    help="LLM model to use for conversion (e.g., 'openrouter/openai/gpt-4o')",
)
@click.option(
    "-f",
    "--from",
    "source_format",
    type=click.Choice(STANDARDS.keys()),
    required=True,
    help="Source format",
)
@click.option(
    "-t",
    "--to",
    "target_format",
    type=click.Choice(STANDARDS.keys()),
    required=True,
    help="Target format",
)
@click.option(
    "-o",
    "--output",
    "output_file",
    type=click.File("w"),
    default=None,
    help="Output file name (by default prints to stdout)",
)
@click.option(
    "-v",
    "--verbose",
    is_flag=True,
    default=False,
    help="Print verbose output",
)
@click.argument("input_file", type=click.Path(exists=True))
def ai_convert(llm_model: str, source_format: str, target_format: str, input_file, output_file, verbose):
    """Convert metadata between standards using an LLM."""
    try:
        input_data = load_file_autodetect(input_file)
    except Exception as e:
        click.echo(f"Failed to load file: {input_file}. {str(e)}", err=True)
        if verbose:
            traceback.print_exc()
        return

    try:
        converted_data = _convert_ai(llm_model, source_format, target_format, input_data)
    except Exception as e:
        click.echo(f"Error during AI-assisted conversion: {str(e)}", err=True)
        if verbose:
            traceback.print_exc()
        return

    output_format = STANDARDS[target_format]["format"]

    try:
        output_data = dump_data(converted_data, output_format)
    except Exception as e:
        click.echo(f"Error during serialization: {str(e)}", err=True)
        if verbose:
            traceback.print_exc()
        return

    if output_file:
        output_file.write(output_data)
        click.echo(f"Data written to {output_file.name}")
    else:
        click.echo(output_data)


def dump_data(data, format):
    if format == "json":
        return data.json()
@@ -136,4 +209,4 @@ def load_file_autodetect(file_path):
        else:
            raise ValueError(f"Unsupported file extension: {ext}.")
    except Exception as e:
        raise ValueError(f"Failed to load file: {file_path}. {str(e)}")
        raise ValueError(f"Failed to load file: {file_path}. {str(e)}")
23 changes: 1 addition & 22 deletions codemeticulous/convert.py
@@ -4,28 +4,7 @@
from codemeticulous.codemeta.convert import canonical_to_codemeta, codemeta_to_canonical
from codemeticulous.datacite.convert import canonical_to_datacite, datacite_to_canonical
from codemeticulous.cff.convert import canonical_to_cff, cff_to_canonical


STANDARDS = {
    "codemeta": {
        "model": CodeMeta,
        "format": "json",
        "to_canonical": codemeta_to_canonical,
        "from_canonical": canonical_to_codemeta,
    },
    "datacite": {
        "model": DataCite,
        "format": "json",
        "to_canonical": datacite_to_canonical,
        "from_canonical": canonical_to_datacite,
    },
    "cff": {
        "model": CitationFileFormat,
        "format": "yaml",
        "to_canonical": cff_to_canonical,
        "from_canonical": canonical_to_cff,
    },
}
from codemeticulous.standards import STANDARDS


def to_canonical(source_format: str, source_data):
39 changes: 39 additions & 0 deletions codemeticulous/prompt_strategies.py
@@ -0,0 +1,39 @@
from abc import ABC, abstractmethod


class PromptStrategy(ABC):
    @abstractmethod
    def generate_system_prompt(self, source_instance, source_schema, target_schema) -> list:
        pass


class DefaultPrompt(PromptStrategy):
    PROMPT = """
    Your task is to convert source metadata from one format to another using the provided schemas.

    INPUTS PROVIDED:
    - Source data: A Pydantic model instance containing the original metadata
    - Source schema: A JSON object containing the source Pydantic model's fields and descriptions
    - Target schema: The Pydantic model definition for the output format

    INSTRUCTIONS:
    1. Analyze the source data and understand its structure.
    2. Extract and map the relevant fields from the source data to the corresponding fields in the target format.
    3. Transform data types and structures as needed to match the target schema; mappings may be one-to-one or require more complex transformations.
    4. Produce the mapped and transformed data so that a new instance of the target Pydantic model can be created from it.

    OUTPUT REQUIREMENTS:
    - Return ONLY raw JSON, without any further encoding such as escaped quotes.
    - Ensure all required fields in the target schema are populated.
    - Use appropriate data types as defined in the target schema.

    The final output must be an instance of the target model schema that can be successfully validated by Pydantic.
    """

    def generate_system_prompt(self, source_instance, source_schema, target_schema) -> list:
        return [
            {"role": "system", "content": self.PROMPT},
            {"role": "user", "content": "SOURCE_DATA:\n" + source_instance.json()},
            {"role": "user", "content": "SOURCE_SCHEMA:\n" + source_schema},
            {"role": "user", "content": "TARGET_SCHEMA:\n" + target_schema},
        ]
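Because the prompt is encapsulated behind PromptStrategy, an alternative prompting style can be plugged in by instantiating a different strategy in convert_ai. A hypothetical sketch (the class name, system text, and worked example are illustrative, not part of this PR):

class FewShotPrompt(PromptStrategy):
    # Hypothetical strategy that prepends a tiny worked example before the real inputs
    SYSTEM = "Convert the source metadata to the target schema and return only raw JSON."

    def generate_system_prompt(self, source_instance, source_schema, target_schema) -> list:
        return [
            {"role": "system", "content": self.SYSTEM},
            {"role": "user", "content": 'EXAMPLE:\n{"name": "demo"} -> {"title": "demo"}'},
            {"role": "user", "content": "SOURCE_DATA:\n" + source_instance.json()},
            {"role": "user", "content": "SOURCE_SCHEMA:\n" + source_schema},
            {"role": "user", "content": "TARGET_SCHEMA:\n" + target_schema},
        ]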
30 changes: 30 additions & 0 deletions codemeticulous/standards.py
@@ -0,0 +1,30 @@
from codemeticulous.codemeta.models import CodeMeta
from codemeticulous.datacite.models import DataCite
from codemeticulous.cff.models import CitationFileFormat
from codemeticulous.codemeta.convert import canonical_to_codemeta, codemeta_to_canonical
from codemeticulous.datacite.convert import canonical_to_datacite, datacite_to_canonical
from codemeticulous.cff.convert import canonical_to_cff, cff_to_canonical

STANDARDS = {
    "codemeta": {
        "model": CodeMeta,
        "format": "json",
        "to_canonical": codemeta_to_canonical,
        "from_canonical": canonical_to_codemeta,
        "schema": None,
    },
    "datacite": {
        "model": DataCite,
        "format": "json",
        "to_canonical": datacite_to_canonical,
        "from_canonical": canonical_to_datacite,
        "schema": "schema/datacite/schema46.json",
    },
    "cff": {
        "model": CitationFileFormat,
        "format": "yaml",
        "to_canonical": cff_to_canonical,
        "from_canonical": canonical_to_cff,
        "schema": "schema/cff/1.2.0/schema.json",
    },
}
91 changes: 91 additions & 0 deletions codemeticulous/summarize_schema.py
@@ -0,0 +1,91 @@
import re
import csv
import json
import logging
import litellm
from pydantic import BaseModel
from pathlib import Path
from codemeticulous.standards import STANDARDS

def generate_desc(model_name: str, data, llm_model: str) -> str:
    prompt = f"""
    For a Pydantic model '{model_name}', we have a list of lists, each containing a field name and its field type.

    In one or two sentences, provide a brief description of each field in relation to the model, appended to the end of each sub-list.
    Your response should be a valid JSON array consisting of the field name, field type, and new description for each field.
    Do not include explanatory text or unnecessary formatting syntax, so that your response can be piped into 'json.loads()'.

    Here is the data:
    {data}
    """

    try:
        response = litellm.completion(
            messages=[{"role": "user", "content": prompt}],
            model=llm_model,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        logging.error(f"ERROR: failed to generate field descriptions: {e}")
        raise


def check_schema(model: str, llm_model: str, instance_data: BaseModel = None):
    pydantic_model = STANDARDS[model]["model"]
    schema_file = STANDARDS[model]["schema"]

    if schema_file is not None:
        # A published JSON schema exists for this standard; load it directly
        with open(schema_file, 'r') as f:
            schema = json.load(f)

        # TODO: prune the schema against the fields present in instance_data
        return schema

    else:
        # No published schema: check the schema_cache directory and return the cached data if it exists
        cache_directory = Path(__file__).parent.parent / "schema_cache"
        cache_directory.mkdir(parents=True, exist_ok=True)  # make sure the cache directory exists
        file = cache_directory / f"{model}.csv"

        if file.exists():
            with open(file, 'r') as f:
                reader = csv.reader(f)
                next(reader)  # skip the header row
                field_descriptions = [row for row in reader]

            return {
                "model_name": model,
                "fields": field_descriptions,
            }

        else:
            # No cache yet: use the LLM to generate a CSV of schema information
            fields = []

            for field_name, model_field in pydantic_model.__fields__.items():
                # If there is an instance and the field is not set on it, skip the field
                if instance_data is not None and getattr(instance_data, field_name) is None:
                    continue
                field_type = model_field.annotation
                fields.append([field_name, field_type])

            llm_response = generate_desc(pydantic_model.__name__, fields, llm_model)
            # Clean up the LLM response by extracting the first JSON object or array
            match = re.search(r'\{.*\}|\[.*\]', llm_response, re.DOTALL)

            if match:
                llm_response = match.group(0)

            try:
                field_descriptions = json.loads(llm_response)

                # Write the final schema information to a CSV so it can be reused on later runs
                with open(file, "w", newline='') as f:
                    writer = csv.writer(f)
                    writer.writerow(["Field Name", "Field Type", "Description"])
                    writer.writerows(field_descriptions)

                return {
                    "model_name": model,
                    "fields": field_descriptions,
                }
            except Exception as e:
                logging.error(f"ERROR: failed to create a list from the LLM response: {e}")
                raise
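For context, the summarized schema that check_schema returns for a standard without a published JSON schema (and caches to schema_cache/<model>.csv) looks roughly like the sketch below; the field names, types, and descriptions are made-up placeholders rather than actual CodeMeta output.

# Roughly the shape returned (and cached) for a standard whose "schema" entry is None
summarized_schema = {
    "model_name": "codemeta",
    "fields": [
        # [field name, field type, LLM-generated description]
        ["name", "Optional[str]", "Human-readable name of the software."],
        ["version", "Optional[str]", "Version identifier of the software release."],
    ],
}
# The same rows are written to schema_cache/codemeta.csv under the header
# "Field Name", "Field Type", "Description" so later runs can skip the LLM call.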
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -6,6 +6,8 @@ readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"click>=8.1.7",
"instructor>=1.13.0",
"litellm>=1.77.1",
"pydantic2-schemaorg==0.2.0",
"pydantic>=2.9.2",
"pyyaml>=6.0.2",
@@ -30,4 +32,4 @@ include-package-data = false

[tool.setuptools.packages.find]
include = ["codemeticulous", "codemeticulous.*"]
exclude = ["tests*", "schema"]
exclude = ["tests*", "schema"]