1 change: 1 addition & 0 deletions .gitignore
@@ -10,3 +10,4 @@ wheels/
.venv

.vscode/
.env
83 changes: 83 additions & 0 deletions codemeticulous/ai_convert.py
@@ -0,0 +1,83 @@
import litellm
import logging
import re
import json
from dotenv import load_dotenv
from pydantic import BaseModel, ValidationError
from codemeticulous.standards import STANDARDS
from codemeticulous.prompt_strategies import DefaultPrompt
from codemeticulous.summarize_schema import check_schema

logging.basicConfig(level=logging.INFO)
load_dotenv()

# Uncomment the line below for additional LiteLLM debug output
# litellm._turn_on_debug()


def extract_json(llm_output: str) -> dict:
    # Try to extract JSON from a markdown code block, discarding the fence
    json_match = re.search(r'```json\s*(.*?)\s*```', llm_output, re.DOTALL)

    if json_match:
        json_str = json_match.group(1)
    else:
        # No code block found; assume the whole string is a JSON object
        json_str = llm_output
    return json.loads(json_str)


def structured_completion(llm_model: str, messages: list, target_model: type[BaseModel]) -> BaseModel | None:
    try:
        response = litellm.completion(
            model=llm_model,
            messages=messages,
        )
        output = extract_json(response.choices[0].message.content)
        # logging.info(output)
    except Exception as e:
        logging.error(f"ERROR: structured output failed: {e}")
        raise

    try:
        return target_model(**output)
    except ValidationError as e:  # TODO: retry the LLM call when validation errors occur
        logging.error(f"Pydantic validation error: {e}")
        raise


def convert_ai(llm_model: str, source_format: str, target_format: str, source_data):
    """
    Automate metadata standard conversion using an LLM and the standards' Pydantic schemas.

    Args:
    - llm_model: LLM model string (e.g., "openrouter/openai/gpt-4o").
    - source_format: name of the source metadata standard.
    - target_format: name of the target metadata standard.
    - source_data: dict or pydantic.BaseModel instance representing the source metadata.
    """

    # Build prompt messages using pydantic schemas and the source data
    source_model = STANDARDS[source_format]["model"]
    target_model = STANDARDS[target_format]["model"]

    # Create a pydantic model instance of the source data
    if isinstance(source_data, dict):
        source_instance = source_model(**source_data)
    elif isinstance(source_data, source_model):
        source_instance = source_data
    else:
        raise TypeError(f"source_data must be a dict or {source_model.__name__} instance")

    # Create a summarized schema of the source pydantic model, limited to fields present in the instance
    source_schema_dict = check_schema(source_format, llm_model, source_instance)
    source_schema = json.dumps(source_schema_dict, indent=2)

    target_schema = check_schema(target_format, llm_model)
    target_schema = json.dumps(target_schema, indent=2)
    # logging.info(target_schema)

    strategy = DefaultPrompt()
    messages = strategy.generate_system_prompt(source_instance, source_schema, target_schema)
    target_data = structured_completion(llm_model, messages, target_model)

    return target_data
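For reviewers, here is a minimal usage sketch of convert_ai (not part of the diff). The model string follows the docstring example, the CodeMeta record is a made-up placeholder that may need more fields to validate against the CodeMeta model, and a provider API key is assumed to be loaded from .env.

from codemeticulous.ai_convert import convert_ai

# Placeholder CodeMeta record; a real record may require additional fields to validate
codemeta_record = {
    "@context": "https://w3id.org/codemeta/3.0",
    "@type": "SoftwareSourceCode",
    "name": "example-project",
}

# Returns a validated CitationFileFormat instance, or raises on LLM/validation errors
cff_instance = convert_ai(
    llm_model="openrouter/openai/gpt-4o",
    source_format="codemeta",
    target_format="cff",
    source_data=codemeta_record,
)
print(cff_instance.json())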
77 changes: 75 additions & 2 deletions codemeticulous/cli.py
@@ -5,7 +5,7 @@
import yaml

from codemeticulous.convert import STANDARDS, convert as _convert

from codemeticulous.ai_convert import convert_ai as _convert_ai

@click.group()
def cli():
@@ -104,6 +104,79 @@ def validate(format_name, input_file, verbose):
            traceback.print_exc()


@cli.command()
@click.option(
    "-m",
    "--model",
    "llm_model",
    type=str,
    required=True,
    help="LLM model to use for conversion (e.g., 'openrouter/openai/gpt-4o')",
)
@click.option(
    "-f",
    "--from",
    "source_format",
    type=click.Choice(STANDARDS.keys()),
    required=True,
    help="Source format",
)
@click.option(
    "-t",
    "--to",
    "target_format",
    type=click.Choice(STANDARDS.keys()),
    required=True,
    help="Target format",
)
@click.option(
    "-o",
    "--output",
    "output_file",
    type=click.File("w"),
    default=None,
    help="Output file name (by default prints to stdout)",
)
@click.option(
    "-v",
    "--verbose",
    is_flag=True,
    default=False,
    help="Print verbose output",
)
@click.argument("input_file", type=click.Path(exists=True))
def ai_convert(llm_model: str, source_format: str, target_format: str, input_file, output_file, verbose):
    """Convert metadata between standards using an LLM."""
    try:
        input_data = load_file_autodetect(input_file)
    except Exception as e:
        click.echo(f"Failed to load file: {input_file}. {str(e)}", err=True)
        if verbose:
            traceback.print_exc()
        return

    try:
        converted_data = _convert_ai(llm_model, source_format, target_format, input_data)
    except Exception as e:
        click.echo(f"Error during AI-assisted conversion: {str(e)}", err=True)
        if verbose:
            traceback.print_exc()
        return

    output_format = STANDARDS[target_format]["format"]

    try:
        output_data = dump_data(converted_data, output_format)
    except Exception as e:
        click.echo(f"Error during serialization: {str(e)}", err=True)
        if verbose:
            traceback.print_exc()
        return

    if output_file:
        output_file.write(output_data)
        click.echo(f"Data written to {output_file.name}")
    else:
        click.echo(output_data)


def dump_data(data, format):
    if format == "json":
        return data.json()
@@ -136,4 +209,4 @@ def load_file_autodetect(file_path):
        else:
            raise ValueError(f"Unsupported file extension: {ext}.")
    except Exception as e:
        raise ValueError(f"Failed to load file: {file_path}. {str(e)}")
        raise ValueError(f"Failed to load file: {file_path}. {str(e)}")
23 changes: 1 addition & 22 deletions codemeticulous/convert.py
@@ -4,28 +4,7 @@
from codemeticulous.codemeta.convert import canonical_to_codemeta, codemeta_to_canonical
from codemeticulous.datacite.convert import canonical_to_datacite, datacite_to_canonical
from codemeticulous.cff.convert import canonical_to_cff, cff_to_canonical


STANDARDS = {
    "codemeta": {
        "model": CodeMeta,
        "format": "json",
        "to_canonical": codemeta_to_canonical,
        "from_canonical": canonical_to_codemeta,
    },
    "datacite": {
        "model": DataCite,
        "format": "json",
        "to_canonical": datacite_to_canonical,
        "from_canonical": canonical_to_datacite,
    },
    "cff": {
        "model": CitationFileFormat,
        "format": "yaml",
        "to_canonical": cff_to_canonical,
        "from_canonical": canonical_to_cff,
    },
}
from codemeticulous.standards import STANDARDS


def to_canonical(source_format: str, source_data):
39 changes: 39 additions & 0 deletions codemeticulous/prompt_strategies.py
@@ -0,0 +1,39 @@
from abc import ABC, abstractmethod


class PromptStrategy(ABC):
    @abstractmethod
    def generate_system_prompt(self, source_instance, source_schema, target_schema) -> list:
        pass


class DefaultPrompt(PromptStrategy):
    PROMPT = """
    Your task is to convert source metadata from one format to another using the provided schemas.

    INPUTS PROVIDED:
    - Source data: A Pydantic model instance containing the original metadata
    - Source schema: A JSON object containing the source Pydantic model's fields and descriptions
    - Target schema: The Pydantic model definition for the output format

    INSTRUCTIONS:
    1. Analyze the source data and understand its structure.
    2. Extract and map the relevant fields from the source data to the corresponding fields in the target format.
    3. Transform data types and structures as needed to match the target schema; mappings may be one-to-one or require more complex transformations.
    4. Produce the mapped and transformed data so that a new instance of the target Pydantic model can be created from it.

    OUTPUT REQUIREMENTS:
    - Return ONLY raw JSON, without any further encoding such as escaped quotes.
    - Ensure all required fields in the target schema are populated.
    - Use appropriate data types as defined in the target schema.

    The final output must be an instance of the target model schema that can be successfully validated by Pydantic.
    """

    def generate_system_prompt(self, source_instance, source_schema, target_schema) -> list:
        return [
            {"role": "system", "content": self.PROMPT},
            {"role": "user", "content": "SOURCE_DATA:\n" + source_instance.json()},
            {"role": "user", "content": "SOURCE_SCHEMA:\n" + source_schema},
            {"role": "user", "content": "TARGET_SCHEMA:\n" + target_schema},
        ]
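Because the prompt is encapsulated behind PromptStrategy, an alternative prompting style can be plugged in by instantiating a different strategy in convert_ai. A hypothetical sketch (the class name, system text, and worked example are illustrative, not part of this PR):

class FewShotPrompt(PromptStrategy):
    # Hypothetical strategy that prepends a tiny worked example before the real inputs
    SYSTEM = "Convert the source metadata to the target schema and return only raw JSON."

    def generate_system_prompt(self, source_instance, source_schema, target_schema) -> list:
        return [
            {"role": "system", "content": self.SYSTEM},
            {"role": "user", "content": 'EXAMPLE:\n{"name": "demo"} -> {"title": "demo"}'},
            {"role": "user", "content": "SOURCE_DATA:\n" + source_instance.json()},
            {"role": "user", "content": "SOURCE_SCHEMA:\n" + source_schema},
            {"role": "user", "content": "TARGET_SCHEMA:\n" + target_schema},
        ]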
30 changes: 30 additions & 0 deletions codemeticulous/standards.py
@@ -0,0 +1,30 @@
from codemeticulous.codemeta.models import CodeMeta
from codemeticulous.datacite.models import DataCite
from codemeticulous.cff.models import CitationFileFormat
from codemeticulous.codemeta.convert import canonical_to_codemeta, codemeta_to_canonical
from codemeticulous.datacite.convert import canonical_to_datacite, datacite_to_canonical
from codemeticulous.cff.convert import canonical_to_cff, cff_to_canonical

STANDARDS = {
    "codemeta": {
        "model": CodeMeta,
        "format": "json",
        "to_canonical": codemeta_to_canonical,
        "from_canonical": canonical_to_codemeta,
        "schema": None,
    },
    "datacite": {
        "model": DataCite,
        "format": "json",
        "to_canonical": datacite_to_canonical,
        "from_canonical": canonical_to_datacite,
        "schema": "schema/datacite/schema46.json",
    },
    "cff": {
        "model": CitationFileFormat,
        "format": "yaml",
        "to_canonical": cff_to_canonical,
        "from_canonical": canonical_to_cff,
        "schema": "schema/cff/1.2.0/schema.json",
    },
}
91 changes: 91 additions & 0 deletions codemeticulous/summarize_schema.py
@@ -0,0 +1,91 @@
import re
import csv
import json
import logging
import litellm
from pydantic import BaseModel
from pathlib import Path
from codemeticulous.standards import STANDARDS

def generate_desc(model_name: str, data, llm_model: str) -> str:
    prompt = f"""
    For a Pydantic model '{model_name}', we have a list of lists, each containing a field name and its field type.

    In one or two sentences, provide a brief description of each field in relation to the model, appended to the end of each sub-list.
    Your response should be a valid JSON array consisting of the field name, field type, and new description for each field.
    Do not include explanatory text or unnecessary formatting syntax, so that your response can be piped into 'json.loads()'.

    Here is the data:
    {data}
    """

    try:
        response = litellm.completion(
            messages=[{"role": "user", "content": prompt}],
            model=llm_model,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        logging.error(f"ERROR: failed to generate field descriptions: {e}")
        raise


def check_schema(model: str, llm_model: str, instance_data: BaseModel = None):
    pydantic_model = STANDARDS[model]["model"]
    schema_file = STANDARDS[model]["schema"]

    if schema_file is not None:
        # A published JSON schema exists for this standard; load it directly
        with open(schema_file, 'r') as f:
            schema = json.load(f)

        # TODO: prune the schema against the fields present in instance_data
        return schema

    else:
        # No published schema: check the schema_cache directory and return the cached data if it exists
        cache_directory = Path(__file__).parent.parent / "schema_cache"
        cache_directory.mkdir(parents=True, exist_ok=True)  # make sure the cache directory exists
        file = cache_directory / f"{model}.csv"

        if file.exists():
            with open(file, 'r') as f:
                reader = csv.reader(f)
                next(reader)  # skip the header row
                field_descriptions = [row for row in reader]

            return {
                "model_name": model,
                "fields": field_descriptions,
            }

        else:
            # No cache yet: use the LLM to generate a CSV of schema information
            fields = []

            for field_name, model_field in pydantic_model.__fields__.items():
                # If there is an instance and the field is not set on it, skip the field
                if instance_data is not None and getattr(instance_data, field_name) is None:
                    continue
                field_type = model_field.annotation
                fields.append([field_name, field_type])

            llm_response = generate_desc(pydantic_model.__name__, fields, llm_model)
            # Clean up the LLM response by extracting the first JSON object or array
            match = re.search(r'\{.*\}|\[.*\]', llm_response, re.DOTALL)

            if match:
                llm_response = match.group(0)

            try:
                field_descriptions = json.loads(llm_response)

                # Write the final schema information to a CSV so it can be reused on later runs
                with open(file, "w", newline='') as f:
                    writer = csv.writer(f)
                    writer.writerow(["Field Name", "Field Type", "Description"])
                    writer.writerows(field_descriptions)

                return {
                    "model_name": model,
                    "fields": field_descriptions,
                }
            except Exception as e:
                logging.error(f"ERROR: failed to create a list from the LLM response: {e}")
                raise
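For context, the summarized schema that check_schema returns for a standard without a published JSON schema (and caches to schema_cache/<model>.csv) looks roughly like the sketch below; the field names, types, and descriptions are made-up placeholders rather than actual CodeMeta output.

# Roughly the shape returned (and cached) for a standard whose "schema" entry is None
summarized_schema = {
    "model_name": "codemeta",
    "fields": [
        # [field name, field type, LLM-generated description]
        ["name", "Optional[str]", "Human-readable name of the software."],
        ["version", "Optional[str]", "Version identifier of the software release."],
    ],
}
# The same rows are written to schema_cache/codemeta.csv under the header
# "Field Name", "Field Type", "Description" so later runs can skip the LLM call.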
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -6,6 +6,8 @@ readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"click>=8.1.7",
"instructor>=1.13.0",
"litellm>=1.77.1",
"pydantic2-schemaorg==0.2.0",
"pydantic>=2.9.2",
"pyyaml>=6.0.2",
@@ -30,4 +32,4 @@ include-package-data = false

[tool.setuptools.packages.find]
include = ["codemeticulous", "codemeticulous.*"]
exclude = ["tests*", "schema"]
exclude = ["tests*", "schema"]