Use the git_diff_context helper and check the token size in the review and commit commands.

TechNickAI · TechNickAI · commit 37612b882ac8 · 2023-06-26T16:50:20.000-06:00
diff --git a/aicodebot/cli.py b/aicodebot/cli.py
@@ -1,5 +1,5 @@
 from aicodebot import version as aicodebot_version
-from aicodebot.helpers import exec_and_get_output
+from aicodebot.helpers import exec_and_get_output, get_token_length, git_diff_context
 from dotenv import load_dotenv
 from langchain.chains import LLMChain
 from langchain.chat_models import ChatOpenAI
@@ -81,10 +81,10 @@ def alignment(verbose):
 
 @cli.command()
 @click.option("-v", "--verbose", count=True)
-@click.option("-t", "--max-tokens", type=int, default=250)
+@click.option("-t", "--response-token-size", type=int, default=250)
 @click.option("-y", "--yes", is_flag=True, default=False, help="Don't ask for confirmation before committing.")
 @click.option("--skip-pre-commit", is_flag=True, help="Skip running pre-commit (otherwise run it if it is found).")
-def commit(verbose, max_tokens, yes, skip_pre_commit):
+def commit(verbose, response_token_size, yes, skip_pre_commit):
     """Generate a git commit message and commit changes after you approve."""
     setup_environment()
 
@@ -99,36 +99,45 @@ def commit(verbose, max_tokens, yes, skip_pre_commit):
     # Load the prompt
     prompt = load_prompt(Path(__file__).parent / "prompts" / "commit_message.yaml")
 
-    # Set up the language model
-    llm = OpenAI(temperature=0.1, max_tokens=max_tokens)
-
-    # Set up the chain
-    chain = LLMChain(llm=llm, prompt=prompt, verbose=verbose)
-
     # Get the changes from git
     staged_files = exec_and_get_output(["git", "diff", "--name-only", "--cached"])
-    base_git_diff = ["git", "diff", "-U10"]  # Tell diff to provide 10 lines of context
     if not staged_files:
         # If no files are staged, Assume they want to commit all changed files
         exec_and_get_output(["git", "add", "-A"])
-        # Get the diff for all changes since the last commit
-        diff = exec_and_get_output(base_git_diff + ["HEAD"])
         # Get the list of files to be committed
         files = exec_and_get_output(["git", "diff", "--name-only", "--cached"])
     else:
-        # If some files are staged, get the diff for those files
-        diff = exec_and_get_output(base_git_diff + ["--cached"])
         # The list of files to be committed is the same as the list of staged files
         files = staged_files
 
-    if not diff:
+    diff_context = git_diff_context()
+
+    if not diff_context:
         console.print("No changes to commit.")
         sys.exit(0)
 
-    console.print("The following files will be committed:\n" + files)
+    # Check the size of the diff context and adjust accordingly
+    diff_context_token_size = get_token_length(diff_context)
+    if verbose:
+        console.print(f"Diff context token size: {diff_context_token_size}")
+
+    if diff_context_token_size + response_token_size > 16_000:
+        console.print("The diff context is too large to review. Bigger models coming soon.")
+        sys.exit(1)
+    elif diff_context_token_size + response_token_size > 4_000:
+        model = "gpt-3.5-turbo-16k"  # supports 16k tokens but is a bit slower and more expensive
+    else:
+        model = "gpt-3.5-turbo"  # supports 4k tokens
+
+    # Set up the language model
+    llm = ChatOpenAI(temperature=0.1, model=model, max_tokens=response_token_size)
 
+    # Set up the chain
+    chain = LLMChain(llm=llm, prompt=prompt, verbose=verbose)
+
+    console.print("The following files will be committed:\n" + files)
     with console.status("Thinking", spinner="point"):
-        response = chain.run(diff)
+        response = chain.run(diff_context)
 
     # Write the commit message to a temporary file
     with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp:
@@ -212,41 +221,42 @@ def fun_fact(verbose):
 
 
 @cli.command
-@click.option("--commit", "-c", help="The commit hash to review.")
-@click.option("--verbose", "-v")
+@click.option("-c", "--commit", help="The commit hash to review.")
+@click.option("-v", "--verbose", count=True)
 def review(commit, verbose):
     """Use AI to do a code review, with [un]staged changes, or a specified commit."""
     setup_environment()
 
-    if commit:
-        # If a commit hash is specified, get the diff for that commit
-        diff = exec_and_get_output(["git", "show", commit])
-    else:
-        # If no commit hash is specified, get the diff for changes, staged or not
-        staged_files = exec_and_get_output(["git", "diff", "--name-only", "--cached"])
-        base_git_diff = ["git", "diff", "-U10"]  # Tell diff to provide 10 lines of context
-        if not staged_files:
-            # Get the diff for all changes since the last commit
-            diff = exec_and_get_output(base_git_diff + ["HEAD"])
-        else:
-            # If some files are staged, get the diff for those files
-            diff = exec_and_get_output(base_git_diff + ["--cached"])
-
-        if not diff:
-            console.print("No changes to commit.")
-            sys.exit(0)
+    diff_context = git_diff_context(commit)
+    if not diff_context:
+        console.print("No changes to commit.")
+        sys.exit(0)
 
     # Load the prompt
     prompt = load_prompt(Path(__file__).parent / "prompts" / "review.yaml")
 
+    # Check the size of the diff context and adjust accordingly
+    response_token_size = DEFAULT_MAX_TOKENS / 2
+    diff_context_token_size = get_token_length(diff_context)
+    if verbose:
+        console.print(f"Diff context token size: {diff_context_token_size}")
+
+    if diff_context_token_size + response_token_size > 16_000:
+        console.print("The diff context is too large to review. Bigger models coming soon.")
+        sys.exit(1)
+    elif diff_context_token_size + response_token_size > 4_000:
+        model = "gpt-3.5-turbo-16k"  # supports 16k tokens but is a bit slower and more expensive
+    else:
+        model = "gpt-3.5-turbo"  # supports 4k tokens
+
     # Set up the language model
-    llm = OpenAI(temperature=0.1, max_tokens=DEFAULT_MAX_TOKENS)
+    llm = ChatOpenAI(temperature=0.1, model=model, max_tokens=response_token_size)
 
     # Set up the chain
     chain = LLMChain(llm=llm, prompt=prompt, verbose=verbose)
 
     with console.status("Reviewing", spinner="point"):
-        response = chain.run(diff)
+        response = chain.run(diff_context)
         console.print(response, style=bot_style)
 
 
diff --git a/aicodebot/prompts/commit_message.yaml b/aicodebot/prompts/commit_message.yaml
@@ -1,18 +1,20 @@
 _type: prompt
 template_format: f-string
-input_variables: ["diff"]
+input_variables: ["diff_context"]
 template: |
-    I have a diff of a code change that I need to commit to a git repository. The diff is as follows:
+    I have a diff of a code change that I need to commit to a git repository. The relevant diff context is as follows,
+    between the BEGIN DIFF and END DIFF markers:
 
     BEGIN DIFF
-    {diff}
+    {diff_context}
     END DIFF
 
     Generate a commit message for me. The commit message should follow best practices,
     which means it should have a short, single-line summary, followed by a blank line, and then a more
     detailed explanatory text, but only if necessary.
     Avoid redundancy between the summary line and the explanatory text. Don't repeat yourself.
     If the detailed explanatory text is not necessary, then omit it and just do the summary
+    Use imperative mood for the commit message, e.g. "Add feature" instead of "Added feature".
 
     The text can be in GitHub-flavored markdown format.
 
diff --git a/aicodebot/prompts/review.yaml b/aicodebot/prompts/review.yaml
@@ -1,17 +1,23 @@
 _type: prompt
 template_format: f-string
-input_variables: ["diff"]
+input_variables: ["diff_context"]
 template: |
     You are an expert code reviewer.
     You know how to give constructive feedback.
     You know how to give feedback that is actionable.
     You know how to give feedback that is kind.
     You know how to give feedback that is specific.
+    Contextually appropriate emojis are encouraged, but not required.
 
-    Review this code change:
+    DO NOT give comments that discuss formatting, as those will be handled with pre-commit with the black and isort hooks.
+    DO NOT respond with line numbers, use function names or file names instead (you're going to be wrong about the line numbers anyway).
+
+    Review this code change.  The relevant diff context is as follows, between the BEGIN DIFF and END DIFF markers:
 
     BEGIN DIFF
-    {diff}
+    {diff_context}
     END DIFF
 
-    If the changes look good and don't require any feedback, then just respond with "LGTM" (looks good to me).
+    The main focus is to tell the author how they could make the code better.
+
+    If the changes look good overall and don't require any feedback, then just respond with "LGTM" (looks good to me).