Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 22 additions & 4 deletions mcpunk/file_breakdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from threading import Lock, Timer
from typing import Literal

import more_itertools
from git import Repo
from pydantic import (
BaseModel,
Expand Down Expand Up @@ -171,6 +172,7 @@ def from_file_contents(
cls,
source_code: str,
file_path: Path,
max_chunk_size: int = 10_000,
) -> "File":
"""Extract all callables, calls and imports from the given source code file."""
chunks: list[Chunk] = []
Expand All @@ -180,6 +182,9 @@ def from_file_contents(
if chunker.can_chunk(source_code, file_path):
try:
chunks = chunker(source_code, file_path).chunk_file()
chunks = list(
more_itertools.flatten(x.split(max_size=max_chunk_size) for x in chunks),
)
break
except Exception:
logger.exception(f"Error chunking file {file_path} with {chunker}")
Expand All @@ -201,9 +206,11 @@ def __init__(
root: Path,
files_per_parallel_worker: int = 100,
file_watch_refresh_freq_seconds: float = 0.1,
max_chunk_size: int = 10_000,
) -> None:
self.root = root.expanduser().absolute()
self.files_per_parallel_worker = files_per_parallel_worker
self.max_chunk_size = max_chunk_size
self.file_map: dict[Path, File] = {}

git_repo: Repo | None
Expand Down Expand Up @@ -241,14 +248,21 @@ def load_files(self, files: list[Path]) -> None:

files_analysed: list[File]
if n_workers == 1:
files_analysed_maybe_none = [_analyze_file(file_path) for file_path in files]
files_analysed_maybe_none = [
_analyze_file(file_path, max_chunk_size=self.max_chunk_size) for file_path in files
]
files_analysed = [x for x in files_analysed_maybe_none if x is not None]
else:
logger.info(f"Using {n_workers} workers to process {len(files)} files")
files_analysed = []
with ProcessPoolExecutor(max_workers=n_workers) as executor:
future_to_file = {
executor.submit(_analyze_file, file_path): file_path for file_path in files
executor.submit(
_analyze_file,
file_path,
max_chunk_size=self.max_chunk_size,
): file_path
for file_path in files
}

for future in as_completed(future_to_file):
Expand Down Expand Up @@ -287,7 +301,7 @@ def _init_from_root_dir(self, root: Path) -> None:
self.load_files(files)


def _analyze_file(file_path: Path) -> File | None:
def _analyze_file(file_path: Path, max_chunk_size: int = 10_000) -> File | None:
try:
if not file_path.exists():
logger.warning(f"File {file_path} does not exist")
Expand All @@ -296,7 +310,11 @@ def _analyze_file(file_path: Path) -> File | None:
logger.warning(f"File {file_path} is not a file")
return None

return File.from_file_contents(file_path.read_text(), file_path)
return File.from_file_contents(
file_path.read_text(),
file_path,
max_chunk_size=max_chunk_size,
)
except Exception:
logger.exception(f"Error processing file {file_path}")
return None
78 changes: 78 additions & 0 deletions mcpunk/file_chunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,3 +76,81 @@ def matches_filter(
else:
assert_never(filter_on)
return matches_filter(filter_, data)

def split(
    self,
    max_size: int = 10_000,
    split_chunk_prefix: str = (
        "[This is a subsection of the chunk. Other parts contain the rest of the chunk]\n\n"
    ),
) -> list["Chunk"]:
    """Split this chunk into smaller chunks at line boundaries.

    The chunk is split at line boundaries, unless a single line is already
    longer than the per-line budget, in which case that line is itself broken
    into fixed-size pieces first.

    Args:
        max_size: Maximum size in characters for each chunk's content
            (prefix included). Must be at least 100.
        split_chunk_prefix: Prefix added to the start of each newly created
            split chunk. Unused if the chunk is not split. You can set it to
            an empty string to suppress the prefix.

    Returns:
        List containing either the original chunk (if small enough) or
        multiple smaller chunks.
    """
    assert max_size >= 100, "max_size must be at least 100"
    # Chunk already fits: return it untouched (no prefix added).
    if len(self.content) <= max_size:
        return [self]

    # Budget available for actual content once the prefix is accounted for.
    max_size -= len(split_chunk_prefix)
    assert max_size > 0, (
        f"content budget {max_size} is not positive; decrease prefix length"
    )

    # Per-line budget, leaving some margin below the chunk budget.
    # Clamp to at least 1: with a small max_size and a long prefix the
    # naive `max_size - 50` can go negative, and a negative step in the
    # range() below would silently drop every line of content.
    max_line_size = max(1, max_size - 50)

    # Preprocess to split long lines first. This could be avoided, but it
    # makes the packing loop simpler: afterwards every line is guaranteed
    # to fit within a single chunk.
    processed_lines: list[str] = []
    for line in self.content.splitlines(keepends=True):
        if len(line) > max_line_size:
            # Break the long line into pieces of at most max_line_size.
            for i in range(0, len(line), max_line_size):
                processed_lines.append(line[i : i + max_line_size])
        else:
            processed_lines.append(line)

    # Greedily pack lines into chunks of at most max_size characters.
    result: list[Chunk] = []
    current_content: list[str] = []
    current_size = 0
    part_num = 1

    for line in processed_lines:
        # If adding this line would exceed the limit, flush the current chunk.
        if current_size + len(line) > max_size and current_content:
            result.append(
                Chunk(
                    category=self.category,
                    name=f"{self.name}_part{part_num}",
                    content=split_chunk_prefix + "".join(current_content),
                    line=None,
                ),
            )
            part_num += 1
            current_content = []
            current_size = 0

        current_content.append(line)
        current_size += len(line)

    # Flush whatever remains as the final chunk.
    if current_content:
        result.append(
            Chunk(
                category=self.category,
                name=f"{self.name}_part{part_num}",
                content=split_chunk_prefix + "".join(current_content),
                line=None,
            ),
        )

    return result
8 changes: 7 additions & 1 deletion mcpunk/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ class Settings(BaseSettings):
include_chars_in_response: bool = True
# Maximum number of characters in the response. If the response is longer than this
# then an error will be returned to the caller. This is handy to avoid blowing
# your context.
# your context. HOWEVER this is largely redundant with the max_chunk_size
# option. Likely to be removed in the future.
default_response_max_chars: int = 20_000
# Same as `default_response_max_chars` but for the tool that returns a git diff.
# Generally, git diffs are a bit larger than e.g. a function so nice to have it a
Expand All @@ -53,6 +54,11 @@ class Settings(BaseSettings):
# files during save (though this is not a guarantee).
file_watch_refresh_freq_seconds: float = 0.1

# Maximum size of a chunk in characters. If a chunk is larger than this,
# it will be split into multiple chunks. A chunk is something like a function,
# or maybe a whole file (depends on the chunker).
max_chunk_size: int = 10_000

@property
def task_queue_visibility_timeout(self) -> timedelta:
return timedelta(seconds=self.task_queue_visibility_timeout_seconds)
Expand Down
4 changes: 3 additions & 1 deletion mcpunk/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ def configure_project(
chunk_project=FileBreakdownProject(
root=path,
file_watch_refresh_freq_seconds=deps.settings().file_watch_refresh_freq_seconds,
max_chunk_size=deps.settings().max_chunk_size,
),
)
PROJECTS[project_name] = project
Expand Down Expand Up @@ -372,7 +373,8 @@ def find_matching_chunks_in_file(
- Finding a chunk where a specific function is defined
(e.g. find_matching_chunks_in_file(..., ["def my_funk"])

Returns array of {n: name, t: type, id: identifier, chars: length}
Some chunks are split into multiple parts, because they are too large. This
will look like 'chunkx_part1', 'chunkx_part2', ...
"""
proj_file = ProjectFile(project_name=project_name, rel_path=rel_path)
return _list_chunks_in_file(proj_file, filter_, "name_or_content").render()
Expand Down
Loading