diff --git a/.qwen/commands/qc/code-review.md b/.qwen/commands/qc/code-review.md index b5846485a9..021a80d9f6 100644 --- a/.qwen/commands/qc/code-review.md +++ b/.qwen/commands/qc/code-review.md @@ -14,6 +14,7 @@ You are an expert code reviewer. Follow these steps: - Any potential issues or risks Keep your review concise but thorough. Focus on: + - Code correctness - Following project conventions - Performance implications diff --git a/.qwen/commands/qc/commit.md b/.qwen/commands/qc/commit.md index 76ef6b4173..fab58da2e8 100644 --- a/.qwen/commands/qc/commit.md +++ b/.qwen/commands/qc/commit.md @@ -5,22 +5,26 @@ description: Commit staged changes with an AI-generated commit message and push # Commit and Push ## Overview + Generate a clear, concise commit message based on staged changes, confirm with the user, then commit and push. ## Steps ### 1. Check repository status + - Run `git status` to check: - Are there any staged changes? - Are there unstaged changes? - What is the current branch? ### 2. Handle unstaged changes + - If there are unstaged changes, notify the user and list them - Do NOT add or commit unstaged changes - Proceed only with staged changes ### 3. Review staged changes + - Run `git diff --staged` to see all staged changes - Analyze the changes in depth to understand: - What files were modified/added/deleted @@ -28,6 +32,7 @@ Generate a clear, concise commit message based on staged changes, confirm with t - The scope and impact of the changes ### 4. Handle branch logic + - Get current branch name with `git branch --show-current` - **If current branch is `main` or `master`:** - Generate a proper branch name based on the changes @@ -40,6 +45,7 @@ Generate a clear, concise commit message based on staged changes, confirm with t - Wait for user decision ### 5. Generate commit message + - Types: feat, fix, docs, style, refactor, test, chore - Guidelines: - Be clear and concise @@ -49,6 +55,7 @@ Generate a clear, concise commit message based on staged changes, confirm with t - Include a footer explaining the purpose/impact of the changes **Format:** + ``` (): - (optional) @@ -59,12 +66,14 @@ This . ``` ### 6. Present the result and confirm with user + - Present the generated commit message - Show which branch will be used - Ask for confirmation: "Proceed with commit and push?" - Wait for user approval ### 7. Commit and push + - After user confirms: - `git commit -m ""` - `git push -u origin ` (use `-u` for new branches) diff --git a/.qwen/commands/qc/create-issue.md b/.qwen/commands/qc/create-issue.md index 54317621bd..020ef00d03 100644 --- a/.qwen/commands/qc/create-issue.md +++ b/.qwen/commands/qc/create-issue.md @@ -5,9 +5,11 @@ description: Draft and submit a GitHub issue based on a user-provided idea # Create Issue ## Overview + Take the user's idea or bug description, investigate the codebase to understand the full context, draft a GitHub issue for review, and submit it once approved. ## Input + The user provides a brief description of a feature request or bug report: {{args}} ## Steps diff --git a/.qwen/commands/qc/create-pr.md b/.qwen/commands/qc/create-pr.md index bf3c3c1e49..f2b4919256 100644 --- a/.qwen/commands/qc/create-pr.md +++ b/.qwen/commands/qc/create-pr.md @@ -5,9 +5,11 @@ description: Create a pull request based on staged code changes # Create PR ## Overview + Create a well-structured pull request with proper description and title. ## Steps + 1. **Review staged changes** - Review all staged changes to understand what has been done - Do not touch unstaged changes @@ -31,4 +33,4 @@ Create a well-structured pull request with proper description and title. ## PR Template -@{.github/pull_request_template.md} \ No newline at end of file +@{.github/pull_request_template.md} diff --git a/.qwen/skills/terminal-capture/SKILL.md b/.qwen/skills/terminal-capture/SKILL.md index 7fc99a18d1..043f49542e 100644 --- a/.qwen/skills/terminal-capture/SKILL.md +++ b/.qwen/skills/terminal-capture/SKILL.md @@ -211,31 +211,31 @@ This tool is commonly used for visual verification during PR reviews. For the co ```typescript interface FlowStep { - type?: string; // Input text - key?: string | string[]; // Key press(es) - capture?: string; // Viewport screenshot filename - captureFull?: string; // Full scrollback screenshot filename + type?: string; // Input text + key?: string | string[]; // Key press(es) + capture?: string; // Viewport screenshot filename + captureFull?: string; // Full scrollback screenshot filename streaming?: { - delayMs?: number; // Delay before first capture (default: 0) - intervalMs: number; // Interval between captures in ms - count: number; // Maximum number of captures - gif?: boolean; // Generate animated GIF (default: true) + delayMs?: number; // Delay before first capture (default: 0) + intervalMs: number; // Interval between captures in ms + count: number; // Maximum number of captures + gif?: boolean; // Generate animated GIF (default: true) }; } interface ScenarioConfig { - name: string; // Scenario name (also used as screenshot subdirectory name) - spawn: string[]; // Launch command ["node", "dist/cli.js", "--yolo"] - flow: FlowStep[]; // Interaction steps + name: string; // Scenario name (also used as screenshot subdirectory name) + spawn: string[]; // Launch command ["node", "dist/cli.js", "--yolo"] + flow: FlowStep[]; // Interaction steps terminal?: { - cols?: number; // Number of columns, default 100 - rows?: number; // Number of rows, default 28 - theme?: string; // Theme: dracula|one-dark|github-dark|monokai|night-owl - chrome?: boolean; // macOS window decorations, default true - title?: string; // Window title, default "Terminal" - fontSize?: number; // Font size - cwd?: string; // Working directory (relative to config file) + cols?: number; // Number of columns, default 100 + rows?: number; // Number of rows, default 28 + theme?: string; // Theme: dracula|one-dark|github-dark|monokai|night-owl + chrome?: boolean; // macOS window decorations, default true + title?: string; // Window title, default "Terminal" + fontSize?: number; // Font size + cwd?: string; // Working directory (relative to config file) }; - outputDir?: string; // Screenshot output directory (relative to config file) + outputDir?: string; // Screenshot output directory (relative to config file) } ``` diff --git a/docs/developers/tools/file-system.md b/docs/developers/tools/file-system.md index bfa6de8d0f..1d781eeafa 100644 --- a/docs/developers/tools/file-system.md +++ b/docs/developers/tools/file-system.md @@ -24,7 +24,7 @@ Qwen Code provides a comprehensive suite of tools for interacting with the local ## 2. `read_file` (ReadFile) -`read_file` reads and returns the content of a specified file. This tool handles text, images (PNG, JPG, GIF, WEBP, SVG, BMP), and PDF files. For text files, it can read specific line ranges. Other binary file types are generally skipped. +`read_file` reads and returns the content of a specified file. This tool handles text files and media files (images, PDFs, audio, video) whose modality is supported by the current model. For text files, it can read specific line ranges. Media files whose modality is not supported by the current model are rejected with a helpful error message. Other binary file types are generally skipped. - **Tool name:** `read_file` - **Display name:** ReadFile @@ -35,11 +35,12 @@ Qwen Code provides a comprehensive suite of tools for interacting with the local - `limit` (number, optional): For text files, the maximum number of lines to read. If omitted, reads a default maximum (e.g., 2000 lines) or the entire file if feasible. - **Behavior:** - For text files: Returns the content. If `offset` and `limit` are used, returns only that slice of lines. Indicates if content was truncated due to line limits or line length limits. - - For image and PDF files: Returns the file content as a base64-encoded data structure suitable for model consumption. + - For media files (images, PDFs, audio, video): If the current model supports the file's modality, returns the file content as a base64-encoded `inlineData` object. If the model does not support the modality, returns an error message with guidance (e.g., suggesting skills or external tools). - For other binary files: Attempts to identify and skip them, returning a message indicating it's a generic binary file. - **Output:** (`llmContent`): - For text files: The file content, potentially prefixed with a truncation message (e.g., `[File content truncated: showing lines 1-100 of 500 total lines...]\nActual file content...`). - - For image/PDF files: An object containing `inlineData` with `mimeType` and base64 `data` (e.g., `{ inlineData: { mimeType: 'image/png', data: 'base64encodedstring' } }`). + - For supported media files: An object containing `inlineData` with `mimeType` and base64 `data` (e.g., `{ inlineData: { mimeType: 'image/png', data: 'base64encodedstring' } }`). + - For unsupported media files: An error message string explaining that the current model does not support this modality, with suggestions for alternatives. - For other binary files: A message like `Cannot display content of binary file: /path/to/data.bin`. - **Confirmation:** No. diff --git a/packages/cli/src/acp-integration/service/filesystem.test.ts b/packages/cli/src/acp-integration/service/filesystem.test.ts index 2ff8e2b6b2..a8683c7c5f 100644 --- a/packages/cli/src/acp-integration/service/filesystem.test.ts +++ b/packages/cli/src/acp-integration/service/filesystem.test.ts @@ -13,12 +13,10 @@ const RESOURCE_NOT_FOUND_CODE = -32002; const INTERNAL_ERROR_CODE = -32603; const createFallback = (): FileSystemService => ({ - readTextFile: vi - .fn() - .mockResolvedValue({ - content: '', - _meta: { bom: false, encoding: 'utf-8' }, - }), + readTextFile: vi.fn().mockResolvedValue({ + content: '', + _meta: { bom: false, encoding: 'utf-8' }, + }), writeTextFile: vi.fn().mockResolvedValue({ _meta: undefined }), findFiles: vi.fn().mockReturnValue([]), }); diff --git a/packages/core/src/tools/read-file.test.ts b/packages/core/src/tools/read-file.test.ts index ec07a69955..f6f140afc2 100644 --- a/packages/core/src/tools/read-file.test.ts +++ b/packages/core/src/tools/read-file.test.ts @@ -44,6 +44,9 @@ describe('ReadFileTool', () => { }, getTruncateToolOutputThreshold: () => 2500, getTruncateToolOutputLines: () => 500, + getContentGeneratorConfig: () => ({ + modalities: { image: true, pdf: true, audio: true, video: true }, + }), } as unknown as Config; tool = new ReadFileTool(mockConfigInstance); }); diff --git a/packages/core/src/utils/fileUtils.test.ts b/packages/core/src/utils/fileUtils.test.ts index 0daa68c4ec..b2210c3ecf 100644 --- a/packages/core/src/utils/fileUtils.test.ts +++ b/packages/core/src/utils/fileUtils.test.ts @@ -59,6 +59,10 @@ describe('fileUtils', () => { getTruncateToolOutputThreshold: () => 2500, getTruncateToolOutputLines: () => 500, getTargetDir: () => tempRootDir, + getModel: () => 'qwen3.5-plus', + getContentGeneratorConfig: () => ({ + modalities: { image: true, video: true }, + }), getFileSystemService: () => fsService, } as unknown as Config; @@ -891,29 +895,73 @@ describe('fileUtils', () => { expect(result.returnDisplay).toContain('Read image file: image.png'); }); - it('should process a PDF file', async () => { + it('should reject image files when model does not support image', async () => { + const fakePngData = Buffer.from('fake png data'); + actualNodeFs.writeFileSync(testImageFilePath, fakePngData); + mockMimeGetType.mockReturnValue('image/png'); + + const mockConfigNoImage = { + ...mockConfig, + getContentGeneratorConfig: () => ({ modalities: {} }), + } as unknown as Config; + + const result = await processSingleFileContent( + testImageFilePath, + mockConfigNoImage, + ); + expect(typeof result.llmContent).toBe('string'); + expect(result.llmContent).toContain('Unsupported image file'); + expect(result.llmContent).toContain('does not support image input'); + expect(result.returnDisplay).toContain('Skipped image file'); + }); + + it('should reject PDF files when model does not support PDF', async () => { const fakePdfData = Buffer.from('fake pdf data'); actualNodeFs.writeFileSync(testPdfFilePath, fakePdfData); mockMimeGetType.mockReturnValue('application/pdf'); + + const mockConfigNoPdf = { + ...mockConfig, + getContentGeneratorConfig: () => ({ + modalities: { image: true }, + }), + } as unknown as Config; + const result = await processSingleFileContent( testPdfFilePath, - mockConfig, + mockConfigNoPdf, ); - expect( - (result.llmContent as { inlineData: unknown }).inlineData, - ).toBeDefined(); + expect(typeof result.llmContent).toBe('string'); + expect(result.llmContent).toContain('Unsupported pdf file'); + expect(result.llmContent).toContain( + 'does not support PDF input directly', + ); + expect(result.llmContent).toContain('/extensions install'); + expect(result.returnDisplay).toContain('Skipped pdf file'); + }); + + it('should accept PDF files when model supports PDF', async () => { + const fakePdfData = Buffer.from('fake pdf data'); + actualNodeFs.writeFileSync(testPdfFilePath, fakePdfData); + mockMimeGetType.mockReturnValue('application/pdf'); + + const mockConfigWithPdf = { + ...mockConfig, + getContentGeneratorConfig: () => ({ + modalities: { image: true, pdf: true }, + }), + } as unknown as Config; + + const result = await processSingleFileContent( + testPdfFilePath, + mockConfigWithPdf, + ); + expect(result.llmContent).toHaveProperty('inlineData'); expect( (result.llmContent as { inlineData: { mimeType: string } }).inlineData .mimeType, ).toBe('application/pdf'); - expect( - (result.llmContent as { inlineData: { data: string } }).inlineData.data, - ).toBe(fakePdfData.toString('base64')); - expect( - (result.llmContent as { inlineData: { displayName?: string } }) - .inlineData.displayName, - ).toBe('document.pdf'); - expect(result.returnDisplay).toContain('Read pdf file: document.pdf'); + expect(result.returnDisplay).toContain('Read pdf file'); }); it('should read an SVG file as text when under 1MB', async () => { diff --git a/packages/core/src/utils/fileUtils.ts b/packages/core/src/utils/fileUtils.ts index 41029138ed..4730bfd352 100644 --- a/packages/core/src/utils/fileUtils.ts +++ b/packages/core/src/utils/fileUtils.ts @@ -18,6 +18,7 @@ import { ToolErrorType } from '../tools/tool-error.js'; import { BINARY_EXTENSIONS } from './ignorePatterns.js'; import type { Config } from '../config/config.js'; import { createDebugLogger } from './debugLogger.js'; +import type { InputModalities } from '../core/contentGenerator.js'; import { detectEncodingFromBuffer } from './systemEncoding.js'; const debugLogger = createDebugLogger('FILE_UTILS'); @@ -507,6 +508,42 @@ export interface ProcessedFileReadResult { linesShown?: [number, number]; // For text files [startLine, endLine] (1-based for display) } +/** + * For media file types, returns the corresponding modality key. + * Returns undefined for non-media types (text, binary, svg) which are always supported. + */ +function mediaModalityKey( + fileType: 'image' | 'pdf' | 'audio' | 'video' | 'text' | 'binary' | 'svg', +): keyof InputModalities | undefined { + if ( + fileType === 'image' || + fileType === 'pdf' || + fileType === 'audio' || + fileType === 'video' + ) { + return fileType; + } + return undefined; +} + +/** + * Build the same unsupported-modality message used by the converter, + * so the LLM sees a consistent hint regardless of where the check fires. + */ +function unsupportedModalityMessage( + modality: string, + displayName: string, +): string { + let hint: string; + if (modality === 'pdf') { + hint = + 'This model does not support PDF input directly. The read_file tool cannot extract PDF content either. To extract text from the PDF file, try using skills if applicable, or guide user to install pdf skill by running this slash command:\n/extensions install https://github.com/anthropics/skills:document-skills'; + } else { + hint = `This model does not support ${modality} input. The read_file tool cannot process this type of file either. To handle this file, try using skills if applicable, or any tools installed at system wide, or let the user know you cannot process this type of file.`; + } + return `[Unsupported ${modality} file: "${displayName}". ${hint}]`; +} + /** * Reads and processes a single file, handling text, images, and PDFs. * @param filePath Absolute path to the file. @@ -561,6 +598,26 @@ export async function processSingleFileContent( .replace(/\\/g, '/'); const displayName = path.basename(filePath); + + // Check modality support for media files using the resolved config + // (same source of truth the converter uses at API-call time). + const modality = mediaModalityKey(fileType); + if (modality) { + const modalities: InputModalities = + config.getContentGeneratorConfig()?.modalities ?? {}; + if (!modalities[modality]) { + const message = unsupportedModalityMessage(modality, displayName); + debugLogger.warn( + `Model '${config.getModel()}' does not support ${modality} input. ` + + `Skipping file: ${relativePathForDisplay}`, + ); + return { + llmContent: message, + returnDisplay: `Skipped ${fileType} file: ${relativePathForDisplay} (model doesn't support ${modality} input)`, + }; + } + } + switch (fileType) { case 'binary': { return { diff --git a/packages/core/src/utils/pathReader.test.ts b/packages/core/src/utils/pathReader.test.ts index 282a7d6d1e..97717d0a31 100644 --- a/packages/core/src/utils/pathReader.test.ts +++ b/packages/core/src/utils/pathReader.test.ts @@ -31,6 +31,9 @@ const createMockConfig = ( getFileService: () => mockFileService, getTruncateToolOutputThreshold: () => 2500, getTruncateToolOutputLines: () => 500, + getContentGeneratorConfig: () => ({ + modalities: { image: true, pdf: true, audio: true, video: true }, + }), } as unknown as Config; };