279 changes: 238 additions & 41 deletions packages/opencode/src/tool/webfetch.ts
@@ -4,10 +4,165 @@ import TurndownService from "turndown"
import DESCRIPTION from "./webfetch.txt"
import { Config } from "../config/config"
import { Permission } from "../permission"
import { Provider } from "../provider/provider"
import { Session } from "../session"
import { Token } from "../util/token"
import { SessionPrompt } from "../session/prompt"

-const MAX_RESPONSE_SIZE = 5 * 1024 * 1024 // 5MB
-const DEFAULT_TIMEOUT = 30 * 1000 // 30 seconds
-const MAX_TIMEOUT = 120 * 1000 // 2 minutes
+const MAX_RESPONSE_SIZE = 5 * 1024 * 1024
+const DEFAULT_TIMEOUT = 30 * 1000
+const MAX_TIMEOUT = 120 * 1000
+const SEARCH_THRESHOLD = 50000
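// Estimated-token page size (see Token.estimate) above which an optional
// search query is applied instead of returning the whole page.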

const STOP_WORDS = new Set([
"a",
"an",
"and",
"are",
"as",
"at",
"be",
"by",
"for",
"from",
"has",
"he",
"in",
"is",
"it",
"its",
"of",
"on",
"that",
"the",
"to",
"was",
"will",
"with",
])

interface Chunk {
heading: string
content: string
position: number
tokens: number
}

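// Split markdown into paragraph-level chunks, each tagged with the nearest
// h1-h3 heading above it; paragraphs shorter than 50 characters are skipped.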
function chunkContent(content: string): Chunk[] {
const chunks: Chunk[] = []
const sections = content.split(/^(#{1,3}\s+.+)$/m)
let currentHeading = "Introduction"
let position = 0

for (let i = 0; i < sections.length; i++) {
const section = sections[i].trim()
if (!section) continue

if (section.match(/^#{1,3}\s+/)) {
currentHeading = section.replace(/^#+\s+/, "")
} else {
const paragraphs = section.split(/\n\n+/)
for (const para of paragraphs) {
if (para.length < 50) continue
chunks.push({
heading: currentHeading,
content: para,
position: position++,
tokens: Token.estimate(para),
})
}
}
}
return chunks
}

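// Lowercase, strip punctuation, and drop stop words and words of two
// characters or fewer.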
function tokenize(text: string): string[] {
return text
.toLowerCase()
.replace(/[^\w\s]/g, " ")
.split(/\s+/)
.filter((word) => word.length > 2 && !STOP_WORDS.has(word))
}

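// Rank chunks against the query with BM25 (k1 = 1.5, b = 0.75):
//   score(D, q) = sum over query terms t of
//     IDF(t) * tf(t, D) * (k1 + 1) / (tf(t, D) + k1 * (1 - b + b * |D| / avgdl))
// Terms that also appear in the chunk heading get a 2x boost. Returns only
// positively scoring chunks, sorted by descending score. Illustrative call
// (hypothetical markdown input):
//   searchChunks(chunkContent(markdown), "economic indicators and GDP growth")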
function searchChunks(chunks: Chunk[], query: string): Chunk[] {
if (chunks.length === 0) return []
const queryTerms = tokenize(query)
if (queryTerms.length === 0) return []

const totalLength = chunks.reduce((sum, c) => sum + tokenize(c.content).length, 0)
const avgLength = totalLength / chunks.length

const scored = chunks.map((chunk) => {
const docTokens = tokenize(chunk.content + " " + chunk.heading)
const docLength = docTokens.length
let score = 0

for (const term of queryTerms) {
const termFreq = docTokens.filter((t) => t === term).length
if (termFreq === 0) continue

const docsWithTerm = chunks.filter((c) =>
tokenize(c.content + " " + c.heading).includes(term),
).length

const idf = Math.log((chunks.length - docsWithTerm + 0.5) / (docsWithTerm + 0.5))
const k1 = 1.5
const b = 0.75
const tf = (termFreq * (k1 + 1)) / (termFreq + k1 * (1 - b + b * (docLength / avgLength)))
let termScore = idf * tf

if (tokenize(chunk.heading).includes(term)) {
termScore *= 2
}
score += termScore
}
return { chunk, score }
})

return scored
.filter((s) => s.score > 0)
.sort((a, b) => b.score - a.score)
.map((s) => s.chunk)
}

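// Merge chunks (assumed sorted by position) into one string under a token
// budget, emitting a heading only when it changes and a [...] marker where
// non-adjacent chunks were skipped.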
function mergeChunks(chunks: Chunk[], maxTokens: number): string {
let output = ""
let tokens = 0
let lastPosition = -1
let lastHeading = ""

for (const chunk of chunks) {
const needsHeading = chunk.heading !== lastHeading
const heading = needsHeading ? `\n\n## ${chunk.heading}\n\n` : "\n\n"
const chunkText = heading + chunk.content
const chunkTokens = Token.estimate(chunkText)

if (tokens + chunkTokens > maxTokens) break

if (lastPosition >= 0 && chunk.position > lastPosition + 1) {
output += "\n\n[...]\n"
}

output += chunkText
tokens += chunkTokens
lastPosition = chunk.position
lastHeading = chunk.heading
}
return output.trim()
}

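// Fallback when a search matches nothing: list every heading with its
// approximate token count so the caller can pick a better query.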
function generateTableOfContents(chunks: Chunk[]): string {
const sections: Record<string, number> = {}
for (const chunk of chunks) {
sections[chunk.heading] = (sections[chunk.heading] || 0) + chunk.tokens
}

let toc = "# Table of Contents\n\n"
for (const [heading, tokens] of Object.entries(sections)) {
toc += `- ${heading} (~${tokens} tokens)\n`
}
return toc
}

export const WebFetchTool = Tool.define("webfetch", {
description: DESCRIPTION,
@@ -16,6 +171,12 @@ export const WebFetchTool = Tool.define("webfetch", {
format: z
.enum(["text", "markdown", "html"])
.describe("The format to return the content in (text, markdown, or html)"),
search: z
.string()
.describe(
"Optional search query to find relevant sections in large pages. Use descriptive multi-word queries like 'economic indicators and GDP growth' rather than single words. Returns only matching sections with context.",
)
.optional(),
timeout: z.number().describe("Optional timeout in seconds (max 120)").optional(),
}),
async execute(params, ctx) {
@@ -48,13 +209,15 @@ export const WebFetchTool = Tool.define("webfetch", {
let acceptHeader = "*/*"
switch (params.format) {
case "markdown":
-          acceptHeader = "text/markdown;q=1.0, text/x-markdown;q=0.9, text/plain;q=0.8, text/html;q=0.7, */*;q=0.1"
+          acceptHeader =
+            "text/markdown;q=1.0, text/x-markdown;q=0.9, text/plain;q=0.8, text/html;q=0.7, */*;q=0.1"
break
case "text":
acceptHeader = "text/plain;q=1.0, text/markdown;q=0.9, text/html;q=0.8, */*;q=0.1"
break
case "html":
-          acceptHeader = "text/html;q=1.0, application/xhtml+xml;q=0.9, text/plain;q=0.8, text/markdown;q=0.7, */*;q=0.1"
+          acceptHeader =
+            "text/html;q=1.0, application/xhtml+xml;q=0.9, text/plain;q=0.8, text/markdown;q=0.7, */*;q=0.1"
break
default:
acceptHeader =
@@ -88,56 +251,88 @@
throw new Error("Response too large (exceeds 5MB limit)")
}

-      const content = new TextDecoder().decode(arrayBuffer)
+      let content = new TextDecoder().decode(arrayBuffer)
const contentType = response.headers.get("content-type") || ""

const title = `${params.url} (${contentType})`

      // Handle content based on requested format and actual content type
+      let output = ""
      switch (params.format) {
        case "markdown":
          if (contentType.includes("text/html")) {
-            const markdown = convertHTMLToMarkdown(content)
-            return {
-              output: markdown,
-              title,
-              metadata: {},
-            }
+            output = convertHTMLToMarkdown(content)
+          } else {
+            output = content
          }
-          return {
-            output: content,
-            title,
-            metadata: {},
-          }
+          break
        case "text":
          if (contentType.includes("text/html")) {
-            const text = await extractTextFromHTML(content)
-            return {
-              output: text,
-              title,
-              metadata: {},
-            }
-          }
-          return {
-            output: content,
-            title,
-            metadata: {},
+            output = await extractTextFromHTML(content)
+          } else {
+            output = content
          }
+          break
        case "html":
-          return {
-            output: content,
-            title,
-            metadata: {},
-          }
+          output = content
+          break
        default:
-          return {
-            output: content,
-            title,
-            metadata: {},
-          }
+          output = content
      }

if (ctx.extra?.providerID && ctx.extra?.modelID) {
const model = await Provider.getModel(ctx.extra.providerID, ctx.extra.modelID)
const messages = await Session.messages(ctx.sessionID)

let currentTokens = 0
for (const msg of messages) {
if (msg.info.role === "assistant") {
currentTokens +=
msg.info.tokens.input + msg.info.tokens.cache.read + msg.info.tokens.output
}
}

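      // Remaining budget: the model's context window minus the reserved output
      // tokens minus tokens already consumed by this session's messages.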
const contextLimit = model.info.limit.context
if (contextLimit > 0) {
const outputReserve =
Math.min(model.info.limit.output, SessionPrompt.OUTPUT_TOKEN_MAX) ||
SessionPrompt.OUTPUT_TOKEN_MAX
const availableTokens = contextLimit - outputReserve - currentTokens
const outputTokens = Token.estimate(output)

if (params.search && outputTokens > SEARCH_THRESHOLD) {
const chunks = chunkContent(output)
const relevantChunks = searchChunks(chunks, params.search)

if (relevantChunks.length > 0) {
const sortedChunks = relevantChunks.sort((a, b) => a.position - b.position)
output = mergeChunks(sortedChunks, Math.floor(availableTokens * 0.8))
output += `\n\n[Search Results: Found ${relevantChunks.length} relevant sections for "${params.search}". Showing ~${Token.estimate(output)} tokens from ${chunks.length} total sections. Use a different search query to find other content.]`
} else {
output = generateTableOfContents(chunks)
output += `\n\n[No results found for "${params.search}". Above is the table of contents with ~${chunks.length} sections. Try a different search query or fetch without the search parameter to see the full content.]`
}
} else if (outputTokens > availableTokens && availableTokens > 0) {
if (!params.search && outputTokens > SEARCH_THRESHOLD) {
const targetLength = Math.floor((availableTokens / outputTokens) * output.length * 0.8)
const truncatedTokens = Token.estimate(output.slice(0, targetLength))
output =
output.slice(0, targetLength) +
`\n\n[Content truncated: Original size was ~${outputTokens} tokens, reduced to ~${truncatedTokens} tokens due to context limit. Available context: ${availableTokens} tokens. The above content is approximately ${Math.round((targetLength / output.length) * 100)}% of the full page. Tip: Use the search parameter to find specific content: webfetch(url, format, search="your query")]`
} else {
const targetLength = Math.floor((availableTokens / outputTokens) * output.length * 0.8)
const truncatedTokens = Token.estimate(output.slice(0, targetLength))
output =
output.slice(0, targetLength) +
`\n\n[Content truncated: Original size was ~${outputTokens} tokens, reduced to ~${truncatedTokens} tokens due to context limit. Available context: ${availableTokens} tokens. The above content is approximately ${Math.round((targetLength / output.length) * 100)}% of the full page.]`
}
}
}
}

return {
output,
title,
metadata: {},
}
},
})
@@ -158,7 +353,9 @@ async function extractTextFromHTML(html: string) {
.on("*", {
element(element) {
// Reset skip flag when entering other elements
-        if (!["script", "style", "noscript", "iframe", "object", "embed"].includes(element.tagName)) {
+        if (
+          !["script", "style", "noscript", "iframe", "object", "embed"].includes(element.tagName)
+        ) {
skipContent = false
}
},
24 changes: 17 additions & 7 deletions packages/opencode/src/tool/webfetch.txt
@@ -1,14 +1,24 @@
- Fetches content from a specified URL
-- Takes a URL and a prompt as input
-- Fetches the URL content, converts HTML to markdown
-- Returns the model's response about the content
-- Use this tool when you need to retrieve and analyze web content
+- Can search within large pages to extract only relevant sections
+- Converts HTML to markdown, text, or returns raw HTML

Parameters:
- url: The URL to fetch content from
- format: Return format (text, markdown, or html)
- search: Optional search query to find specific content in large pages
- timeout: Optional timeout in seconds (max 120)

When to use search:
- If you expect a page to be long and only need specific sections
- If webfetch returns truncated results, use search to get the parts you need
- Search returns only relevant sections that match your query
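- If nothing matches, a table of contents for the page is returned instead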

Usage notes:
- IMPORTANT: if another tool is present that offers better web fetching capabilities, is more targeted to the task, or has fewer restrictions, prefer using that tool instead of this one.
- The URL must be a fully-formed valid URL
- HTTP URLs will be automatically upgraded to HTTPS
-- The prompt should describe what information you want to extract from the page
- This tool is read-only and does not modify any files
-- Results may be summarized if the content is very large
-- Includes a self-cleaning 15-minute cache for faster responses when repeatedly accessing the same URL

Examples:
- webfetch(url, format="markdown") - Fetch full page
- webfetch(url, format="markdown", search="economic indicators and GDP") - Get specific sections only
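- webfetch(url, format="text", timeout=60) - Fetch as plain text with a longer timeout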