Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
193 changes: 182 additions & 11 deletions linkedin_mcp_server/scraping/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,29 +358,200 @@ async def scrape_job(self, job_id: str) -> dict[str, Any]:
"sections_requested": ["job_posting"],
}

async def _extract_job_listings(self) -> list[dict[str, str]]:
    """Collect unique job IDs and titles from /jobs/view/ links on the page.

    The extraction runs as a single script inside the page context, so all
    links are processed in one Playwright round-trip. Results are
    deduplicated by job ID and links without a visible title are skipped.

    Returns:
        A list of ``{"job_id": str, "title": str}`` dicts.
    """
    script = """() => {
        const links = document.querySelectorAll('a[href*="/jobs/view/"]');
        const seen = new Set();
        const results = [];

        for (const link of links) {
            const match = link.href.match(/\\/jobs\\/view\\/(\\d+)/);
            if (!match || seen.has(match[1])) continue;
            seen.add(match[1]);

            // First line of the link text is the job title.
            const title = link.innerText.trim().split('\\n')[0];
            if (!title) continue;

            results.push({job_id: match[1], title: title});
        }
        return results;
    }"""
    return await self._page.evaluate(script)

async def _scroll_job_list(
    self, pause_time: float = 0.8, max_scrolls: int = 25
) -> None:
    """Scroll the job list sidebar until no new lazy-loaded cards appear.

    Rather than depending on LinkedIn's CSS class names, this walks up from
    the first job link and scrolls every scrollable ancestor to the bottom,
    plus the window itself as a fallback for page-level-scroll layouts.

    Args:
        pause_time: Seconds to wait between scroll attempts.
        max_scrolls: Upper bound on scroll attempts.
    """
    count_script = (
        """() => document.querySelectorAll('a[href*="/jobs/view/"]').length"""
    )
    scroll_script = """() => {
        const jobLink = document.querySelector('a[href*="/jobs/view/"]');
        if (!jobLink) return;

        let el = jobLink.parentElement;
        while (el && el !== document.body) {
            // 10px buffer to ignore minor rounding/border differences
            if (el.scrollHeight > el.clientHeight + 10) {
                el.scrollTop = el.scrollHeight;
            }
            el = el.parentElement;
        }

        window.scrollTo(0, document.body.scrollHeight);
    }"""

    for attempt in range(max_scrolls):
        before = await self._page.evaluate(count_script)

        # Scroll every scrollable ancestor of the job list.
        await self._page.evaluate(scroll_script)

        await asyncio.sleep(pause_time)

        after = await self._page.evaluate(count_script)
        logger.debug("Scroll %d: job links %d -> %d", attempt + 1, before, after)

        if after != before:
            # New cards appeared; keep scrolling.
            continue
        if attempt == 0:
            # No growth on the very first attempt may just mean slow
            # loading — wait a little longer and retry.
            await asyncio.sleep(pause_time * 2)
            continue
        logger.debug("No new jobs after scroll %d, stopping", attempt + 1)
        break

async def _extract_job_page(self, url: str) -> tuple[str, list[dict[str, str]]]:
    """Load one job search results page and return its text plus listings.

    Args:
        url: Fully-formed job search results URL to navigate to.

    Returns:
        ``(text, listings)`` where ``text`` is the noise-stripped page text
        (empty string if nothing was extracted) and ``listings`` is the
        structured ``[{job_id, title}]`` list (empty on extraction failure).
    """
    await self._page.goto(url, wait_until="domcontentloaded", timeout=30000)
    await detect_rate_limit(self._page)

    try:
        await self._page.wait_for_selector("main", timeout=5000)
    except PlaywrightTimeoutError:
        logger.debug("No <main> element found on %s", url)

    await handle_modal_close(self._page)

    # Force lazy-rendered job cards to load before reading the DOM.
    await self._scroll_job_list(pause_time=0.8, max_scrolls=20)

    # Prefer <main> text; fall back to the whole body.
    raw_text = await self._page.evaluate(
        """() => {
            const main = document.querySelector('main');
            return main ? main.innerText : document.body.innerText;
        }"""
    )
    page_text = strip_linkedin_noise(raw_text) if raw_text else ""

    # Structured listing extraction is best-effort: a failure here should
    # not discard the page text we already have.
    listings: list[dict[str, str]] = []
    try:
        listings = await self._extract_job_listings()
    except Exception as exc:
        logger.warning("Failed to extract job listings from %s: %s", url, exc)

    return page_text, listings

async def search_jobs(
    self,
    keywords: str,
    location: str | None = None,
    max_pages: int = 3,
) -> dict[str, Any]:
    """Search for jobs and extract results across multiple pages.

    Args:
        keywords: Search keywords.
        location: Optional location filter.
        max_pages: Number of search result pages to scrape (1-100, default 3).
            Each page has ~10 results. Higher values take longer and
            increase the risk of rate limiting.

    Returns:
        {url, sections: {name: text}, job_listings: [{job_id, title}],
        pages_visited, sections_requested}
    """
    # Clamp to the documented 1-100 range.
    max_pages = max(1, min(max_pages, 100))

    base_params = f"keywords={quote_plus(keywords)}"
    if location:
        base_params += f"&location={quote_plus(location)}"

    all_listings: list[dict[str, str]] = []
    seen_ids: set[str] = set()
    all_text_parts: list[str] = []
    pages_visited: list[str] = []
    start = 0

    for page_num in range(max_pages):
        params = base_params if start == 0 else f"{base_params}&start={start}"
        url = f"https://www.linkedin.com/jobs/search/?{params}"

        try:
            text, listings = await self._extract_job_page(url)
        except LinkedInScraperException:
            # Rate limiting / auth failures must propagate to the caller.
            raise
        except Exception as e:
            logger.warning("Failed to load job search page %d: %s", page_num + 1, e)
            break

        pages_visited.append(url)
        if text:
            all_text_parts.append(text)

        # Advance offset by the number of listings found on this page;
        # fall back to LinkedIn's page size if none were parsed.
        start += len(listings) if listings else 25

        # Deduplicate across pages.
        new_on_page = 0
        for listing in listings:
            if listing["job_id"] not in seen_ids:
                seen_ids.add(listing["job_id"])
                all_listings.append(listing)
                new_on_page += 1

        logger.info(
            "Page %d: found %d listings (%d new, %d total)",
            page_num + 1,
            len(listings),
            new_on_page,
            len(all_listings),
        )

        # Stop early if this page returned nothing new.
        if new_on_page == 0:
            logger.info(
                "No new listings on page %d, stopping pagination", page_num + 1
            )
            break

        # Delay between pages to avoid rate limiting.
        if page_num < max_pages - 1:
            await asyncio.sleep(_NAV_DELAY)

    sections: dict[str, str] = {}
    if all_text_parts:
        sections["search_results"] = "\n\n---\n\n".join(all_text_parts)

    first_url = f"https://www.linkedin.com/jobs/search/?{base_params}"
    return {
        "url": first_url,
        "sections": sections,
        "job_listings": all_listings,
        "pages_visited": pages_visited,
        "sections_requested": ["search_results"],
    }

Expand Down
13 changes: 9 additions & 4 deletions linkedin_mcp_server/tools/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ async def search_jobs(
keywords: str,
ctx: Context,
location: str | None = None,
max_pages: int = 3,
) -> dict[str, Any]:
"""
Search for jobs on LinkedIn.
Expand All @@ -84,18 +85,22 @@ async def search_jobs(
keywords: Search keywords (e.g., "software engineer", "data scientist")
ctx: FastMCP context for progress reporting
location: Optional location filter (e.g., "San Francisco", "Remote")
max_pages: Number of result pages to scrape (1-100, default 3).
Each page returns ~10 jobs. Higher values are slower
and increase rate-limit risk.

Returns:
Dict with url, sections (name -> raw text), pages_visited, and sections_requested.
The LLM should parse the raw text to extract job listings.
Dict with url, sections (name -> raw text), job_listings (list of
{job_id, title} dicts), pages_visited, and sections_requested.
"""
try:
await ensure_authenticated()

logger.info(
"Searching jobs: keywords='%s', location='%s'",
"Searching jobs: keywords='%s', location='%s', max_pages=%d",
keywords,
location,
max_pages,
)

browser = await get_or_create_browser()
Expand All @@ -105,7 +110,7 @@ async def search_jobs(
progress=0, total=100, message="Starting job search"
)

result = await extractor.search_jobs(keywords, location)
result = await extractor.search_jobs(keywords, location, max_pages)

await ctx.report_progress(progress=100, total=100, message="Complete")

Expand Down
Loading