diff --git a/AGENTS.md b/AGENTS.md index 94d20690..1317f588 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -60,6 +60,7 @@ This is a **LinkedIn MCP (Model Context Protocol) Server** that enables AI assis | `get_company_posts` | Get recent posts from company feed | | `get_job_details` | Get job posting details | | `search_jobs` | Search jobs by keywords and location | +| `get_saved_jobs` | Get saved/bookmarked jobs from the job tracker (paginated, optional `max_pages`) | | `close_session` | Close browser session and clean up resources | | `search_people` | Search for people by keywords and location | diff --git a/README.md b/README.md index 6d082455..6c00e116 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,7 @@ What has Anthropic been posting about recently? https://www.linkedin.com/company | `search_jobs` | Search for jobs with keywords and location filters | Working | | `search_people` | Search for people by keywords and location | Working | | `get_job_details` | Get detailed information about a specific job posting | Working | +| `get_saved_jobs` | Get saved/bookmarked jobs from your LinkedIn job tracker | Working | | `close_session` | Close browser session and clean up resources | Working | > [!IMPORTANT] diff --git a/docs/docker-hub.md b/docs/docker-hub.md index e122abc5..a4c21a94 100644 --- a/docs/docker-hub.md +++ b/docs/docker-hub.md @@ -8,6 +8,7 @@ A Model Context Protocol (MCP) server that connects AI assistants to LinkedIn. 
A - **Company Profiles**: Extract comprehensive company data - **Job Details**: Retrieve job posting information - **Job Search**: Search for jobs with keywords and location filters +- **Saved Jobs**: Get saved/bookmarked jobs from your LinkedIn job tracker - **People Search**: Search for people by keywords and location - **Company Posts**: Get recent posts from a company's LinkedIn feed diff --git a/linkedin_mcp_server/scraping/extractor.py b/linkedin_mcp_server/scraping/extractor.py index 2a34a397..92842144 100644 --- a/linkedin_mcp_server/scraping/extractor.py +++ b/linkedin_mcp_server/scraping/extractor.py @@ -3,6 +3,7 @@ import asyncio import logging import re +from collections.abc import Awaitable, Callable from typing import Any from urllib.parse import quote_plus @@ -384,6 +385,141 @@ async def search_jobs( "sections_requested": ["search_results"], } + _EXTRACT_JOB_IDS_JS = """() => { + const seen = new Set(); + const ids = []; + document.querySelectorAll('a[href*="/jobs/view/"]').forEach(a => { + const match = a.href.match(/\\/jobs\\/view\\/(\\d+)/); + if (match && !seen.has(match[1])) { seen.add(match[1]); ids.push(match[1]); } + }); + return ids; + }""" + + _EXTRACT_MAIN_TEXT_JS = """() => { + const main = document.querySelector('main'); + return main ? main.innerText : document.body.innerText; + }""" + + async def scrape_saved_jobs( + self, + max_pages: int = 10, + on_progress: Callable[[int, int, str], Awaitable[None]] | None = None, + ) -> dict[str, Any]: + """Scrape the user's saved/bookmarked jobs from the jobs tracker page. + + Automatically paginates through all pages using numbered page buttons. + Extracts job IDs from link hrefs (``/jobs/view/{job_id}/``) since they are + not present in the page's innerText. + + Args: + max_pages: Safety cap on pages to scrape (default 10). + on_progress: Optional async callback ``(page, total, message)`` + invoked after each page is scraped. 
+ + Returns: + {url, sections: {name: text}, pages_visited, sections_requested, + job_ids: list[str]} + """ + url = "https://www.linkedin.com/jobs-tracker/" + text = await self.extract_page(url) + + all_text_parts: list[str] = [] + all_job_ids: list[str] = [] + + if text: + all_text_parts.append(text) + + # Collect job IDs from page 1. + page_ids: list[str] = await self._page.evaluate(self._EXTRACT_JOB_IDS_JS) + all_job_ids.extend(page_ids) + logger.info("Page 1: found %d job IDs", len(page_ids)) + + # Determine total pages from pagination buttons (10 jobs per page). + page_buttons = self._page.locator('button[aria-label^="Page "]') + total_pages = min(max(await page_buttons.count(), 1), max_pages) + logger.info("Total pages detected: %d", total_pages) + + if on_progress: + await on_progress(1, total_pages, "Fetched saved jobs page 1") + + # Paginate through remaining pages using numbered page buttons. + for page_num in range(2, max_pages + 1): + page_btn = self._page.locator(f'button[aria-label="Page {page_num}"]') + if not await page_btn.count(): + logger.info( + "No page %d button found — stopping at page %d", + page_num, + page_num - 1, + ) + break + + logger.info("Navigating to saved jobs page %d", page_num) + prev_ids = set(all_job_ids) + await page_btn.scroll_into_view_if_needed() + await page_btn.click() + await asyncio.sleep(_NAV_DELAY) + + # Wait for the DOM to reflect new job links. 
+ try: + await self._page.wait_for_function( + """(prevIds) => { + const prev = new Set(prevIds); + const links = document.querySelectorAll('a[href*="/jobs/view/"]'); + for (const a of links) { + const match = a.href.match(/\\/jobs\\/view\\/(\\d+)/); + if (match && !prev.has(match[1])) return true; + } + return false; + }""", + arg=list(prev_ids), + timeout=15000, + ) + except PlaywrightTimeoutError: + logger.info("No new job IDs appeared on page %d — stopping", page_num) + break + + await scroll_to_bottom(self._page, pause_time=0.5, max_scrolls=3) + + raw = await self._page.evaluate(self._EXTRACT_MAIN_TEXT_JS) + if raw: + cleaned = strip_linkedin_noise(raw) + if cleaned: + all_text_parts.append(cleaned) + + page_ids = await self._page.evaluate(self._EXTRACT_JOB_IDS_JS) + new_ids = [jid for jid in page_ids if jid not in prev_ids] + logger.info("Page %d: found %d new job IDs", page_num, len(new_ids)) + if not new_ids: + break + all_job_ids.extend(new_ids) + + if on_progress: + await on_progress( + page_num, total_pages, f"Fetched saved jobs page {page_num}" + ) + + # Append a summary of job IDs so they are always visible in the text. 
+ id_summary = "\n".join( + f"- Job ID: {jid} (https://www.linkedin.com/jobs/view/{jid}/)" + for jid in all_job_ids + ) + if id_summary: + all_text_parts.append(f"--- Saved Job IDs ---\n{id_summary}") + + sections: dict[str, str] = {} + if all_text_parts: + sections["saved_jobs"] = "\n\n".join(all_text_parts) + + logger.info("Total saved jobs found: %d across all pages", len(all_job_ids)) + + return { + "url": url, + "sections": sections, + "pages_visited": [url], + "sections_requested": ["saved_jobs"], + "job_ids": all_job_ids, + } + async def search_people( self, keywords: str, diff --git a/linkedin_mcp_server/tools/job.py b/linkedin_mcp_server/tools/job.py index 3eadf552..b45e4a51 100644 --- a/linkedin_mcp_server/tools/job.py +++ b/linkedin_mcp_server/tools/job.py @@ -64,6 +64,53 @@ async def get_job_details(job_id: str, ctx: Context) -> dict[str, Any]: except Exception as e: return handle_tool_error(e, "get_job_details") + @mcp.tool( + annotations=ToolAnnotations( + title="Get Saved Jobs", + readOnlyHint=True, + destructiveHint=False, + openWorldHint=True, + ) + ) + async def get_saved_jobs(ctx: Context, max_pages: int = 10) -> dict[str, Any]: + """ + Get the user's saved/bookmarked jobs from LinkedIn's job tracker. + + Args: + max_pages: Maximum number of pages to scrape (default 10, ~10 jobs/page). + + Returns: + Dict with url, sections (name -> raw text), pages_visited, sections_requested, + and job_ids (list of LinkedIn job ID strings). + The LLM should parse the raw text to extract saved job listings. 
+ """ + try: + await ensure_authenticated() + + logger.info("Scraping saved jobs (max_pages=%d)", max_pages) + + browser = await get_or_create_browser() + extractor = LinkedInExtractor(browser.page) + + await ctx.report_progress( + progress=0, total=100, message="Fetching saved jobs" + ) + + async def _report(page: int, total: int, msg: str) -> None: + pct = min(int(page / max(total, 1) * 100), 99) + await ctx.report_progress(progress=pct, total=100, message=msg) + + result = await extractor.scrape_saved_jobs( + max_pages=max_pages, on_progress=_report + ) + + await ctx.report_progress(progress=100, total=100, message="Complete") + + return result + + except Exception as e: + return handle_tool_error(e, "get_saved_jobs") + @mcp.tool( annotations=ToolAnnotations( title="Search Jobs", diff --git a/tests/test_scraping.py b/tests/test_scraping.py index 7493e153..32a8cc5d 100644 --- a/tests/test_scraping.py +++ b/tests/test_scraping.py @@ -416,6 +416,174 @@ async def test_search_jobs(self, mock_page): assert result["sections_requested"] == ["search_results"] +class TestScrapeSavedJobs: + async def test_scrape_saved_jobs_single_page(self, mock_page): + """Single page of results — no pagination buttons. 
Progress callback fires.""" + mock_page.evaluate = AsyncMock(return_value=["111", "222"]) + mock_next = MagicMock() + mock_next.count = AsyncMock(return_value=0) + mock_page.locator = MagicMock(return_value=mock_next) + on_progress = AsyncMock() + extractor = LinkedInExtractor(mock_page) + with patch.object( + extractor, + "extract_page", + new_callable=AsyncMock, + return_value="Saved Job 1\nSaved Job 2", + ): + result = await extractor.scrape_saved_jobs(on_progress=on_progress) + + assert result["url"] == "https://www.linkedin.com/jobs-tracker/" + assert "saved_jobs" in result["sections"] + assert result["sections_requested"] == ["saved_jobs"] + assert result["job_ids"] == ["111", "222"] + assert "Job ID: 111" in result["sections"]["saved_jobs"] + assert "Job ID: 222" in result["sections"]["saved_jobs"] + on_progress.assert_awaited_once_with(1, 1, "Fetched saved jobs page 1") + + async def test_scrape_saved_jobs_paginates(self, mock_page): + """Clicks page buttons, collects IDs, fires progress, caps total_pages.""" + # Page 1 returns IDs 111, 222; page 2 returns 333, 444 + call_count = 0 + + async def evaluate_side_effect(js, *args): + nonlocal call_count + call_count += 1 + if "jobs/view" in js: + # First call: page 1 IDs; second call: page 2 IDs + if call_count <= 2: + return ["111", "222"] + return ["333", "444"] + if "innerText" in js: + return "Page 2 jobs" + return None + + mock_page.evaluate = AsyncMock(side_effect=evaluate_side_effect) + + # Page button exists for page 2, not for page 3 + page_btn_click_count = 0 + mock_page_btn = MagicMock() + + async def page_btn_count(): + return 1 if page_btn_click_count == 0 else 0 + + mock_page_btn.count = AsyncMock(side_effect=page_btn_count) + mock_page_btn.scroll_into_view_if_needed = AsyncMock() + + async def page_btn_click(): + nonlocal page_btn_click_count + page_btn_click_count += 1 + + mock_page_btn.click = AsyncMock(side_effect=page_btn_click) + mock_page.locator = MagicMock(return_value=mock_page_btn) + 
mock_page.wait_for_function = AsyncMock() + on_progress = AsyncMock() + + extractor = LinkedInExtractor(mock_page) + with ( + patch.object( + extractor, + "extract_page", + new_callable=AsyncMock, + return_value="Page 1 jobs", + ), + patch( + "linkedin_mcp_server.scraping.extractor.scroll_to_bottom", + new_callable=AsyncMock, + ), + patch( + "linkedin_mcp_server.scraping.extractor.asyncio.sleep", + new_callable=AsyncMock, + ), + ): + result = await extractor.scrape_saved_jobs(on_progress=on_progress) + + assert result["job_ids"] == ["111", "222", "333", "444"] + assert "Page 1 jobs" in result["sections"]["saved_jobs"] + assert "Page 2 jobs" in result["sections"]["saved_jobs"] + for jid in ["111", "222", "333", "444"]: + assert f"Job ID: {jid}" in result["sections"]["saved_jobs"] + # Progress was reported for both pages + assert on_progress.await_count == 2 + + async def test_scrape_saved_jobs_timeout_stops_gracefully(self, mock_page): + """PlaywrightTimeoutError on page 2 returns page 1 results only.""" + from patchright.async_api import TimeoutError as PlaywrightTimeoutError + + mock_page.evaluate = AsyncMock(return_value=["111", "222"]) + + mock_page_btn = MagicMock() + mock_page_btn.count = AsyncMock(return_value=1) + mock_page_btn.scroll_into_view_if_needed = AsyncMock() + mock_page_btn.click = AsyncMock() + mock_page.locator = MagicMock(return_value=mock_page_btn) + mock_page.wait_for_function = AsyncMock( + side_effect=PlaywrightTimeoutError("Timeout") + ) + + extractor = LinkedInExtractor(mock_page) + with ( + patch.object( + extractor, + "extract_page", + new_callable=AsyncMock, + return_value="Page 1 jobs", + ), + patch( + "linkedin_mcp_server.scraping.extractor.asyncio.sleep", + new_callable=AsyncMock, + ), + ): + result = await extractor.scrape_saved_jobs() + + assert result["job_ids"] == ["111", "222"] + assert "Job ID: 111" in result["sections"]["saved_jobs"] + assert "Job ID: 222" in result["sections"]["saved_jobs"] + + async def 
test_scrape_saved_jobs_stops_at_max_pages_despite_more_buttons( + self, mock_page + ): + """max_pages=1 stops after page 1 even if more buttons exist.""" + mock_page.evaluate = AsyncMock(return_value=["111", "222"]) + + # Simulate page buttons existing (count=3) but max_pages=1 + mock_page_btn = MagicMock() + mock_page_btn.count = AsyncMock(return_value=3) + mock_page.locator = MagicMock(return_value=mock_page_btn) + + extractor = LinkedInExtractor(mock_page) + with patch.object( + extractor, + "extract_page", + new_callable=AsyncMock, + return_value="Page 1 jobs", + ): + result = await extractor.scrape_saved_jobs(max_pages=1) + + assert result["job_ids"] == ["111", "222"] + # click should never have been called (loop range(2, 2) is empty) + mock_page_btn.click.assert_not_called() + + async def test_scrape_saved_jobs_empty(self, mock_page): + mock_page.evaluate = AsyncMock(return_value=[]) + mock_next = MagicMock() + mock_next.count = AsyncMock(return_value=0) + mock_page.locator = MagicMock(return_value=mock_next) + extractor = LinkedInExtractor(mock_page) + with patch.object( + extractor, + "extract_page", + new_callable=AsyncMock, + return_value="", + ): + result = await extractor.scrape_saved_jobs() + + assert result["url"] == "https://www.linkedin.com/jobs-tracker/" + assert result["sections"] == {} + assert result["sections_requested"] == ["saved_jobs"] + assert result["job_ids"] == [] + + class TestStripLinkedInNoise: def test_strips_footer(self): text = "Bill Gates\nChair, Gates Foundation\n\nAbout\nAccessibility\nTalent Solutions\nCareers" diff --git a/tests/test_tools.py b/tests/test_tools.py index 9f0f1b7f..26a33e88 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -41,6 +41,7 @@ def _make_mock_extractor(scrape_result: dict) -> MagicMock: mock.scrape_job = AsyncMock(return_value=scrape_result) mock.search_jobs = AsyncMock(return_value=scrape_result) mock.search_people = AsyncMock(return_value=scrape_result) + mock.scrape_saved_jobs = 
AsyncMock(return_value=scrape_result) mock.extract_page = AsyncMock(return_value="some text") return mock @@ -223,6 +224,47 @@ async def test_get_job_details(self, mock_context, patch_tool_deps, monkeypatch) result = await tool_fn("12345", mock_context) assert "job_posting" in result["sections"] + async def test_get_saved_jobs(self, mock_context, patch_tool_deps, monkeypatch): + expected = { + "url": "https://www.linkedin.com/jobs-tracker/", + "sections": {"saved_jobs": "Saved Job 1\nSaved Job 2"}, + "pages_visited": ["https://www.linkedin.com/jobs-tracker/"], + "sections_requested": ["saved_jobs"], + "job_ids": ["111", "222"], + } + mock_extractor = _make_mock_extractor(expected) + monkeypatch.setattr( + "linkedin_mcp_server.tools.job.LinkedInExtractor", + lambda *a, **kw: mock_extractor, + ) + + from linkedin_mcp_server.tools.job import register_job_tools + + mcp = FastMCP("test") + register_job_tools(mcp) + + tool_fn = await get_tool_fn(mcp, "get_saved_jobs") + result = await tool_fn(mock_context) + assert "saved_jobs" in result["sections"] + assert result["url"] == "https://www.linkedin.com/jobs-tracker/" + + async def test_get_saved_jobs_error(self, mock_context, monkeypatch): + from linkedin_mcp_server.exceptions import SessionExpiredError + + monkeypatch.setattr( + "linkedin_mcp_server.tools.job.ensure_authenticated", + AsyncMock(side_effect=SessionExpiredError()), + ) + + from linkedin_mcp_server.tools.job import register_job_tools + + mcp = FastMCP("test") + register_job_tools(mcp) + + tool_fn = await get_tool_fn(mcp, "get_saved_jobs") + result = await tool_fn(mock_context) + assert result["error"] == "session_expired" + async def test_search_jobs(self, mock_context, patch_tool_deps, monkeypatch): expected = { "url": "https://www.linkedin.com/jobs/search/?keywords=python",