Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 135 additions & 0 deletions linkedin_mcp_server/scraping/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import asyncio
import logging
import re
from collections.abc import Awaitable, Callable
from typing import Any
from urllib.parse import quote_plus

Expand Down Expand Up @@ -384,6 +385,140 @@ async def search_jobs(
"sections_requested": ["search_results"],
}

_EXTRACT_JOB_IDS_JS = """() => {
const ids = [];
document.querySelectorAll('a[href*="/jobs/view/"]').forEach(a => {
const match = a.href.match(/\\/jobs\\/view\\/(\\d+)/);
if (match && !ids.includes(match[1])) ids.push(match[1]);
});
return ids;
}"""

_EXTRACT_MAIN_TEXT_JS = """() => {
const main = document.querySelector('main');
return main ? main.innerText : document.body.innerText;
}"""

async def scrape_saved_jobs(
    self,
    max_pages: int = 10,
    on_progress: Callable[[int, int, str], Awaitable[None]] | None = None,
) -> dict[str, Any]:
    """Scrape the user's saved/bookmarked jobs from the jobs tracker page.

    Automatically paginates through all pages using numbered page buttons.
    Extracts job IDs from link hrefs (``/jobs/view/<id>/``) since they are
    not present in the page's innerText.

    Pagination stops early when: the numbered page button is missing, no
    new job links appear within the wait timeout, or a page yields no
    job IDs not already collected.

    Args:
        max_pages: Safety cap on pages to scrape (default 10).
        on_progress: Optional async callback ``(page, total, message)``
            invoked after each page is scraped.

    Returns:
        {url, sections: {name: text}, pages_visited, sections_requested,
        job_ids: list[str]}

        Note: ``pages_visited`` contains only the tracker URL even when
        multiple pages are clicked through, since pagination happens
        in-place without navigation.
    """
    url = "https://www.linkedin.com/jobs-tracker/"
    text = await self.extract_page(url)

    all_text_parts: list[str] = []
    all_job_ids: list[str] = []

    # Only attempt ID extraction and pagination if page 1 produced text;
    # an empty extraction means the tracker page did not load usefully.
    if text:
        all_text_parts.append(text)

        # Collect job IDs from page 1.
        page_ids: list[str] = await self._page.evaluate(self._EXTRACT_JOB_IDS_JS)
        all_job_ids.extend(page_ids)
        logger.info("Page 1: found %d job IDs", len(page_ids))

        # Determine total pages from pagination buttons (10 jobs per page).
        # total_pages is informational (used in progress reports); the loop
        # below is bounded by max_pages and stops when buttons run out.
        page_buttons = self._page.locator('button[aria-label^="Page "]')
        total_pages = min(max(await page_buttons.count(), 1), max_pages)
        logger.info("Total pages detected: %d", total_pages)

        if on_progress:
            await on_progress(1, total_pages, "Fetched saved jobs page 1")

        # Paginate through remaining pages using numbered page buttons.
        for page_num in range(2, max_pages + 1):
            page_btn = self._page.locator(f'button[aria-label="Page {page_num}"]')
            if not await page_btn.count():
                logger.info(
                    "No page %d button found — stopping at page %d",
                    page_num,
                    page_num - 1,
                )
                break

            logger.info("Navigating to saved jobs page %d", page_num)
            # Snapshot IDs seen so far; used to detect when the DOM has
            # actually swapped in the next page's results.
            prev_ids = set(all_job_ids)
            await page_btn.scroll_into_view_if_needed()
            await page_btn.click()
            await asyncio.sleep(_NAV_DELAY)

            # Wait for the DOM to reflect new job links.
            try:
                await self._page.wait_for_function(
                    """(prevIds) => {
                        const prev = new Set(prevIds);
                        const links = document.querySelectorAll('a[href*="/jobs/view/"]');
                        for (const a of links) {
                            const match = a.href.match(/\\/jobs\\/view\\/(\\d+)/);
                            if (match && !prev.has(match[1])) return true;
                        }
                        return false;
                    }""",
                    arg=list(prev_ids),
                    timeout=15000,
                )
            except PlaywrightTimeoutError:
                # Treat "nothing new rendered" as the end of pagination
                # rather than an error: return what was collected so far.
                logger.info("No new job IDs appeared on page %d — stopping", page_num)
                break

            # Trigger lazy-loaded cards before reading the page text.
            await scroll_to_bottom(self._page, pause_time=0.5, max_scrolls=3)

            raw = await self._page.evaluate(self._EXTRACT_MAIN_TEXT_JS)
            if raw:
                cleaned = strip_linkedin_noise(raw)
                if cleaned:
                    all_text_parts.append(cleaned)

            page_ids = await self._page.evaluate(self._EXTRACT_JOB_IDS_JS)
            new_ids = [jid for jid in page_ids if jid not in prev_ids]
            logger.info("Page %d: found %d new job IDs", page_num, len(new_ids))
            if not new_ids:
                break
            all_job_ids.extend(new_ids)

            if on_progress:
                await on_progress(
                    page_num, total_pages, f"Fetched saved jobs page {page_num}"
                )

    # Append a summary of job IDs so they are always visible in the text.
    id_summary = "\n".join(
        f"- Job ID: {jid} (https://www.linkedin.com/jobs/view/{jid}/)"
        for jid in all_job_ids
    )
    if id_summary:
        all_text_parts.append(f"--- Saved Job IDs ---\n{id_summary}")

    sections: dict[str, str] = {}
    if all_text_parts:
        sections["saved_jobs"] = "\n\n".join(all_text_parts)

    logger.info("Total saved jobs found: %d across all pages", len(all_job_ids))

    return {
        "url": url,
        "sections": sections,
        "pages_visited": [url],
        "sections_requested": ["saved_jobs"],
        "job_ids": all_job_ids,
    }

async def search_people(
self,
keywords: str,
Expand Down
42 changes: 42 additions & 0 deletions linkedin_mcp_server/tools/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,48 @@ async def get_job_details(job_id: str, ctx: Context) -> dict[str, Any]:
except Exception as e:
return handle_tool_error(e, "get_job_details")

@mcp.tool(
    annotations=ToolAnnotations(
        title="Get Saved Jobs",
        readOnlyHint=True,
        destructiveHint=False,
        openWorldHint=True,
    )
)
async def get_saved_jobs(ctx: Context) -> dict[str, Any]:
    """
    Retrieve the jobs the user has saved/bookmarked in LinkedIn's job tracker.

    Reports scraping progress to the client as each tracker page is fetched.

    Returns:
        Dict with url, sections (name -> raw text), pages_visited,
        sections_requested, and job_ids (list of LinkedIn job ID strings).
        The LLM should parse the raw text to extract saved job listings.
    """
    try:
        await ensure_authenticated()

        logger.info("Scraping saved jobs")

        browser = await get_or_create_browser()
        scraper = LinkedInExtractor(browser.page)

        await ctx.report_progress(
            progress=0, total=100, message="Fetching saved jobs"
        )

        async def _forward_progress(page_num: int, page_total: int, note: str) -> None:
            # Map page position onto 0-99%; 100% is reserved for the
            # final "Complete" report after scraping finishes.
            denominator = max(page_total, 1)
            percent = min(int(page_num / denominator * 100), 99)
            await ctx.report_progress(progress=percent, total=100, message=note)

        scraped = await scraper.scrape_saved_jobs(on_progress=_forward_progress)

        await ctx.report_progress(progress=100, total=100, message="Complete")

        return scraped

    except Exception as e:
        return handle_tool_error(e, "get_saved_jobs")

@mcp.tool(
annotations=ToolAnnotations(
title="Search Jobs",
Expand Down
168 changes: 168 additions & 0 deletions tests/test_scraping.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,174 @@ async def test_search_jobs(self, mock_page):
assert result["sections_requested"] == ["search_results"]


class TestScrapeSavedJobs:
    """Unit tests for ``LinkedInExtractor.scrape_saved_jobs`` with a mocked page."""

    async def test_scrape_saved_jobs_single_page(self, mock_page):
        """Single page of results — no Next button. Progress callback fires."""
        # Every evaluate() call (ID extraction) returns the same two IDs.
        mock_page.evaluate = AsyncMock(return_value=["111", "222"])
        mock_next = MagicMock()
        # count() == 0: no pagination buttons, so scraping stops at page 1.
        mock_next.count = AsyncMock(return_value=0)
        mock_page.locator = MagicMock(return_value=mock_next)
        on_progress = AsyncMock()
        extractor = LinkedInExtractor(mock_page)
        with patch.object(
            extractor,
            "extract_page",
            new_callable=AsyncMock,
            return_value="Saved Job 1\nSaved Job 2",
        ):
            result = await extractor.scrape_saved_jobs(on_progress=on_progress)

        assert result["url"] == "https://www.linkedin.com/jobs-tracker/"
        assert "saved_jobs" in result["sections"]
        assert result["sections_requested"] == ["saved_jobs"]
        assert result["job_ids"] == ["111", "222"]
        # The ID summary block is appended to the section text.
        assert "Job ID: 111" in result["sections"]["saved_jobs"]
        assert "Job ID: 222" in result["sections"]["saved_jobs"]
        on_progress.assert_awaited_once_with(1, 1, "Fetched saved jobs page 1")

    async def test_scrape_saved_jobs_paginates(self, mock_page):
        """Clicks page buttons, collects IDs, fires progress, caps total_pages."""
        # Page 1 returns IDs 111, 222; page 2 returns 333, 444
        call_count = 0

        async def evaluate_side_effect(js, *args):
            # Dispatch on the JS source: ID-extraction snippets contain
            # "jobs/view", the text snippet contains "innerText".
            nonlocal call_count
            call_count += 1
            if "jobs/view" in js:
                # First call: page 1 IDs; second call: page 2 IDs
                if call_count <= 2:
                    return ["111", "222"]
                return ["333", "444"]
            if "innerText" in js:
                return "Page 2 jobs"
            return None

        mock_page.evaluate = AsyncMock(side_effect=evaluate_side_effect)

        # Page button exists for page 2, not for page 3
        page_btn_click_count = 0
        mock_page_btn = MagicMock()

        async def page_btn_count():
            # Before the first click the page-2 button exists (count=1);
            # after clicking, the page-3 lookup finds nothing (count=0).
            return 1 if page_btn_click_count == 0 else 0

        mock_page_btn.count = AsyncMock(side_effect=page_btn_count)
        mock_page_btn.scroll_into_view_if_needed = AsyncMock()

        async def page_btn_click():
            nonlocal page_btn_click_count
            page_btn_click_count += 1

        mock_page_btn.click = AsyncMock(side_effect=page_btn_click)
        mock_page.locator = MagicMock(return_value=mock_page_btn)
        mock_page.wait_for_function = AsyncMock()
        on_progress = AsyncMock()

        extractor = LinkedInExtractor(mock_page)
        # Patch out real waiting (scroll/sleep) so the test runs instantly.
        with (
            patch.object(
                extractor,
                "extract_page",
                new_callable=AsyncMock,
                return_value="Page 1 jobs",
            ),
            patch(
                "linkedin_mcp_server.scraping.extractor.scroll_to_bottom",
                new_callable=AsyncMock,
            ),
            patch(
                "linkedin_mcp_server.scraping.extractor.asyncio.sleep",
                new_callable=AsyncMock,
            ),
        ):
            result = await extractor.scrape_saved_jobs(on_progress=on_progress)

        assert result["job_ids"] == ["111", "222", "333", "444"]
        assert "Page 1 jobs" in result["sections"]["saved_jobs"]
        assert "Page 2 jobs" in result["sections"]["saved_jobs"]
        for jid in ["111", "222", "333", "444"]:
            assert f"Job ID: {jid}" in result["sections"]["saved_jobs"]
        # Progress was reported for both pages
        assert on_progress.await_count == 2

    async def test_scrape_saved_jobs_timeout_stops_gracefully(self, mock_page):
        """PlaywrightTimeoutError on page 2 returns page 1 results only."""
        from patchright.async_api import TimeoutError as PlaywrightTimeoutError

        mock_page.evaluate = AsyncMock(return_value=["111", "222"])

        # A page-2 button exists, but waiting for new job links times out.
        mock_page_btn = MagicMock()
        mock_page_btn.count = AsyncMock(return_value=1)
        mock_page_btn.scroll_into_view_if_needed = AsyncMock()
        mock_page_btn.click = AsyncMock()
        mock_page.locator = MagicMock(return_value=mock_page_btn)
        mock_page.wait_for_function = AsyncMock(
            side_effect=PlaywrightTimeoutError("Timeout")
        )

        extractor = LinkedInExtractor(mock_page)
        with (
            patch.object(
                extractor,
                "extract_page",
                new_callable=AsyncMock,
                return_value="Page 1 jobs",
            ),
            patch(
                "linkedin_mcp_server.scraping.extractor.asyncio.sleep",
                new_callable=AsyncMock,
            ),
        ):
            result = await extractor.scrape_saved_jobs()

        assert result["job_ids"] == ["111", "222"]
        assert "Job ID: 111" in result["sections"]["saved_jobs"]
        assert "Job ID: 222" in result["sections"]["saved_jobs"]

    async def test_scrape_saved_jobs_stops_at_max_pages_despite_more_buttons(
        self, mock_page
    ):
        """max_pages=1 stops after page 1 even if more buttons exist."""
        mock_page.evaluate = AsyncMock(return_value=["111", "222"])

        # Simulate page buttons existing (count=3) but max_pages=1
        mock_page_btn = MagicMock()
        mock_page_btn.count = AsyncMock(return_value=3)
        mock_page.locator = MagicMock(return_value=mock_page_btn)

        extractor = LinkedInExtractor(mock_page)
        with patch.object(
            extractor,
            "extract_page",
            new_callable=AsyncMock,
            return_value="Page 1 jobs",
        ):
            result = await extractor.scrape_saved_jobs(max_pages=1)

        assert result["job_ids"] == ["111", "222"]
        # click should never have been called (loop range(2, 2) is empty)
        mock_page_btn.click.assert_not_called()

    async def test_scrape_saved_jobs_empty(self, mock_page):
        """Empty tracker page yields empty sections and no job IDs."""
        mock_page.evaluate = AsyncMock(return_value=[])
        mock_next = MagicMock()
        mock_next.count = AsyncMock(return_value=0)
        mock_page.locator = MagicMock(return_value=mock_next)
        extractor = LinkedInExtractor(mock_page)
        with patch.object(
            extractor,
            "extract_page",
            new_callable=AsyncMock,
            return_value="",
        ):
            result = await extractor.scrape_saved_jobs()

        assert result["url"] == "https://www.linkedin.com/jobs-tracker/"
        assert result["sections"] == {}
        assert result["sections_requested"] == ["saved_jobs"]
        assert result["job_ids"] == []


class TestStripLinkedInNoise:
def test_strips_footer(self):
text = "Bill Gates\nChair, Gates Foundation\n\nAbout\nAccessibility\nTalent Solutions\nCareers"
Expand Down
Loading