Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
193 changes: 182 additions & 11 deletions linkedin_mcp_server/scraping/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,29 +358,200 @@ async def scrape_job(self, job_id: str) -> dict[str, Any]:
"sections_requested": ["job_posting"],
}

async def _extract_job_listings(self) -> list[dict[str, str]]:
    """Collect unique job IDs and titles from /jobs/view/ links on the page.

    The extraction runs as a single script inside the page context, so all
    links are processed in one Playwright round-trip. Results are
    deduplicated by job ID and links without a visible title are skipped.

    Returns:
        A list of ``{"job_id": str, "title": str}`` dicts.
    """
    script = """() => {
        const links = document.querySelectorAll('a[href*="/jobs/view/"]');
        const seen = new Set();
        const results = [];

        for (const link of links) {
            const match = link.href.match(/\\/jobs\\/view\\/(\\d+)/);
            if (!match || seen.has(match[1])) continue;
            seen.add(match[1]);

            // First line of the link text is the job title.
            const title = link.innerText.trim().split('\\n')[0];
            if (!title) continue;

            results.push({job_id: match[1], title: title});
        }
        return results;
    }"""
    return await self._page.evaluate(script)

async def _scroll_job_list(
    self, pause_time: float = 0.8, max_scrolls: int = 25
) -> None:
    """Scroll the job list sidebar until no new lazy-loaded cards appear.

    Rather than depending on LinkedIn's CSS class names, this walks up from
    the first job link and scrolls every scrollable ancestor to the bottom,
    plus the window itself as a fallback for page-level-scroll layouts.

    Args:
        pause_time: Seconds to wait between scroll attempts.
        max_scrolls: Upper bound on scroll attempts.
    """
    count_script = (
        """() => document.querySelectorAll('a[href*="/jobs/view/"]').length"""
    )
    scroll_script = """() => {
        const jobLink = document.querySelector('a[href*="/jobs/view/"]');
        if (!jobLink) return;

        let el = jobLink.parentElement;
        while (el && el !== document.body) {
            // 10px buffer to ignore minor rounding/border differences
            if (el.scrollHeight > el.clientHeight + 10) {
                el.scrollTop = el.scrollHeight;
            }
            el = el.parentElement;
        }

        window.scrollTo(0, document.body.scrollHeight);
    }"""

    for attempt in range(max_scrolls):
        before = await self._page.evaluate(count_script)

        # Scroll every scrollable ancestor of the job list.
        await self._page.evaluate(scroll_script)

        await asyncio.sleep(pause_time)

        after = await self._page.evaluate(count_script)
        logger.debug("Scroll %d: job links %d -> %d", attempt + 1, before, after)

        if after != before:
            # New cards appeared; keep scrolling.
            continue
        if attempt == 0:
            # No growth on the very first attempt may just mean slow
            # loading — wait a little longer and retry.
            await asyncio.sleep(pause_time * 2)
            continue
        logger.debug("No new jobs after scroll %d, stopping", attempt + 1)
        break

async def _extract_job_page(self, url: str) -> tuple[str, list[dict[str, str]]]:
    """Load one job search results page and return its text plus listings.

    Args:
        url: Fully-formed job search results URL to navigate to.

    Returns:
        ``(text, listings)`` where ``text`` is the noise-stripped page text
        (empty string if nothing was extracted) and ``listings`` is the
        structured ``[{job_id, title}]`` list (empty on extraction failure).
    """
    await self._page.goto(url, wait_until="domcontentloaded", timeout=30000)
    await detect_rate_limit(self._page)

    try:
        await self._page.wait_for_selector("main", timeout=5000)
    except PlaywrightTimeoutError:
        logger.debug("No <main> element found on %s", url)

    await handle_modal_close(self._page)

    # Force lazy-rendered job cards to load before reading the DOM.
    await self._scroll_job_list(pause_time=0.8, max_scrolls=20)

    # Prefer <main> text; fall back to the whole body.
    raw_text = await self._page.evaluate(
        """() => {
            const main = document.querySelector('main');
            return main ? main.innerText : document.body.innerText;
        }"""
    )
    page_text = strip_linkedin_noise(raw_text) if raw_text else ""

    # Structured listing extraction is best-effort: a failure here should
    # not discard the page text we already have.
    listings: list[dict[str, str]] = []
    try:
        listings = await self._extract_job_listings()
    except Exception as exc:
        logger.warning("Failed to extract job listings from %s: %s", url, exc)

    return page_text, listings

async def search_jobs(
    self,
    keywords: str,
    location: str | None = None,
    max_pages: int = 3,
) -> dict[str, Any]:
    """Search for jobs and extract results across multiple pages.

    Args:
        keywords: Search keywords.
        location: Optional location filter.
        max_pages: Number of search result pages to scrape (1-100, default 3).
            Each page has ~10 results. Higher values take longer and
            increase the risk of rate limiting.

    Returns:
        {url, sections: {name: text}, job_listings: [{job_id, title}],
        pages_visited, sections_requested}
    """
    # Clamp to the documented 1-100 range.
    max_pages = max(1, min(max_pages, 100))

    base_params = f"keywords={quote_plus(keywords)}"
    if location:
        base_params += f"&location={quote_plus(location)}"

    all_listings: list[dict[str, str]] = []
    seen_ids: set[str] = set()
    all_text_parts: list[str] = []
    pages_visited: list[str] = []
    start = 0

    for page_num in range(max_pages):
        params = base_params if start == 0 else f"{base_params}&start={start}"
        url = f"https://www.linkedin.com/jobs/search/?{params}"

        try:
            text, listings = await self._extract_job_page(url)
        except LinkedInScraperException:
            # Rate limiting / auth failures must propagate to the caller.
            raise
        except Exception as e:
            logger.warning("Failed to load job search page %d: %s", page_num + 1, e)
            break

        pages_visited.append(url)
        if text:
            all_text_parts.append(text)

        # Advance offset by the number of listings found on this page;
        # fall back to LinkedIn's page size if none were parsed.
        start += len(listings) if listings else 25

        # Deduplicate across pages.
        new_on_page = 0
        for listing in listings:
            if listing["job_id"] not in seen_ids:
                seen_ids.add(listing["job_id"])
                all_listings.append(listing)
                new_on_page += 1

        logger.info(
            "Page %d: found %d listings (%d new, %d total)",
            page_num + 1,
            len(listings),
            new_on_page,
            len(all_listings),
        )

        # Stop early if this page returned nothing new.
        if new_on_page == 0:
            logger.info(
                "No new listings on page %d, stopping pagination", page_num + 1
            )
            break

        # Delay between pages to avoid rate limiting.
        if page_num < max_pages - 1:
            await asyncio.sleep(_NAV_DELAY)

    sections: dict[str, str] = {}
    if all_text_parts:
        sections["search_results"] = "\n\n---\n\n".join(all_text_parts)

    first_url = f"https://www.linkedin.com/jobs/search/?{base_params}"
    return {
        "url": first_url,
        "sections": sections,
        "job_listings": all_listings,
        "pages_visited": pages_visited,
        "sections_requested": ["search_results"],
    }

Expand Down
13 changes: 9 additions & 4 deletions linkedin_mcp_server/tools/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ async def search_jobs(
keywords: str,
ctx: Context,
location: str | None = None,
max_pages: int = 3,
) -> dict[str, Any]:
"""
Search for jobs on LinkedIn.
Expand All @@ -84,18 +85,22 @@ async def search_jobs(
keywords: Search keywords (e.g., "software engineer", "data scientist")
ctx: FastMCP context for progress reporting
location: Optional location filter (e.g., "San Francisco", "Remote")
max_pages: Number of result pages to scrape (1-100, default 3).
Each page returns ~10 jobs. Higher values are slower
and increase rate-limit risk.

Returns:
Dict with url, sections (name -> raw text), pages_visited, and sections_requested.
The LLM should parse the raw text to extract job listings.
Dict with url, sections (name -> raw text), job_listings (list of
{job_id, title} dicts), pages_visited, and sections_requested.
"""
try:
await ensure_authenticated()

logger.info(
"Searching jobs: keywords='%s', location='%s'",
"Searching jobs: keywords='%s', location='%s', max_pages=%d",
keywords,
location,
max_pages,
)

browser = await get_or_create_browser()
Expand All @@ -105,7 +110,7 @@ async def search_jobs(
progress=0, total=100, message="Starting job search"
)

result = await extractor.search_jobs(keywords, location)
result = await extractor.search_jobs(keywords, location, max_pages)

await ctx.report_progress(progress=100, total=100, message="Complete")

Expand Down
Loading