Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 98 additions & 19 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,25 @@
# Celcat Calendar Scraper
# Celcat Calendar Scraper 📆

An asynchronous Python library for scraping Celcat calendar systems.

## Installation
## Installation 🚀

```sh
pip install celcat-scraper
```

## Usage
## Features 🌟

* Event attributes filtering 🔎
* Async/await support for better performance 🔀
* Rate limiting with adaptive backoff ⏳
* Optional caching support 💾
* Optional reusable aiohttp session ♻️
* Automatic session management 🍪
* Batch processing of events 📦
* Error handling and retries 🚨

## Usage ⚙️

Basic example of retrieving calendar events:

Expand All @@ -23,42 +34,110 @@ async def main():
url="https://university.com/calendar",
username="your_username",
password="your_password",
include_holidays=True
include_holidays=True,
)

# Create scraper instance and get events
async with CelcatScraperAsync(config) as scraper:

start_date = date.today()
end_date = start_date + timedelta(days=30)

        # Recommended to store events locally and reduce the amount of requests
file_path = 'store.json'
file_path = "store.json"
events = scraper.deserialize_events(file_path)

events = await scraper.get_calendar_events(start_date, end_date, previous_events=events)


events = await scraper.get_calendar_events(
start_date, end_date, previous_events=events
)

for event in events:
print(f"Event {event['id']}")
print(f"Course: {event['category']} - {event['course']}")
print(f"Time: {event['start']} to {event['end']}")
print(f"Location: {', '.join(event['rooms'])} at {', '.join(event['sites'])} - {event['department']}")
print(f"Professors: {', '.join(event['professors'])}")
print("---")

# Save events for a future refresh
scraper.serialize_events(events, file_path)

if __name__ == "__main__":
asyncio.run(main())
```

## Features
## Filtering 🔍

Celcat Calendar data is often messy, and needs to be processed before it can be used.
For example, the same course may have several different names in different events.
Filtering allows these attributes to be standardized.

### Usage ⚙️

> ℹ️ **Info**: Each filter argument is optional. When course_strip_redundant is enabled, using remembered_strips is recommended.

* Async/await support for better performance
* Rate limiting with adaptive backoff
* Optional caching support
* Optional reusable aiohttp session
* Automatic session management
* Batch processing of events
* Error handling and retries
> ⚠️ **Warning**: Disabling filters will require you to reset your previous events and refetch to undo changes.

```python
import asyncio
from datetime import date, timedelta
import json
from celcat_scraper import CelcatFilterConfig, FilterType, CelcatConfig, CelcatScraperAsync

async def main():
# Load remembered_strips from a file
remembered_strips = []
try:
with open("remembered_strips.json", "r") as f:
remembered_strips = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
remembered_strips = []

# Create a list of manual course replacements
course_replacements = {"English - S2": "English", "Mathematics": "Maths"}

# Configure a filter
filter_config = CelcatFilterConfig(
        filters={
FilterType.COURSE_TITLE,
FilterType.COURSE_STRIP_MODULES,
FilterType.COURSE_STRIP_CATEGORY,
FilterType.COURSE_STRIP_PUNCTUATION,
FilterType.COURSE_GROUP_SIMILAR,
FilterType.COURSE_STRIP_REDUNDANT,
FilterType.PROFESSORS_TITLE,
FilterType.ROOMS_TITLE,
FilterType.ROOMS_STRIP_AFTER_NUMBER,
FilterType.SITES_TITLE,
FilterType.SITES_REMOVE_DUPLICATES,
        },
course_remembered_strips=remembered_strips,
course_replacements=course_replacements,
)

config = CelcatConfig(
url="https://university.com/calendar",
username="your_username",
password="your_password",
include_holidays=True,
# Pass the filter as an argument
filter_config=filter_config,
)

async with CelcatScraperAsync(config) as scraper:
start_date = date.today()
end_date = start_date + timedelta(days=30)

events = scraper.deserialize_events("store.json")
events = await scraper.get_calendar_events(
start_date, end_date, previous_events=events
)

        scraper.serialize_events(events, "store.json")

# Save the updated remembered_strips back to file
with open("remembered_strips.json", "w") as f:
json.dump(scraper.filter_config.course_remembered_strips, f)

if __name__ == "__main__":
asyncio.run(main())
```
5 changes: 4 additions & 1 deletion celcat_scraper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@

This package provides a complete interface for interacting with Celcat Calendar.
"""
from .config import CelcatConfig, CelcatConstants

from .config import CelcatConfig, CelcatFilterConfig, CelcatConstants, FilterType
from .exceptions import CelcatError, CelcatCannotConnectError, CelcatInvalidAuthError
from .scraper import CelcatScraperAsync
from .types import EventData

__all__ = [
"CelcatConfig",
"CelcatFilterConfig",
"CelcatConstants",
"FilterType",
"CelcatScraperAsync",
"EventData",
"CelcatError",
Expand Down
64 changes: 31 additions & 33 deletions celcat_scraper/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,26 +16,31 @@

_LOGGER = logging.getLogger(__name__)


class CelcatAPI:
"""Class for interacting with Celcat Calendar API."""

def __init__(self):
def __init__(self, config: CelcatConfig):
"""Initialize the Celcat API client."""
self.rate_limiter = RateLimiter(1/CelcatConfig.rate_limit)
self.rate_limiter = RateLimiter(config.rate_limit)
self.semaphore = asyncio.Semaphore(CelcatConstants.CONCURRENT_REQUESTS)
self.timeout = CelcatConstants.TIMEOUT

async def validate_response(self, response: ClientResponse, expected_type: str = None) -> Any:

async def validate_response(
self, response: ClientResponse, expected_type: str = None
) -> Any:
"""Validate server response and return appropriate data type."""
if response.status != 200:
error_text = await response.text(encoding='latin1')
error_text = await response.text(encoding="latin1")
raise CelcatCannotConnectError(
f"Server returned status {response.status}: {error_text[:200]}"
)

if expected_type == "json":
if "application/json" not in response.headers.get("Content-Type", ""):
raise CelcatCannotConnectError("Expected JSON response but got different content type")
raise CelcatCannotConnectError(
"Expected JSON response but got different content type"
)
return await response.json()

return await response.text()
Expand All @@ -50,7 +55,9 @@ async def handle_error_response(self, response: ClientResponse) -> None:
elif response.status == 429:
retry_after = int(response.headers.get("Retry-After", 30))
self.rate_limiter.increase_backoff()
raise CelcatCannotConnectError(f"Rate limited. Retry after {retry_after} seconds")
raise CelcatCannotConnectError(
f"Rate limited. Retry after {retry_after} seconds"
)
else:
raise CelcatCannotConnectError(f"HTTP {response.status}: {error_msg}")

Expand All @@ -60,7 +67,7 @@ async def get_calendar_raw_data(
url: str,
federation_ids: str,
start_date: date,
end_date: date
end_date: date,
) -> List[Dict[str, Any]]:
"""Fetch raw calendar data for given time period."""
_LOGGER.info("Getting calendar raw data")
Expand All @@ -73,38 +80,25 @@ async def get_calendar_raw_data(
"end": end_date.strftime("%Y-%m-%d"),
"resType": "104",
"calView": "month",
"federationIds[]": federation_ids
"federationIds[]": federation_ids,
}

url_calendar_data = url + "/Home/GetCalendarData"

return await self.fetch_with_retry(
session,
"POST",
"json",
url_calendar_data,
data=calendar_data
session, "POST", "json", url_calendar_data, data=calendar_data
)

async def get_side_bar_event_raw_data(
self,
session: ClientSession,
url: str,
event_id: str
self, session: ClientSession, url: str, event_id: str
) -> dict:
"""Fetch detailed event data by ID."""
sidebar_data = {
"eventid": event_id
}
sidebar_data = {"eventid": event_id}

url_sidebar_data = url + "/Home/GetSideBarEvent"

return await self.fetch_with_retry(
session,
"POST",
"json",
url_sidebar_data,
data=sidebar_data
session, "POST", "json", url_sidebar_data, data=sidebar_data
)

async def fetch_with_retry(
Expand All @@ -113,7 +107,7 @@ async def fetch_with_retry(
method: str,
expected_type: str,
url: str,
**kwargs
**kwargs,
) -> Any:
"""Make HTTP requests with retry logic."""
await self.rate_limiter.acquire()
Expand All @@ -126,12 +120,14 @@ async def fetch_with_retry(
async with session.request(method, url, **kwargs) as response:
if response.status == 200:
content_type = response.headers.get("Content-Type", "")

if expected_type == "json":
if "application/json" in content_type:
data = await response.json()
else:
raise CelcatCannotConnectError(f"Expected JSON response but got different content type: {content_type}")
raise CelcatCannotConnectError(
f"Expected JSON response but got different content type: {content_type}"
)
else:
data = await response.text()

Expand All @@ -143,5 +139,7 @@ async def fetch_with_retry(
except ClientError as exc:
self.rate_limiter.increase_backoff()
if attempt == CelcatConstants.MAX_RETRIES - 1:
raise CelcatCannotConnectError(f"Failed after {CelcatConstants.MAX_RETRIES} attempts") from exc
await asyncio.sleep(min(2 ** attempt, 10))
raise CelcatCannotConnectError(
f"Failed after {CelcatConstants.MAX_RETRIES} attempts"
) from exc
await asyncio.sleep(min(2**attempt, 10))
37 changes: 23 additions & 14 deletions celcat_scraper/auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,9 @@

_LOGGER = logging.getLogger(__name__)


async def authenticate(
session: ClientSession,
url: str,
username: str,
password: str
session: ClientSession, url: str, username: str, password: str
) -> Tuple[bool, Optional[str]]:
"""Authenticate to Celcat.

Expand Down Expand Up @@ -57,13 +55,13 @@ async def authenticate(
login_data = {
"Name": username,
"Password": password,
"__RequestVerificationToken": token_element["value"]
"__RequestVerificationToken": token_element["value"],
}

async with session.post(
f"{url}/LdapLogin/Logon",
data=login_data,
headers={"Content-Type": "application/x-www-form-urlencoded"}
headers={"Content-Type": "application/x-www-form-urlencoded"},
) as response:
if response.status != 200:
error_text = await response.text(encoding="latin1")
Expand All @@ -80,7 +78,9 @@ async def authenticate(
raise CelcatCannotConnectError("Failed to connect to Celcat service") from exc


def _process_login_response(response_url, page_content: str) -> Tuple[bool, Optional[str]]:
def _process_login_response(
response_url, page_content: str
) -> Tuple[bool, Optional[str]]:
"""Process login response and extract federation IDs.

Returns:
Expand All @@ -96,20 +96,29 @@ def _process_login_response(response_url, page_content: str) -> Tuple[bool, Opti

if login_button_state == "Log Out":
federation_ids = next(
(param.split("=")[1] for param in str(response_url).split("&")
if param.startswith("FederationIds=")),
None
(
param.split("=")[1]
for param in str(response_url).split("&")
if param.startswith("FederationIds=")
),
None,
)

if federation_ids is None:
_LOGGER.debug("FederationIds could not be retrieved. Trying to extract from page")
_LOGGER.debug(
"FederationIds could not be retrieved. Trying to extract from page"
)
extracted = soup.find("span", class_="small")
if extracted:
federation_ids = extracted.text.lstrip('-').strip()
federation_ids = extracted.text.lstrip("-").strip()
if not federation_ids.isdigit():
raise CelcatCannotConnectError(f"Federation ids could not be extracted from '{federation_ids}'")
raise CelcatCannotConnectError(
f"Federation ids could not be extracted from '{federation_ids}'"
)
else:
raise CelcatCannotConnectError("Federation ids class could not be found")
raise CelcatCannotConnectError(
"Federation ids class could not be found"
)

_LOGGER.debug("Successfully logged in to Celcat")
return True, federation_ids
Expand Down
Loading