Skip to content

Commit 8ca3858

Browse files
authored
Merge pull request #3 from etiennec78/attributes_filter
Added customizable filters to standardize events data
2 parents d3bbbf4 + 1c082bb commit 8ca3858

5 files changed

Lines changed: 427 additions & 38 deletions

File tree

README.md

Lines changed: 95 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,25 @@
1-
# Celcat Calendar Scraper
1+
# Celcat Calendar Scraper 📆
22

33
An asynchronous Python library for scraping Celcat calendar systems.
44

5-
## Installation
5+
## Installation 🚀
66

77
```sh
88
pip install celcat-scraper
99
```
1010

11-
## Usage
11+
## Features 🌟
12+
13+
* Event attributes filtering 🔎
14+
* Async/await support for better performance 🔀
15+
* Rate limiting with adaptive backoff ⏳
16+
* Optional caching support 💾
17+
* Optional reusable aiohttp session ♻️
18+
* Automatic session management 🍪
19+
* Batch processing of events 📦
20+
* Error handling and retries 🚨
21+
22+
## Usage ⚙️
1223

1324
Basic example of retrieving calendar events:
1425

@@ -23,42 +34,107 @@ async def main():
2334
url="https://university.com/calendar",
2435
username="your_username",
2536
password="your_password",
26-
include_holidays=True
37+
include_holidays=True,
2738
)
2839

2940
# Create scraper instance and get events
3041
async with CelcatScraperAsync(config) as scraper:
31-
3242
start_date = date.today()
3343
end_date = start_date + timedelta(days=30)
34-
44+
3545
# Recommended to store events locally and reduce the number of requests
36-
file_path = 'store.json'
46+
file_path = "store.json"
3747
events = scraper.deserialize_events(file_path)
38-
39-
events = await scraper.get_calendar_events(start_date, end_date, previous_events=events)
40-
48+
49+
events = await scraper.get_calendar_events(
50+
start_date, end_date, previous_events=events
51+
)
52+
4153
for event in events:
4254
print(f"Event {event['id']}")
4355
print(f"Course: {event['category']} - {event['course']}")
4456
print(f"Time: {event['start']} to {event['end']}")
4557
print(f"Location: {', '.join(event['rooms'])} at {', '.join(event['sites'])} - {event['department']}")
4658
print(f"Professors: {', '.join(event['professors'])}")
4759
print("---")
48-
60+
4961
# Save events for a future refresh
5062
scraper.serialize_events(events, file_path)
5163

5264
if __name__ == "__main__":
5365
asyncio.run(main())
5466
```
5567

56-
## Features
68+
## Filtering 🔍
69+
70+
Celcat Calendar data is often messy, and needs to be processed before it can be used.
71+
For example, the same course may have several different names in different events.
72+
Filtering allows these attributes to be standardized.
73+
74+
### Usage ⚙️
75+
76+
> ℹ️ **Info**: Each filter argument is optional. When `course_strip_redundant` is enabled, passing `course_remembered_strips` is recommended.
5777
58-
* Async/await support for better performance
59-
* Rate limiting with adaptive backoff
60-
* Optional caching support
61-
* Optional reusable aiohttp session
62-
* Automatic session management
63-
* Batch processing of events
64-
* Error handling and retries
78+
> ⚠️ **Warning**: To undo the changes made by a filter after disabling it, you will need to reset your previously stored events and refetch them.
79+
80+
```python
81+
import asyncio
82+
from datetime import date, timedelta
83+
import json
84+
from celcat_scraper import CelcatFilterConfig, CelcatConfig, CelcatScraperAsync
85+
86+
async def main():
87+
# Load remembered_strips from a file
88+
remembered_strips = []
89+
try:
90+
with open("remembered_strips.json", "r") as f:
91+
remembered_strips = json.load(f)
92+
except (FileNotFoundError, json.JSONDecodeError):
93+
remembered_strips = []
94+
95+
# Create a mapping of manual course replacements
96+
course_replacements = {"English - S2": "English", "Mathematics": "Maths"}
97+
98+
# Configure a filter
99+
celcat_filter = CelcatFilterConfig(
100+
course_title=True,
101+
course_strip_modules=True,
102+
course_strip_category=True,
103+
course_strip_punctuation=True,
104+
course_group_similar=True,
105+
course_strip_redundant=True,
106+
course_remembered_strips=remembered_strips,
107+
course_replacements=course_replacements,
108+
professors_title=True,
109+
rooms_title=True,
110+
rooms_strip_after_number=False,
111+
sites_title=True,
112+
)
113+
114+
config = CelcatConfig(
115+
url="https://university.com/calendar",
116+
username="your_username",
117+
password="your_password",
118+
include_holidays=True,
119+
# Pass the filter as an argument
120+
custom_filter=celcat_filter,
121+
)
122+
123+
async with CelcatScraperAsync(config) as scraper:
124+
start_date = date.today()
125+
end_date = start_date + timedelta(days=30)
126+
127+
events = scraper.deserialize_events("store.json")
128+
events = await scraper.get_calendar_events(
129+
start_date, end_date, previous_events=events
130+
)
131+
132+
scraper.serialize_events(events, "store.json")
133+
134+
# Save the updated remembered_strips back to file
135+
with open("remembered_strips.json", "w") as f:
136+
json.dump(scraper.filter_config.course_remembered_strips, f)
137+
138+
if __name__ == "__main__":
139+
asyncio.run(main())
140+
```

celcat_scraper/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@
33
This package provides a complete interface for interacting with Celcat Calendar.
44
"""
55

6-
from .config import CelcatConfig, CelcatConstants
6+
from .config import CelcatConfig, CelcatFilterConfig, CelcatConstants
77
from .exceptions import CelcatError, CelcatCannotConnectError, CelcatInvalidAuthError
88
from .scraper import CelcatScraperAsync
99
from .types import EventData
1010

1111
__all__ = [
1212
"CelcatConfig",
13+
"CelcatFilterConfig",
1314
"CelcatConstants",
1415
"CelcatScraperAsync",
1516
"EventData",

celcat_scraper/config.py

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
the behavior of the Celcat scraper.
55
"""
66

7-
from dataclasses import dataclass
8-
from typing import Optional
7+
from dataclasses import dataclass, field
8+
from typing import Optional, Dict, List
99

1010
from aiohttp import ClientSession
1111

@@ -21,6 +21,39 @@ class CelcatConstants:
2121
CONNECTION_KEEP_ALIVE = 120
2222

2323

24+
@dataclass
25+
class CelcatFilterConfig:
26+
"""Configuration for Celcat data filter.
27+
28+
Attributes:
29+
course_title: Whether to convert course names to title case
30+
course_strip_modules: Whether to remove module codes from course names
31+
course_strip_category: Whether to remove category prefixes from course names
32+
course_strip_punctuation: Whether to remove punctuation from course names
33+
course_group_similar: Whether to group similar course names together
34+
course_strip_redundant: Whether to remove redundant elements found across multiple events
35+
course_remembered_strips: List of previously stripped strings to be reapplied in subsequent filter instances
36+
course_replacements: Dictionary of strings to replace in course names
37+
professors_title: Whether to convert professor names to title case
38+
rooms_title: Whether to convert room names to title case
39+
rooms_strip_after_number: Whether to remove text after room numbers
40+
sites_title: Whether to convert site names to title case
41+
"""
42+
43+
course_title: bool = True
44+
course_strip_modules: bool = True
45+
course_strip_category: bool = True
46+
course_strip_punctuation: bool = False
47+
course_group_similar: bool = False
48+
course_strip_redundant: bool = False
49+
course_remembered_strips: Optional[List[str]] = field(default_factory=list)
50+
course_replacements: Optional[Dict[str, str]] = field(default_factory=dict)
51+
professors_title: bool = True
52+
rooms_title: bool = True
53+
rooms_strip_after_number: bool = False
54+
sites_title: bool = True
55+
56+
2457
@dataclass
2558
class CelcatConfig:
2659
"""Configuration for Celcat scraper.
@@ -37,6 +70,7 @@ class CelcatConfig:
3770
url: str
3871
username: str
3972
password: str
73+
custom_filter: Optional[CelcatFilterConfig] = None
4074
include_holidays: bool = True
4175
rate_limit: float = 0.5
4276
session: Optional[ClientSession] = None

0 commit comments

Comments
 (0)