Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 98 additions & 19 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,25 @@
# Celcat Calendar Scraper
# Celcat Calendar Scraper 📆

An asynchronous Python library for scraping Celcat calendar systems.

## Installation
## Installation 🚀

```sh
pip install celcat-scraper
```

## Usage
## Features 🌟

* Event attributes filtering 🔎
* Async/await support for better performance 🔀
* Rate limiting with adaptive backoff ⏳
* Optional caching support 💾
* Optional reusable aiohttp session ♻️
* Automatic session management 🍪
* Batch processing of events 📦
* Error handling and retries 🚨

## Usage ⚙️

Basic example of retrieving calendar events:

Expand All @@ -23,42 +34,110 @@ async def main():
url="https://university.com/calendar",
username="your_username",
password="your_password",
include_holidays=True
include_holidays=True,
)

# Create scraper instance and get events
async with CelcatScraperAsync(config) as scraper:

start_date = date.today()
end_date = start_date + timedelta(days=30)

        # Recommended to store events locally and reduce the amount of requests
file_path = 'store.json'
file_path = "store.json"
events = scraper.deserialize_events(file_path)

events = await scraper.get_calendar_events(start_date, end_date, previous_events=events)


events = await scraper.get_calendar_events(
start_date, end_date, previous_events=events
)

for event in events:
print(f"Event {event['id']}")
print(f"Course: {event['category']} - {event['course']}")
print(f"Time: {event['start']} to {event['end']}")
print(f"Location: {', '.join(event['rooms'])} at {', '.join(event['sites'])} - {event['department']}")
print(f"Professors: {', '.join(event['professors'])}")
print("---")

# Save events for a future refresh
scraper.serialize_events(events, file_path)

if __name__ == "__main__":
asyncio.run(main())
```

## Features
## Filtering 🔍

Celcat Calendar data is often messy, and needs to be processed before it can be used.
For example, the same course may have several different names in different events.
Filtering allows these attributes to be standardized.

### Usage ⚙️

> ℹ️ **Info**: Each filter argument is optional. When course_strip_redundant is enabled, using remembered_strips is recommended.

* Async/await support for better performance
* Rate limiting with adaptive backoff
* Optional caching support
* Optional reusable aiohttp session
* Automatic session management
* Batch processing of events
* Error handling and retries
> ⚠️ **Warning**: Disabling filters will require you to reset your previous events and refetch to undo changes.

```python
import asyncio
from datetime import date, timedelta
import json
from celcat_scraper import CelcatFilterConfig, FilterType, CelcatConfig, CelcatScraperAsync

async def main():
# Load remembered_strips from a file
remembered_strips = []
try:
with open("remembered_strips.json", "r") as f:
remembered_strips = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
remembered_strips = []

# Create a list of manual course replacements
course_replacements = {"English - S2": "English", "Mathematics": "Maths"}

# Configure a filter
filter_config = CelcatFilterConfig(
        filters={
FilterType.COURSE_TITLE,
FilterType.COURSE_STRIP_MODULES,
FilterType.COURSE_STRIP_CATEGORY,
FilterType.COURSE_STRIP_PUNCTUATION,
FilterType.COURSE_GROUP_SIMILAR,
FilterType.COURSE_STRIP_REDUNDANT,
FilterType.PROFESSORS_TITLE,
FilterType.ROOMS_TITLE,
FilterType.ROOMS_STRIP_AFTER_NUMBER,
FilterType.SITES_TITLE,
FilterType.SITES_REMOVE_DUPLICATES,
        },
course_remembered_strips=remembered_strips,
course_replacements=course_replacements,
)

config = CelcatConfig(
url="https://university.com/calendar",
username="your_username",
password="your_password",
include_holidays=True,
# Pass the filter as an argument
filter_config=filter_config,
)

async with CelcatScraperAsync(config) as scraper:
start_date = date.today()
end_date = start_date + timedelta(days=30)

events = scraper.deserialize_events("store.json")
events = await scraper.get_calendar_events(
start_date, end_date, previous_events=events
)

        scraper.serialize_events(events, "store.json")

# Save the updated remembered_strips back to file
with open("remembered_strips.json", "w") as f:
json.dump(scraper.filter_config.course_remembered_strips, f)

if __name__ == "__main__":
asyncio.run(main())
```
5 changes: 4 additions & 1 deletion celcat_scraper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@

This package provides a complete interface for interacting with Celcat Calendar.
"""
from .config import CelcatConfig, CelcatConstants

from .config import CelcatConfig, CelcatFilterConfig, CelcatConstants, FilterType
from .exceptions import CelcatError, CelcatCannotConnectError, CelcatInvalidAuthError
from .scraper import CelcatScraperAsync
from .types import EventData

__all__ = [
"CelcatConfig",
"CelcatFilterConfig",
"CelcatConstants",
"FilterType",
"CelcatScraperAsync",
"EventData",
"CelcatError",
Expand Down
64 changes: 31 additions & 33 deletions celcat_scraper/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,26 +16,31 @@

_LOGGER = logging.getLogger(__name__)


class CelcatAPI:
"""Class for interacting with Celcat Calendar API."""

def __init__(self):
def __init__(self, config: CelcatConfig):
"""Initialize the Celcat API client."""
self.rate_limiter = RateLimiter(1/CelcatConfig.rate_limit)
self.rate_limiter = RateLimiter(config.rate_limit)
self.semaphore = asyncio.Semaphore(CelcatConstants.CONCURRENT_REQUESTS)
self.timeout = CelcatConstants.TIMEOUT

async def validate_response(self, response: ClientResponse, expected_type: str = None) -> Any:

async def validate_response(
self, response: ClientResponse, expected_type: str = None
) -> Any:
"""Validate server response and return appropriate data type."""
if response.status != 200:
error_text = await response.text(encoding='latin1')
error_text = await response.text(encoding="latin1")
raise CelcatCannotConnectError(
f"Server returned status {response.status}: {error_text[:200]}"
)

if expected_type == "json":
if "application/json" not in response.headers.get("Content-Type", ""):
raise CelcatCannotConnectError("Expected JSON response but got different content type")
raise CelcatCannotConnectError(
"Expected JSON response but got different content type"
)
return await response.json()

return await response.text()
Expand All @@ -50,7 +55,9 @@ async def handle_error_response(self, response: ClientResponse) -> None:
elif response.status == 429:
retry_after = int(response.headers.get("Retry-After", 30))
self.rate_limiter.increase_backoff()
raise CelcatCannotConnectError(f"Rate limited. Retry after {retry_after} seconds")
raise CelcatCannotConnectError(
f"Rate limited. Retry after {retry_after} seconds"
)
else:
raise CelcatCannotConnectError(f"HTTP {response.status}: {error_msg}")

Expand All @@ -60,7 +67,7 @@ async def get_calendar_raw_data(
url: str,
federation_ids: str,
start_date: date,
end_date: date
end_date: date,
) -> List[Dict[str, Any]]:
"""Fetch raw calendar data for given time period."""
_LOGGER.info("Getting calendar raw data")
Expand All @@ -73,38 +80,25 @@ async def get_calendar_raw_data(
"end": end_date.strftime("%Y-%m-%d"),
"resType": "104",
"calView": "month",
"federationIds[]": federation_ids
"federationIds[]": federation_ids,
}

url_calendar_data = url + "/Home/GetCalendarData"

return await self.fetch_with_retry(
session,
"POST",
"json",
url_calendar_data,
data=calendar_data
session, "POST", "json", url_calendar_data, data=calendar_data
)

async def get_side_bar_event_raw_data(
self,
session: ClientSession,
url: str,
event_id: str
self, session: ClientSession, url: str, event_id: str
) -> dict:
"""Fetch detailed event data by ID."""
sidebar_data = {
"eventid": event_id
}
sidebar_data = {"eventid": event_id}

url_sidebar_data = url + "/Home/GetSideBarEvent"

return await self.fetch_with_retry(
session,
"POST",
"json",
url_sidebar_data,
data=sidebar_data
session, "POST", "json", url_sidebar_data, data=sidebar_data
)

async def fetch_with_retry(
Expand All @@ -113,7 +107,7 @@ async def fetch_with_retry(
method: str,
expected_type: str,
url: str,
**kwargs
**kwargs,
) -> Any:
"""Make HTTP requests with retry logic."""
await self.rate_limiter.acquire()
Expand All @@ -126,12 +120,14 @@ async def fetch_with_retry(
async with session.request(method, url, **kwargs) as response:
if response.status == 200:
content_type = response.headers.get("Content-Type", "")

if expected_type == "json":
if "application/json" in content_type:
data = await response.json()
else:
raise CelcatCannotConnectError(f"Expected JSON response but got different content type: {content_type}")
raise CelcatCannotConnectError(
f"Expected JSON response but got different content type: {content_type}"
)
else:
data = await response.text()

Expand All @@ -143,5 +139,7 @@ async def fetch_with_retry(
except ClientError as exc:
self.rate_limiter.increase_backoff()
if attempt == CelcatConstants.MAX_RETRIES - 1:
raise CelcatCannotConnectError(f"Failed after {CelcatConstants.MAX_RETRIES} attempts") from exc
await asyncio.sleep(min(2 ** attempt, 10))
raise CelcatCannotConnectError(
f"Failed after {CelcatConstants.MAX_RETRIES} attempts"
) from exc
await asyncio.sleep(min(2**attempt, 10))
37 changes: 23 additions & 14 deletions celcat_scraper/auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,9 @@

_LOGGER = logging.getLogger(__name__)


async def authenticate(
session: ClientSession,
url: str,
username: str,
password: str
session: ClientSession, url: str, username: str, password: str
) -> Tuple[bool, Optional[str]]:
"""Authenticate to Celcat.

Expand Down Expand Up @@ -57,13 +55,13 @@ async def authenticate(
login_data = {
"Name": username,
"Password": password,
"__RequestVerificationToken": token_element["value"]
"__RequestVerificationToken": token_element["value"],
}

async with session.post(
f"{url}/LdapLogin/Logon",
data=login_data,
headers={"Content-Type": "application/x-www-form-urlencoded"}
headers={"Content-Type": "application/x-www-form-urlencoded"},
) as response:
if response.status != 200:
error_text = await response.text(encoding="latin1")
Expand All @@ -80,7 +78,9 @@ async def authenticate(
raise CelcatCannotConnectError("Failed to connect to Celcat service") from exc


def _process_login_response(response_url, page_content: str) -> Tuple[bool, Optional[str]]:
def _process_login_response(
response_url, page_content: str
) -> Tuple[bool, Optional[str]]:
"""Process login response and extract federation IDs.

Returns:
Expand All @@ -96,20 +96,29 @@ def _process_login_response(response_url, page_content: str) -> Tuple[bool, Opti

if login_button_state == "Log Out":
federation_ids = next(
(param.split("=")[1] for param in str(response_url).split("&")
if param.startswith("FederationIds=")),
None
(
param.split("=")[1]
for param in str(response_url).split("&")
if param.startswith("FederationIds=")
),
None,
)

if federation_ids is None:
_LOGGER.debug("FederationIds could not be retrieved. Trying to extract from page")
_LOGGER.debug(
"FederationIds could not be retrieved. Trying to extract from page"
)
extracted = soup.find("span", class_="small")
if extracted:
federation_ids = extracted.text.lstrip('-').strip()
federation_ids = extracted.text.lstrip("-").strip()
if not federation_ids.isdigit():
raise CelcatCannotConnectError(f"Federation ids could not be extracted from '{federation_ids}'")
raise CelcatCannotConnectError(
f"Federation ids could not be extracted from '{federation_ids}'"
)
else:
raise CelcatCannotConnectError("Federation ids class could not be found")
raise CelcatCannotConnectError(
"Federation ids class could not be found"
)

_LOGGER.debug("Successfully logged in to Celcat")
return True, federation_ids
Expand Down
Loading