etiennec78
diff --git a/‎README.md‎
Lines changed: 95 additions & 19 deletions b/‎README.md‎
Lines changed: 95 additions & 19 deletions
diff --git a/‎celcat_scraper/__init__.py‎
Lines changed: 2 additions & 1 deletion b/‎celcat_scraper/__init__.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎celcat_scraper/config.py‎
Lines changed: 36 additions & 2 deletions b/‎celcat_scraper/config.py‎
Lines changed: 36 additions & 2 deletions
@@ -1,14 +1,25 @@
-# Celcat Calendar Scraper
+# Celcat Calendar Scraper 📆
 
 An asynchronous Python library for scraping Celcat calendar systems.
 
-## Installation
+## Installation 🚀
 
 ```sh
 pip install celcat-scraper
 ```
 
-## Usage
+## Features 🌟
+
+* Event attributes filtering 🔎
+* Async/await support for better performance 🔀
+* Rate limiting with adaptive backoff ⏳
+* Optional caching support 💾
+* Optional reusable aiohttp session ♻️
+* Automatic session management 🍪
+* Batch processing of events 📦
+* Error handling and retries 🚨
+
+## Usage ⚙️
 
 Basic example of retrieving calendar events:
 
@@ -23,42 +34,107 @@ async def main():
         url="https://university.com/calendar",
         username="your_username",
         password="your_password",
-        include_holidays=True
+        include_holidays=True,
     )
 
     # Create scraper instance and get events
     async with CelcatScraperAsync(config) as scraper:
-
         start_date = date.today()
         end_date = start_date + timedelta(days=30)
-        
+
         # Recommended to store events locally and reduce the number of requests
-        file_path = 'store.json'
+        file_path = "store.json"
         events = scraper.deserialize_events(file_path)
-        
-        events = await scraper.get_calendar_events(start_date, end_date, previous_events=events)
-        
+
+        events = await scraper.get_calendar_events(
+            start_date, end_date, previous_events=events
+        )
+
         for event in events:
             print(f"Event {event['id']}")
             print(f"Course: {event['category']} - {event['course']}")
             print(f"Time: {event['start']} to {event['end']}")
             print(f"Location: {', '.join(event['rooms'])} at {', '.join(event['sites'])} - {event['department']}")
             print(f"Professors: {', '.join(event['professors'])}")
             print("---")
-        
+
         # Save events for a future refresh
         scraper.serialize_events(events, file_path)
 
 if __name__ == "__main__":
     asyncio.run(main())
 ```
 
-## Features
+## Filtering 🔍
+
+Celcat Calendar data is often messy, and needs to be processed before it can be used.
+For example, the same course may have several different names in different events.
+Filtering allows these attributes to be standardized.
+
+### Usage ⚙️
+
+> ℹ️ **Info**: Each filter argument is optional. When `course_strip_redundant` is enabled, using `course_remembered_strips` is recommended.
 
-* Async/await support for better performance
-* Rate limiting with adaptive backoff
-* Optional caching support
-* Optional reusable aiohttp session
-* Automatic session management
-* Batch processing of events
-* Error handling and retries
+> ⚠️ **Warning**: Disabling filters will require you to reset your previous events and refetch to undo changes.
+
+```python
+import asyncio
+from datetime import date, timedelta
+import json
+from celcat_scraper import CelcatFilterConfig, CelcatConfig, CelcatScraperAsync
+
+async def main():
+    # Load remembered_strips from a file
+    remembered_strips = []
+    try:
+        with open("remembered_strips.json", "r") as f:
+            remembered_strips = json.load(f)
+    except (FileNotFoundError, json.JSONDecodeError):
+        remembered_strips = []
+
+    # Create a dictionary of manual course replacements
+    course_replacements = {"English - S2": "English", "Mathematics": "Maths"}
+
+    # Configure a filter
+    celcat_filter = CelcatFilterConfig(
+        course_title=True,
+        course_strip_modules=True,
+        course_strip_category=True,
+        course_strip_punctuation=True,
+        course_group_similar=True,
+        course_strip_redundant=True,
+        course_remembered_strips=remembered_strips,
+        course_replacements=course_replacements,
+        professors_title=True,
+        rooms_title=True,
+        rooms_strip_after_number=False,
+        sites_title=True,
+    )
+
+    config = CelcatConfig(
+        url="https://university.com/calendar",
+        username="your_username",
+        password="your_password",
+        include_holidays=True,
+        # Pass the filter as an argument
+        custom_filter=celcat_filter,
+    )
+
+    async with CelcatScraperAsync(config) as scraper:
+        start_date = date.today()
+        end_date = start_date + timedelta(days=30)
+
+        events = scraper.deserialize_events("store.json")
+        events = await scraper.get_calendar_events(
+            start_date, end_date, previous_events=events
+        )
+
+        scraper.serialize_events(events, "store.json")
+
+    # Save the updated remembered_strips back to file
+    with open("remembered_strips.json", "w") as f:
+        json.dump(scraper.filter_config.course_remembered_strips, f)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
@@ -3,13 +3,14 @@
 This package provides a complete interface for interacting with Celcat Calendar.
 """
 
-from .config import CelcatConfig, CelcatConstants
+from .config import CelcatConfig, CelcatFilterConfig, CelcatConstants
 from .exceptions import CelcatError, CelcatCannotConnectError, CelcatInvalidAuthError
 from .scraper import CelcatScraperAsync
 from .types import EventData
 
 __all__ = [
     "CelcatConfig",
+    "CelcatFilterConfig",
     "CelcatConstants",
     "CelcatScraperAsync",
     "EventData",
 
@@ -4,8 +4,8 @@
 the behavior of the Celcat scraper.
 """
 
-from dataclasses import dataclass
-from typing import Optional
+from dataclasses import dataclass, field
+from typing import Optional, Dict, List
 
 from aiohttp import ClientSession
 
@@ -21,6 +21,39 @@ class CelcatConstants:
     CONNECTION_KEEP_ALIVE = 120
 
 
+@dataclass
+class CelcatFilterConfig:
+    """Configuration for Celcat data filter.
+
+    Attributes:
+        course_title: Whether to convert course names to title case
+        course_strip_modules: Whether to remove module codes from course names
+        course_strip_category: Whether to remove category prefixes from course names
+        course_strip_punctuation: Whether to remove punctuation from course names
+        course_group_similar: Whether to group similar course names together
+        course_strip_redundant: Whether to remove redundant elements found across multiple events
+        course_remembered_strips: List of previously stripped strings to be reapplied in subsequent filter instances
+        course_replacements: Dictionary of strings to replace in course names
+        professors_title: Whether to convert professor names to title case
+        rooms_title: Whether to convert room names to title case
+        rooms_strip_after_number: Whether to remove text after room numbers
+        sites_title: Whether to convert site names to title case
+    """
+
+    course_title: bool = True
+    course_strip_modules: bool = True
+    course_strip_category: bool = True
+    course_strip_punctuation: bool = False
+    course_group_similar: bool = False
+    course_strip_redundant: bool = False
+    course_remembered_strips: Optional[List[str]] = field(default_factory=list)
+    course_replacements: Optional[Dict[str, str]] = field(default_factory=dict)
+    professors_title: bool = True
+    rooms_title: bool = True
+    rooms_strip_after_number: bool = False
+    sites_title: bool = True
+
+
 @dataclass
 class CelcatConfig:
     """Configuration for Celcat scraper.
@@ -37,6 +70,7 @@ class CelcatConfig:
     url: str
     username: str
     password: str
+    custom_filter: Optional[CelcatFilterConfig] = None
     include_holidays: bool = True
     rate_limit: float = 0.5
     session: Optional[ClientSession] = None