Greatly improve Reddit subdomain handling (#422)

trag1c · web-flow · commit c8f656c94b8c · 2025-10-24T18:31:34.000+02:00
diff --git a/app/components/fixup_embeds.py b/app/components/fixup_embeds.py
@@ -20,10 +20,42 @@
 
     from app.bot import GhosttyBot
 
-type SiteTransformation = tuple[re.Pattern[str], Callable[[re.Match[str]], str]]
+type SiteTransformation = tuple[re.Pattern[str], Callable[[re.Match[str]], str | None]]
+
+
+def _reddit_transformer(match: re.Match[str]) -> str | None:
+    # Reddit supports `foo.reddit.com` as an alias for `reddit.com/r/foo`, but Rxddit
+    # does not. However, Reddit also has a *bunch* of random subdomains. Rxddit handles
+    # the skins (old.reddit.com and new.reddit.com) properly, so those are appended to
+    # the URL. Apparently there's also a subdomain for every two-letter sequence, with
+    # some being language codes and others being unused, which Rxddit doesn't handle, so
+    # they're simply dropped by the regex below.
+
+    # Post links have either a subdomain (representing the subreddit) or a subreddit, so
+    # ignore everything else.
+    if bool(match["subdomain"]) == bool(match["subreddit"]):
+        return None
+
+    skin = f"{s}." if (s := match["skin"]) else ""
+    if subreddit := match["subreddit"]:
+        # https://reddit.com/r///foo/comments/bar works apparently, but Rxddit doesn't
+        # support it. Honestly don't blame them.
+        subreddit = "r/" + subreddit.removeprefix("r").strip("/")
+    else:
+        # Append the subdomain as a subreddit if we don't already have one.
+        subreddit = f"r/{match['subdomain']}"
+    return f"https://{skin}rxddit.com/{subreddit}/{match['post']}"
+
 
 VALID_URI_CHARS = r"[A-Za-z0-9-._~:/?#\[\]@!$&'()*+,;%=]"
 EMBED_SITES: tuple[SiteTransformation, ...] = (
+    (
+        re.compile(
+            r"https://(?:(?:www|(?P<skin>old|new)|\w\w|(?P<subdomain>[A-Za-z0-9_]+))\.)?reddit\.com/+"
+            rf"(?P<subreddit>r/+[A-Za-z0-9_]+/+)?(?P<post>{VALID_URI_CHARS}+)"
+        ),
+        _reddit_transformer,
+    ),
     (
         re.compile(
             r"https://(?:www\.)?(?P<site>x|twitter)\.com/"
@@ -39,17 +71,6 @@
         ),
         lambda match: f"https://phixiv.net/{match[1]}",
     ),
-    (
-        re.compile(
-            r"https://(?:(?:www|(?P<subreddit>\w+))\.)?reddit\.com/"
-            rf"(?P<post>{VALID_URI_CHARS}+)"
-        ),
-        # Reddit supports `foo.reddit.com` as an alias for `reddit.com/r/foo`, but
-        # Rxddit does not.
-        lambda match: "https://rxddit.com/"
-        + (f"r/{subreddit}/" if (subreddit := match["subreddit"]) else "")
-        + match["post"],
-    ),
 )
 IGNORED_LINK = re.compile(rf"\<https://{VALID_URI_CHARS}+\>")
 
@@ -68,12 +89,12 @@ def __init__(self, bot: GhosttyBot) -> None:
         FixUpActions.linker = self.linker
 
     async def process(self, message: dc.Message) -> ProcessedMessage:
-        matches: list[str] = []
+        matches: list[str | None] = []
         message_content = IGNORED_LINK.sub("", message.content)
         for pattern, transformer in EMBED_SITES:
             matches.extend(map(transformer, pattern.finditer(message_content)))
 
-        links = list(dict.fromkeys(matches))
+        links = list(filter(None, dict.fromkeys(matches)))
         omitted = False
         if len(links) > 5:
             omitted = True