edgi-govdata-archiving · Mr0grog · Jun 17, 2025 · Jun 17, 2025 · Jun 17, 2025
diff --git a/analyst_sheets/normalize.py b/analyst_sheets/normalize.py
@@ -11,17 +11,29 @@
 import re
 import soupsieve
 from surt import surt
+import unicodedata
 from urllib.parse import parse_qs, urlencode, urljoin
 
 
 INFIX_PUNCTUATION = re.compile(r'''['‘’]''')
 NON_WORDS = re.compile(r'\W+')
 
 
+def remove_accents(text: str) -> str:
+    """
+    Remove accents/combining marks from ``text``; the result is NFKC-normalized.
+    """
+    # Decompose with NFKD (not NFD) so compatibility characters such as 'Ǆ' or
+    # '´' expose their combining marks too; NFD left those accents in place.
+    decomposed = unicodedata.normalize('NFKD', text)
+    clean = ''.join(c for c in decomposed if not unicodedata.combining(c))
+    return unicodedata.normalize('NFKC', clean)
+
+
 def normalize_text(text):
     """
     Normalize a chunk of text from an HTML document by casefolding and removing
-    punctuation.
+    punctuation and accents/combining marks.
     """
     # Remove punctuation between words
     return NON_WORDS.sub(
@@ -30,7 +42,7 @@ def normalize_text(text):
         INFIX_PUNCTUATION.sub(
             '',
             # Lower-case and normalize unicode characters
-            text.casefold()
+            remove_accents(text).casefold()
         )
     )
 

diff --git a/analyst_sheets/terms.py b/analyst_sheets/terms.py
@@ -1,3 +1,5 @@
+from .normalize import normalize_text
+
 KEY_TERMS = (
     'adaptation',
     'agency mission',
@@ -85,6 +87,14 @@
     'underserved',
     'water quality',
     'wildfires',
+
+    # Español
+    'clima',
+    'climática',
+    'climático',
 )
 
+# Normalize key terms like our text will be; de-dupe keeping declared order.
+KEY_TERMS = list(dict.fromkeys(normalize_text(term) for term in KEY_TERMS))
+
 KEY_TERM_GRAMS = max((len(term.split(' ')) for term in KEY_TERMS))