Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions analyst_sheets/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,29 @@
import re
import soupsieve
from surt import surt
import unicodedata
from urllib.parse import parse_qs, urlencode, urljoin


INFIX_PUNCTUATION = re.compile(r'''['‘’]''')
NON_WORDS = re.compile(r'\W+')


def remove_accents(text: str) -> str:
    """
    Remove all the accent characters in a unicode string.

    The text is fully decomposed, every combining mark is dropped, and the
    remainder is recomposed to NFKC.

    Parameters
    ----------
    text
        The string to strip accents from.

    Returns
    -------
    str
        ``text`` in NFKC form with combining marks removed, e.g.
        ``'climática'`` becomes ``'climatica'``.
    """
    # Decompose with NFKD rather than NFD: some characters only expose their
    # accent through a *compatibility* decomposition (e.g. U+01C4 'DŽ' maps to
    # 'D' + 'Ž'). With NFD those pass through untouched and the final NFKC
    # step would reintroduce an accented character.
    decomposed = unicodedata.normalize('NFKD', text)
    clean = ''.join(c for c in decomposed if not unicodedata.combining(c))
    # Recompose anything that was split apart but is not an accent (e.g.
    # Hangul syllables decomposed into jamo).
    return unicodedata.normalize('NFKC', clean)


def normalize_text(text):
"""
Normalize a chunk of text from an HTML document by casefolding and removing
punctuation.
punctuation and accents/combining marks.
"""
# Remove punctuation between words
return NON_WORDS.sub(
Expand All @@ -30,7 +42,7 @@ def normalize_text(text):
INFIX_PUNCTUATION.sub(
'',
# Lower-case and normalize unicode characters
text.casefold()
remove_accents(text).casefold()
)
)

Expand Down
10 changes: 10 additions & 0 deletions analyst_sheets/terms.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from .normalize import normalize_text

KEY_TERMS = (
'adaptation',
'agency mission',
Expand Down Expand Up @@ -85,6 +87,14 @@
'underserved',
'water quality',
'wildfires',

# Español
'clima',
'climática',
'climático',
)

# Ensure key terms are normalized in the same way as our text will be.
# De-duplicate with dict.fromkeys() instead of set() so the resulting list
# keeps the order the terms were declared in and is identical on every run
# (the iteration order of a set of strings varies with hash randomization).
KEY_TERMS = list(dict.fromkeys(normalize_text(term) for term in KEY_TERMS))

# Largest number of space-separated words found in any single key term.
KEY_TERM_GRAMS = max(len(term.split(' ')) for term in KEY_TERMS)