Skip to content

Commit 6265ed9

Browse files
committed
Fix some Wikipedia edge cases (closes #185)
- Disambiguation cases are now plainly stated as such.
- Pages that have a banner followed by an empty `<p>` tag now correctly find the first non-empty `<p>` tag.
- URLs are now listed as the URL Wikipedia redirected us to, rather than the naive URL we attempted.
- Failures to find a Wikipedia page with a space in the name now show the original name requested rather than the underscored version.
1 parent d605e94 commit 6265ed9

File tree

1 file changed

+21
-11
lines changed

1 file changed

+21
-11
lines changed

plugins/wikipedia/plugin.py

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@
1616
DEFAULT_MAX_DESCRIPTION_LENGTH = 150
1717

1818

19+
# This is used to filter out blank paragraphs
20+
def class_is_not_mw_empty_elt(css_class):
21+
return css_class != 'mw-empty-elt'
22+
23+
1924
class WikipediaPlugin(object):
2025
def __init__(self, cardinal, config):
2126
"""Registers a callback for URL detection."""
@@ -37,15 +42,15 @@ def __init__(self, cardinal, config):
3742
self._language_code = DEFAULT_LANGUAGE_CODE
3843

3944
def _get_article_info(self, name):
40-
name = name.replace(' ', '_')
4145
url = "https://%s.wikipedia.org/wiki/%s" % (
4246
self._language_code,
43-
name,
47+
name.replace(' ', '_'),
4448
)
4549

4650
try:
4751
uh = urllib.request.urlopen(url)
48-
soup = BeautifulSoup(uh)
52+
url = uh.url
53+
soup = BeautifulSoup(uh, features="html.parser")
4954
except Exception as e:
5055
self.logger.warning(
5156
"Couldn't query Wikipedia (404?) for: %s" % name, exc_info=True
@@ -58,14 +63,19 @@ def _get_article_info(self, name):
5863
title = soup.find("h1").get_text()
5964

6065
# Manipulation to get first paragraph without HTML markup
61-
content = soup.find_all("div", id="mw-content-text")[0]
62-
first_paragraph = content.p.get_text()
63-
64-
if len(first_paragraph) > self._max_description_length:
65-
first_paragraph = first_paragraph[:self._max_description_length] + \
66-
'...'
66+
is_disambiguation = soup.find("table", id="disambigbox") is not None
67+
if is_disambiguation:
68+
summary = "Disambiguation"
6769
else:
68-
first_paragraph = first_paragraph
70+
content = soup.find_all("div", id="mw-content-text")[0]
71+
first_paragraph = content.find(
72+
"p", class_=class_is_not_mw_empty_elt).get_text().strip()
73+
74+
if len(first_paragraph) > self._max_description_length:
75+
summary = first_paragraph[:self._max_description_length] + \
76+
'...'
77+
else:
78+
summary = first_paragraph
6979
except Exception as e:
7080
self.logger.error(
7181
"Error parsing Wikipedia result for: %s" % name,
@@ -74,7 +84,7 @@ def _get_article_info(self, name):
7484

7585
return "Error parsing Wikipedia result for: %s" % name
7686

77-
return "[ Wikipedia: %s | %s | %s ]" % (title, first_paragraph, url)
87+
return "[ Wikipedia: %s | %s | %s ]" % (title, summary, url)
7888

7989
@event('urls.detection')
8090
def url_callback(self, cardinal, channel, url):

0 commit comments

Comments (0)