Skip to content

Commit 5391e1b

Browse files
committed
HOTFIX: linksearchtotal_collect query performance
Reworked the query template to go from this:
```
SELECT COUNT(*) FROM externallinks WHERE el_to_domain_index LIKE '%com.example%' AND el_to_path LIKE '%%'
```
to this (one query per protocol):
```
SELECT COUNT(*) FROM externallinks WHERE el_to_domain_index LIKE 'http://com.example%'
SELECT COUNT(*) FROM externallinks WHERE el_to_domain_index LIKE 'https://com.example%'
```
and from this:
```
SELECT COUNT(*) FROM externallinks WHERE el_to_domain_index LIKE '%com.example%' AND el_to_path LIKE '%/examples/%'
```
to this:
```
SELECT COUNT(*) FROM externallinks WHERE el_to_domain_index LIKE 'http://com.example%' AND el_to_path LIKE '/examples/%'
SELECT COUNT(*) FROM externallinks WHERE el_to_domain_index LIKE 'https://com.example%' AND el_to_path LIKE '/examples/%'
```
which is significantly faster since it avoids wildcards at the start of each LIKE pattern.
See: https://phabricator.wikimedia.org/T403209
Bug: T403209
1 parent c5cb81a commit 5391e1b

1 file changed

Lines changed: 22 additions & 15 deletions

File tree

extlinks/links/management/commands/linksearchtotal_collect.py

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@ class Command(BaseCommand):
1616
help = "Updates link totals from externallinks table"
1717

1818
def _handle(self, *args, **options):
19+
protocols = ["http", "https"]
1920

21+
print("reading wiki-list")
2022
with open(os.path.join(BASE_DIR, "wiki-list.csv"), "r") as wiki_list:
2123
csv_reader = csv.reader(wiki_list)
2224
wiki_list_data = []
@@ -27,6 +29,7 @@ def _handle(self, *args, **options):
2729

2830
total_links_dictionary = {}
2931
for i, language in enumerate(wiki_list_data):
32+
print("connecting to db {}".format(language))
3033
db = MySQLdb.connect(
3134
host="{lang}wiki.analytics.db.svc.wikimedia.cloud".format(
3235
lang=language
@@ -39,6 +42,7 @@ def _handle(self, *args, **options):
3942
cur = db.cursor()
4043

4144
for urlpattern in all_urlpatterns:
45+
print("searching url pattern {}".format(urlpattern))
4246
# For the first language, initialise tracking
4347
if i == 0:
4448
total_links_dictionary[urlpattern.pk] = 0
@@ -50,27 +54,30 @@ def _handle(self, *args, **options):
5054
url = urlpattern.url
5155

5256
url_parsed = urlparse(url)
53-
url_path = url_parsed.path
5457
url_host = url_parsed.hostname
55-
56-
query = f"""
57-
SELECT COUNT(*) FROM externallinks
58-
WHERE el_to_domain_index LIKE '%{reverse_host(url_host)}%'
59-
AND el_to_path LIKE '%{url_path}%'
60-
"""
61-
62-
cur.execute(
63-
query
64-
)
65-
66-
this_num_urls = cur.fetchone()[0]
67-
68-
total_links_dictionary[urlpattern.pk] += this_num_urls
58+
url_path = url_parsed.path
59+
for protocol in protocols:
60+
query = f"""
61+
SELECT COUNT(*) FROM externallinks
62+
WHERE el_to_domain_index LIKE '{protocol}://{reverse_host(url_host)}%'
63+
"""
64+
if len(url_path) > 0:
65+
cond = f"""AND el_to_path LIKE '{url_path}%'
66+
"""
67+
query += cond
68+
print("executing query {}".format(query))
69+
cur.execute(query)
70+
71+
this_num_urls = cur.fetchone()[0]
72+
73+
print("found {}".format(this_num_urls))
74+
total_links_dictionary[urlpattern.pk] += this_num_urls
6975

7076
for urlpattern_pk, total_count in total_links_dictionary.items():
7177
linksearch_object = LinkSearchTotal(
7278
url=URLPattern.objects.get(pk=urlpattern_pk), total=total_count
7379
)
80+
print("saving linksearch_object {}".format(linksearch_object))
7481
linksearch_object.save()
7582

7683
close_old_connections()

0 commit comments

Comments
 (0)