def preprocess_query(query):
    """
    Perform initial query cleanup

    Removes backtick-quoted table aliases that directly follow a backtick-quoted
    table name in FROM / JOIN clauses, so that only the table name is left:

        FROM `dimension_wikis` `dw`  ->  FROM `dimension_wikis`

    Only the alias declaration is removed; references to the alias elsewhere
    in the query (e.g. ``dw.lang``) are left untouched.

    :type query str
    :rtype str
    """
    # 1. remove aliases
    # FROM `dimension_wikis` `dw`
    # INNER JOIN `fact_wam_scores` `fwN`
    #
    # \s+ (instead of a single \s) lets the pattern also match queries that are
    # formatted with several spaces, tabs or line breaks between the keyword,
    # the table name and the alias. The whitespace between the keyword and the
    # table name is part of the captured group and is therefore preserved as-is.
    query = re.sub(
        r'(\s(FROM|JOIN)\s+`[^`]+`)\s+`[^`]+`',
        r'\1',
        query,
        flags=re.IGNORECASE
    )

    return query
INNER JOIN `fact_wam_scores` `fwN` ON ((dw.wiki_id = fwN.wiki_id)) WHERE fwN.time_id = FROM_UNIXTIME(N) ORDER BY dw.lang ASC'), + 'SELECT DISTINCT dw.lang FROM `dimension_wikis` INNER JOIN `fact_wam_scores` ON ((dw.wiki_id = fwN.wiki_id)) WHERE fwN.time_id = FROM_UNIXTIME(N) ORDER BY dw.lang ASC' + ) + + self.assertEquals( + preprocess_query("SELECT count(fwN.wiki_id) as wam_results_total FROM `fact_wam_scores` `fwN` left join `fact_wam_scores` `fwN` ON ((fwN.wiki_id = fwN.wiki_id) AND (fwN.time_id = FROM_UNIXTIME(N))) left join `dimension_wikis` `dw` ON ((fwN.wiki_id = dw.wiki_id)) WHERE (fwN.time_id = FROM_UNIXTIME(N)) AND (dw.url like X OR dw.title like X) AND fwN.vertical_id IN (XYZ) AND dw.lang = X AND (fwN.wiki_id NOT IN (XYZ)) AND ((dw.url IS NOT NULL AND dw.title IS NOT NULL))"), + "SELECT count(fwN.wiki_id) as wam_results_total FROM `fact_wam_scores` left join `fact_wam_scores` ON ((fwN.wiki_id = fwN.wiki_id) AND (fwN.time_id = FROM_UNIXTIME(N))) left join `dimension_wikis` ON ((fwN.wiki_id = dw.wiki_id)) WHERE (fwN.time_id = FROM_UNIXTIME(N)) AND (dw.url like X OR dw.title like X) AND fwN.vertical_id IN (XYZ) AND dw.lang = X AND (fwN.wiki_id NOT IN (XYZ)) AND ((dw.url IS NOT NULL AND dw.title IS NOT NULL))" + ) + def test_get_query_columns(self): self.assertListEqual(['*'], get_query_columns('SELECT * FROM `test_table`')) @@ -48,6 +59,17 @@ def test_get_query_tables(self): self.assertListEqual(['events'], get_query_tables("SELECT COUNT( 0 ) AS cnt, date_format(event_date, '%Y-%m-%d') AS date FROM events WHERE event_date BETWEEN '2017-10-18 00:00:00' AND '2017-10-24 23:59:59' AND wiki_id = '1289985' GROUP BY date WITH ROLLUP")) + # complex queries + # @see https://github.com/macbre/query-digest/issues/16 + self.assertListEqual(['report_wiki_recent_pageviews', 'dimension_wikis'], + get_query_tables("SELECT r.wiki_id AS id, pageviews_Nday AS pageviews FROM report_wiki_recent_pageviews AS r INNER JOIN dimension_wikis AS d ON r.wiki_id = d.wiki_id WHERE 
d.public = X AND r.lang = X AND r.hub_name = X ORDER BY pageviews DESC LIMIT N")) + + self.assertListEqual(['dimension_wikis', 'fact_wam_scores'], + get_query_tables("SELECT DISTINCT dw.lang FROM `dimension_wikis` `dw` INNER JOIN `fact_wam_scores` `fwN` ON ((dw.wiki_id = fwN.wiki_id)) WHERE fwN.time_id = FROM_UNIXTIME(N) ORDER BY dw.lang ASC")) + + self.assertListEqual(['fact_wam_scores', 'dimension_wikis'], + get_query_tables("SELECT count(fwN.wiki_id) as wam_results_total FROM `fact_wam_scores` `fwN` left join `fact_wam_scores` `fwN` ON ((fwN.wiki_id = fwN.wiki_id) AND (fwN.time_id = FROM_UNIXTIME(N))) left join `dimension_wikis` `dw` ON ((fwN.wiki_id = dw.wiki_id)) WHERE (fwN.time_id = FROM_UNIXTIME(N)) AND (dw.url like X OR dw.title like X) AND fwN.vertical_id IN (XYZ) AND dw.lang = X AND (fwN.wiki_id NOT IN (XYZ)) AND ((dw.url IS NOT NULL AND dw.title IS NOT NULL))")) + # INSERT queries self.assertListEqual(['0070_insert_ignore_table'], get_query_tables("INSERT IGNORE INTO `0070_insert_ignore_table` VALUES (9, '123', '2017-01-01');"))