Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 24 additions & 3 deletions indexdigest/query.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,36 @@
"""
This module provides SQL query parsing functions
"""
import re

import sqlparse

from sqlparse.sql import TokenList
from sqlparse.tokens import Name, Whitespace, Wildcard


def preprocess_query(query):
    """
    Perform initial query cleanup

    Removes a back-tick quoted table alias that directly follows a back-tick
    quoted table name in a FROM or JOIN clause (matched case-insensitively).
    The table name itself and the whitespace in front of it are left intact;
    the rest of the query (including any references to the removed alias)
    is not touched.

    :type query str
    :rtype str
    """
    # 1. remove aliases
    # FROM `dimension_wikis` `dw`
    # INNER JOIN `fact_wam_scores` `fwN`
    #
    # \s+ is used between the keyword, the table name and the alias so that
    # queries formatted with several spaces, tabs or line breaks are handled
    # the same way as single-space ones. Group 1 (keyword + table name, with
    # its original whitespace) is kept, only the alias part is dropped.
    query = re.sub(
        r'(\s(FROM|JOIN)\s+`[^`]+`)\s+`[^`]+`',
        r'\1',
        query,
        flags=re.IGNORECASE
    )

    return query


def get_query_tokens(query):
"""
:type query str
:rtype: list[sqlparse.sql.Token]
"""
query = preprocess_query(query)

tokens = TokenList(sqlparse.parse(query)[0].tokens).flatten()
# print([(token.value, token.ttype) for token in tokens])

Expand Down Expand Up @@ -76,9 +95,11 @@ def get_query_tables(query):
# print([last_keyword, last_token, token.value])
# analyze the name tokens, column names and where condition values
if last_keyword in ['FROM', 'JOIN', 'INNER JOIN', 'LEFT JOIN', 'RIGHT JOIN', 'INTO'] \
and last_token not in ['AS']:
if token.value not in tables:
tables.append(token.value.strip('`'))
and last_token not in ['AS'] \
and token.value not in ['AS']:
table_name = token.value.strip('`')
if table_name not in tables:
tables.append(table_name)

last_token = token.value.upper()

Expand Down
24 changes: 23 additions & 1 deletion indexdigest/test/core/test_query.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,21 @@
from unittest import TestCase

from indexdigest.query import get_query_columns, get_query_tables
from indexdigest.query import preprocess_query, get_query_columns, get_query_tables


class TestUtils(TestCase):

def test_preprocess_query(self):
    """
    Back-tick quoted table aliases following FROM / JOIN are removed,
    the rest of the query is left unchanged
    """
    # assertEqual is used instead of the deprecated assertEquals alias
    # (the alias is no longer available in recent Python releases)

    # FROM + INNER JOIN aliases
    self.assertEqual(
        preprocess_query('SELECT DISTINCT dw.lang FROM `dimension_wikis` `dw` INNER JOIN `fact_wam_scores` `fwN` ON ((dw.wiki_id = fwN.wiki_id)) WHERE fwN.time_id = FROM_UNIXTIME(N) ORDER BY dw.lang ASC'),
        'SELECT DISTINCT dw.lang FROM `dimension_wikis` INNER JOIN `fact_wam_scores` ON ((dw.wiki_id = fwN.wiki_id)) WHERE fwN.time_id = FROM_UNIXTIME(N) ORDER BY dw.lang ASC'
    )

    # lower-case "left join" keywords, several joins in a single query
    self.assertEqual(
        preprocess_query("SELECT count(fwN.wiki_id) as wam_results_total FROM `fact_wam_scores` `fwN` left join `fact_wam_scores` `fwN` ON ((fwN.wiki_id = fwN.wiki_id) AND (fwN.time_id = FROM_UNIXTIME(N))) left join `dimension_wikis` `dw` ON ((fwN.wiki_id = dw.wiki_id)) WHERE (fwN.time_id = FROM_UNIXTIME(N)) AND (dw.url like X OR dw.title like X) AND fwN.vertical_id IN (XYZ) AND dw.lang = X AND (fwN.wiki_id NOT IN (XYZ)) AND ((dw.url IS NOT NULL AND dw.title IS NOT NULL))"),
        "SELECT count(fwN.wiki_id) as wam_results_total FROM `fact_wam_scores` left join `fact_wam_scores` ON ((fwN.wiki_id = fwN.wiki_id) AND (fwN.time_id = FROM_UNIXTIME(N))) left join `dimension_wikis` ON ((fwN.wiki_id = dw.wiki_id)) WHERE (fwN.time_id = FROM_UNIXTIME(N)) AND (dw.url like X OR dw.title like X) AND fwN.vertical_id IN (XYZ) AND dw.lang = X AND (fwN.wiki_id NOT IN (XYZ)) AND ((dw.url IS NOT NULL AND dw.title IS NOT NULL))"
    )

def test_get_query_columns(self):
self.assertListEqual(['*'],
get_query_columns('SELECT * FROM `test_table`'))
Expand Down Expand Up @@ -48,6 +59,17 @@ def test_get_query_tables(self):
self.assertListEqual(['events'],
get_query_tables("SELECT COUNT( 0 ) AS cnt, date_format(event_date, '%Y-%m-%d') AS date FROM events WHERE event_date BETWEEN '2017-10-18 00:00:00' AND '2017-10-24 23:59:59' AND wiki_id = '1289985' GROUP BY date WITH ROLLUP"))

# complex queries
# @see https://github.com/macbre/query-digest/issues/16
self.assertListEqual(['report_wiki_recent_pageviews', 'dimension_wikis'],
get_query_tables("SELECT r.wiki_id AS id, pageviews_Nday AS pageviews FROM report_wiki_recent_pageviews AS r INNER JOIN dimension_wikis AS d ON r.wiki_id = d.wiki_id WHERE d.public = X AND r.lang = X AND r.hub_name = X ORDER BY pageviews DESC LIMIT N"))

self.assertListEqual(['dimension_wikis', 'fact_wam_scores'],
get_query_tables("SELECT DISTINCT dw.lang FROM `dimension_wikis` `dw` INNER JOIN `fact_wam_scores` `fwN` ON ((dw.wiki_id = fwN.wiki_id)) WHERE fwN.time_id = FROM_UNIXTIME(N) ORDER BY dw.lang ASC"))

self.assertListEqual(['fact_wam_scores', 'dimension_wikis'],
get_query_tables("SELECT count(fwN.wiki_id) as wam_results_total FROM `fact_wam_scores` `fwN` left join `fact_wam_scores` `fwN` ON ((fwN.wiki_id = fwN.wiki_id) AND (fwN.time_id = FROM_UNIXTIME(N))) left join `dimension_wikis` `dw` ON ((fwN.wiki_id = dw.wiki_id)) WHERE (fwN.time_id = FROM_UNIXTIME(N)) AND (dw.url like X OR dw.title like X) AND fwN.vertical_id IN (XYZ) AND dw.lang = X AND (fwN.wiki_id NOT IN (XYZ)) AND ((dw.url IS NOT NULL AND dw.title IS NOT NULL))"))

# INSERT queries
self.assertListEqual(['0070_insert_ignore_table'],
get_query_tables("INSERT IGNORE INTO `0070_insert_ignore_table` VALUES (9, '123', '2017-01-01');"))
Expand Down