From 6aa79be2b25d73941de7a2ea568237f5417f391f Mon Sep 17 00:00:00 2001 From: macbre Date: Tue, 22 Dec 2020 16:44:25 +0100 Subject: [PATCH 1/6] get_table_rows_estimate: "rows" in information_schema.tables is just an estimate --- indexdigest/database.py | 4 +++- indexdigest/test/core/test_database.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/indexdigest/database.py b/indexdigest/database.py index 3ebea59d..98233b5e 100644 --- a/indexdigest/database.py +++ b/indexdigest/database.py @@ -266,11 +266,13 @@ def get_table_metadata(self, table_name): :type table_name str :rtype: dict """ - # @see https://dev.mysql.com/doc/refman/5.7/en/tables-table.html + # https://dev.mysql.com/doc/refman/5.7/en/tables-table.html + # https://mariadb.com/kb/en/information-schema-tables-table/ stats = self.query_dict_row( "SELECT ENGINE, TABLE_ROWS, DATA_LENGTH, INDEX_LENGTH " "FROM information_schema.TABLES " + self._get_information_schema_where(table_name)) + # TODO: introduce dataclass return { 'engine': stats['ENGINE'], 'rows': stats['TABLE_ROWS'], # For InnoDB the row count is only a rough estimate diff --git a/indexdigest/test/core/test_database.py b/indexdigest/test/core/test_database.py index b646fd8c..131299f0 100644 --- a/indexdigest/test/core/test_database.py +++ b/indexdigest/test/core/test_database.py @@ -150,7 +150,7 @@ def test_get_table_metadata(self): # stats self.assertEqual(meta['engine'], 'InnoDB') - self.assertEqual(meta['rows'], 3) + self.assertAlmostEqual(meta['rows'], 3, delta=1) self.assertTrue(meta['index_size'] > 0) self.assertTrue(meta['data_size'] > 0) @@ -180,7 +180,7 @@ def test_get_table_columns(self): # assert False def test_get_table_rows_estimate(self): - self.assertEqual(self.connection.get_table_rows_estimate(self.TABLE_NAME), 3) + self.assertAlmostEqual(self.connection.get_table_rows_estimate(self.TABLE_NAME), 3, delta=1) class TestsWithDatabaseMocked(TestCase): From d621a96f5ba25fac5fc6ef83ad258b9fce152b5b Mon Sep 17 00:00:00 2001 From: macbre Date: Tue, 22 Dec 2020 16:48:10 +0100 Subject: [PATCH 2/6] get_low_cardinality_indices: add debug print() --- indexdigest/linters/linter_0031_low_cardinality_index.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/indexdigest/linters/linter_0031_low_cardinality_index.py b/indexdigest/linters/linter_0031_low_cardinality_index.py index 1d65e4d3..cc87c474 100644 --- a/indexdigest/linters/linter_0031_low_cardinality_index.py +++ b/indexdigest/linters/linter_0031_low_cardinality_index.py @@ -35,6 +35,8 @@ def get_low_cardinality_indices(database): table_name=table_name, database_name=database.db_name) ) + print('get_low_cardinality_indices', list(indices)) + for index in indices: # the cardinality is too high if index['CARDINALITY'] > INDEX_CARDINALITY_THRESHOLD: From e7fd3e138f2be60a5a7c96d4c455d69d68fb8c2d Mon Sep 17 00:00:00 2001 From: macbre Date: Tue, 22 Dec 2020 17:02:01 +0100 Subject: [PATCH 3/6] get_low_cardinality_indices: move debug print() --- indexdigest/linters/linter_0031_low_cardinality_index.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/indexdigest/linters/linter_0031_low_cardinality_index.py b/indexdigest/linters/linter_0031_low_cardinality_index.py index cc87c474..d5d6b74e 100644 --- a/indexdigest/linters/linter_0031_low_cardinality_index.py +++ b/indexdigest/linters/linter_0031_low_cardinality_index.py @@ -35,14 +35,14 @@ def get_low_cardinality_indices(database): table_name=table_name, database_name=database.db_name) ) - print('get_low_cardinality_indices', list(indices)) - for index in indices: + print('idx', table_name, rows_count, index) + # the cardinality is too high if index['CARDINALITY'] > INDEX_CARDINALITY_THRESHOLD: continue - yield (table_name, rows_count, index) + yield table_name, rows_count, index def check_low_cardinality_index(database): From 715696fd1c3b2e211e7ce65647c35758dbd6ebfa Mon Sep 17 00:00:00 2001 From: macbre Date: Tue, 22 Dec 2020 20:33:18 +0100 Subject: [PATCH 4/6] linter_0031_low_cardinality_index.py: increase thresholds --- indexdigest/linters/linter_0031_low_cardinality_index.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/indexdigest/linters/linter_0031_low_cardinality_index.py b/indexdigest/linters/linter_0031_low_cardinality_index.py index d5d6b74e..95d8d309 100644 --- a/indexdigest/linters/linter_0031_low_cardinality_index.py +++ b/indexdigest/linters/linter_0031_low_cardinality_index.py @@ -6,10 +6,10 @@ from indexdigest.utils import LinterEntry # skip small tables -ROWS_COUNT_THRESHOLD = 1000 +ROWS_COUNT_THRESHOLD = 100000 # cardinality threshold -INDEX_CARDINALITY_THRESHOLD = 5 +INDEX_CARDINALITY_THRESHOLD = 6 # the least frequent value should be used at most by x% rows INDEX_VALUE_PERCENTAGE_THRESHOLD = 20 From 625f30f9c94d4e423b4825ca67e7efb0e22c2141 Mon Sep 17 00:00:00 2001 From: macbre Date: Tue, 22 Dec 2020 20:33:37 +0100 Subject: [PATCH 5/6] Remove debug code --- indexdigest/linters/linter_0031_low_cardinality_index.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/indexdigest/linters/linter_0031_low_cardinality_index.py b/indexdigest/linters/linter_0031_low_cardinality_index.py index 95d8d309..399610c5 100644 --- a/indexdigest/linters/linter_0031_low_cardinality_index.py +++ b/indexdigest/linters/linter_0031_low_cardinality_index.py @@ -36,8 +36,6 @@ def get_low_cardinality_indices(database): ) for index in indices: - print('idx', table_name, rows_count, index) - # the cardinality is too high if index['CARDINALITY'] > INDEX_CARDINALITY_THRESHOLD: continue From 4fae8011543982a75ed731e283a2622d853a0603 Mon Sep 17 00:00:00 2001 From: macbre Date: Tue, 22 Dec 2020 20:36:08 +0100 Subject: [PATCH 6/6] test_0031_low_cardinality_index.py: cardinality threshold is now a bit greater --- .../linters/test_0031_low_cardinality_index.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/indexdigest/test/linters/test_0031_low_cardinality_index.py b/indexdigest/test/linters/test_0031_low_cardinality_index.py index 4a554ac1..f0970b7d 100644 --- a/indexdigest/test/linters/test_0031_low_cardinality_index.py +++ b/indexdigest/test/linters/test_0031_low_cardinality_index.py @@ -3,7 +3,7 @@ from unittest import TestCase from indexdigest.linters.linter_0031_low_cardinality_index import \ - check_low_cardinality_index, get_low_cardinality_indices + check_low_cardinality_index, get_low_cardinality_indices, INDEX_CARDINALITY_THRESHOLD from indexdigest.test import DatabaseTestMixin @@ -15,11 +15,13 @@ def test_get_low_cardinality_indices(self): print(indices) assert len(indices) == 1 - assert indices[0][0] == '0020_big_table' - assert indices[0][2]['INDEX_NAME'] == 'num_idx' - assert indices[0][2]['COLUMN_NAME'] == 'num' - assert indices[0][2]['CARDINALITY'] > 1 - assert indices[0][2]['CARDINALITY'] < 5 + + index = indices[0] + assert index[0] == '0020_big_table' + assert index[2]['INDEX_NAME'] == 'num_idx' + assert index[2]['COLUMN_NAME'] == 'num' + assert index[2]['CARDINALITY'] > 1 + assert index[2]['CARDINALITY'] <= INDEX_CARDINALITY_THRESHOLD def test_low_cardinality_index(self): reports = list(check_low_cardinality_index(self.connection))