macbre · macbre · Oct 17, 2018 · Sep 18, 2018 · Oct 16, 2018 · Oct 16, 2018
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -0,0 +1,20 @@
+Developer notes
+===============
+
+## Testing locally with various versions of MySQL
+
+Assume that you want to test `index-digest` locally against MySQL v5.5:
+
+```
+docker pull mysql:5.5
+sudo service mysql stop
+docker run -e MYSQL_ALLOW_EMPTY_PASSWORD=yes -d -p 3306:3306 mysql:5.5
+```
+
+Wait for the MySQL instance to start up. Then, from the repository's main directory, run:
+
+```
+mysql --protocol=tcp -u root -v < setup.sql
+./sql/populate.sh
+make sql-console
+```
diff --git a/README.md b/README.md
@@ -147,7 +147,7 @@ Outputs YML file with results and metadata.
 
 You can select which checks should be reported by the tool by using `--checks` command line option. Certain checks can also be skipped via `--skip-checks` option. Refer to `index_digest --help` for examples.
 
-> **Number of checks**: 23
+> **Number of checks**: 24
 
 * `redundant_indices`: reports indices that are redundant and covered by other
 * `non_utf_columns`: reports text columns that have characters encoding set to `latin1` (utf is the way to go)
@@ -157,6 +157,7 @@ You can select which checks should be reported by the tool by using `--checks` c
 * `empty_tables`: reports tables with no rows
 * `generic_primary_key`: reports tables with [a primary key on `id` column](https://github.com/jarulraj/sqlcheck/blob/master/docs/logical/1004.md) (a more meaningful name should be used)
 * `use_innodb`: reports table using storage engines different than `InnoDB` (a default for MySQL 5.5+ and MariaDB 10.2+)
+* `low_cardinality_index`: reports [indices with low cardinality](https://github.com/macbre/index-digest/issues/31)
 
 ### Additional checks performed on SQL log
 
@@ -414,6 +415,27 @@ having_clause → table affected: sales
 
 (...)
 
+------------------------------------------------------------
+low_cardinality_index → table affected: 0020_big_table
+
+✗ "num_idx" index on "num" column has low cardinality, check if it is needed
+
+  - column_name: num
+  - index_name: num_idx
+  - index_cardinality: 2
+  - schema: CREATE TABLE `0020_big_table` (
+      `item_id` int(9) NOT NULL AUTO_INCREMENT,
+      `val` int(9) NOT NULL,
+      `text` char(5) NOT NULL,
+      `num` int(3) NOT NULL,
+      PRIMARY KEY (`item_id`),
+      KEY `text_idx` (`text`),
+      KEY `num_idx` (`num`)
+    ) ENGINE=InnoDB AUTO_INCREMENT=100001 DEFAULT CHARSET=utf8
+  - value_usage: 33.24788541334185
+
+(...)
+
 ------------------------------------------------------------
 data_too_old → table affected: 0028_data_too_old
 

diff --git a/indexdigest/cli/script.py b/indexdigest/cli/script.py
@@ -65,7 +65,8 @@
     check_generic_primary_key, \
     check_high_offset_selects, \
     check_use_innodb, \
-    check_empty_database
+    check_empty_database, \
+    check_low_cardinality_index
 
 
 def get_reports(database, sql_log=None, analyze_data=False, check_empty_databases=False):
@@ -99,6 +100,7 @@ def get_reports(database, sql_log=None, analyze_data=False, check_empty_database
         check_empty_tables(database),
         check_generic_primary_key(database),
         check_use_innodb(database),
+        check_low_cardinality_index(database),
     )
 
     # checks that use SQL log

diff --git a/indexdigest/linters/__init__.py b/indexdigest/linters/__init__.py
@@ -24,3 +24,4 @@
 from .linter_0094_generic_primary_key import check_generic_primary_key
 from .linter_0118_high_offset_selects import check_high_offset_selects
 from .linter_0164_empty_database import check_empty_database
+from .linter_0031_low_cardinality_index import check_low_cardinality_index
diff --git a/indexdigest/linters/linter_0031_low_cardinality_index.py b/indexdigest/linters/linter_0031_low_cardinality_index.py
@@ -0,0 +1,81 @@
+"""
+This linter reports indices with low cardinality (#31)
+"""
+from collections import OrderedDict
+
+from indexdigest.utils import LinterEntry
+
+# skip small tables: those with fewer (estimated) rows than this are not checked
+ROWS_COUNT_THRESHOLD = 1000
+
+# cardinality threshold: indices with a cardinality above this value are not reported
+INDEX_CARDINALITY_THRESHOLD = 5
+
+# an index is reported only when its least frequent value is used by at least x% of rows
+INDEX_VALUE_PERCENTAGE_THRESHOLD = 20
+
+
+def get_low_cardinality_indices(database):
+    """
+    Yield (table_name, rows_count, index) for each low-cardinality index column.
+
+    :type database  indexdigest.database.Database
+    :rtype: collections.abc.Iterator[tuple]
+    """
+    for table_name in database.get_tables():
+        rows_count = database.get_table_rows_estimate(table_name)
+        if rows_count < ROWS_COUNT_THRESHOLD:
+            continue
+
+        # get table indices statistics
+        # @see https://dev.mysql.com/doc/refman/5.7/en/show-index.html
+        # @see https://www.percona.com/blog/2007/08/28/do-you-always-need-index-on-where-column/
+        indices = database.query_dict_rows(
+            "select TABLE_NAME, INDEX_NAME, COLUMN_NAME, CARDINALITY from"
+            " INFORMATION_SCHEMA.STATISTICS where"
+            " TABLE_NAME = '{table_name}' AND TABLE_SCHEMA = '{database_name}'".format(
+                table_name=table_name, database_name=database.db_name))
+
+        for index in indices:
+            cardinality = index['CARDINALITY']
+            # CARDINALITY is nullable; None cannot be ordered against an int on Python 3
+            if cardinality is not None and cardinality <= INDEX_CARDINALITY_THRESHOLD:
+                yield (table_name, rows_count, index)
+
+
+def check_low_cardinality_index(database):
+    """
+    Report low-cardinality indices whose least frequent value is still common.
+
+    An entry is yielded when that value covers INDEX_VALUE_PERCENTAGE_THRESHOLD% of rows or more.
+    :type database  indexdigest.database.Database
+    :rtype: collections.abc.Iterator[LinterEntry]
+    """
+    for table_name, rows_count, index in get_low_cardinality_indices(database):
+        # the least frequent value should be used in up to 20% of rows
+        # https://www.percona.com/blog/2007/08/28/do-you-always-need-index-on-where-column/
+        # the column name is backtick-quoted (like the table name) to be a safe identifier
+        row = database.query_dict_row(
+            'SELECT `{column}` AS value, COUNT(*) AS cnt FROM `{table}` '
+            'GROUP BY 1 ORDER BY 2 ASC LIMIT 1'.format(
+                column=index['COLUMN_NAME'], table=index['TABLE_NAME']))
+
+        # rows_count is an estimate, hence the percentage is approximate
+        value_usage = 100. * row['cnt'] / rows_count
+
+        # the least frequent value is quite rare - it makes sense to have an index here
+        if value_usage < INDEX_VALUE_PERCENTAGE_THRESHOLD:
+            continue
+
+        context = OrderedDict()
+        context['column_name'] = index['COLUMN_NAME']
+        context['index_name'] = index['INDEX_NAME']
+        context['index_cardinality'] = int(index['CARDINALITY'])
+        context['schema'] = database.get_table_schema(table_name)
+        context['value_usage'] = value_usage
+
+        yield LinterEntry(linter_type='low_cardinality_index', table_name=table_name,
+                          message='"{}" index on "{}" column has low cardinality, '
+                                  'check if it is needed'.
+                          format(index['INDEX_NAME'], index['COLUMN_NAME']),
+                          context=context)
diff --git a/indexdigest/test/__init__.py b/indexdigest/test/__init__.py
@@ -63,7 +63,7 @@ def _insert_values(cursor, values):
             return
 
         # @see https://dev.mysql.com/doc/refman/5.7/en/insert.html
-        cursor.executemany('INSERT INTO 0020_big_table(item_id,val,text) VALUES(%s,%s,%s)', values)
+        cursor.executemany('INSERT INTO 0020_big_table(item_id,val,text,num) VALUES(%s,%s,%s,%s)', values)
         # print(values[0], cursor.lastrowid)
 
     def _prepare_big_table(self):
@@ -84,7 +84,10 @@ def _prepare_big_table(self):
 
         # no? populate it
         for row in self._rows():
-            values.append((row, val, '{:05x}'.format(row)[:5]))
+            # Report low cardinality indices, use only a few distinct values (#31)
+            num = row % 3
+
+            values.append((row, val, '{:05x}'.format(row)[:5], num))
 
             if row % 5 == 0:
                 val += 1
@@ -101,6 +104,9 @@ def _prepare_big_table(self):
 
         cursor.close()
 
+        # update key distribution statistics (#31)
+        self.connection.query('ANALYZE TABLE 0020_big_table')
+
     def table_populated(self):
         """
         :rtype: bool

diff --git a/indexdigest/test/linters/test_0031_low_cardinality_index.py b/indexdigest/test/linters/test_0031_low_cardinality_index.py
@@ -0,0 +1,38 @@
+from __future__ import print_function
+
+from unittest import TestCase
+
+from indexdigest.linters.linter_0031_low_cardinality_index import \
+    check_low_cardinality_index, get_low_cardinality_indices
+from indexdigest.test import DatabaseTestMixin
+
+
+class TestLinter(TestCase, DatabaseTestMixin):
+    """Tests of the low_cardinality_index linter (#31)."""
+
+    def test_get_low_cardinality_indices(self):
+        indices = list(get_low_cardinality_indices(self.connection))
+        print(indices)
+
+        assert len(indices) == 1
+        table_name, _, index = indices[0]
+        assert table_name == '0020_big_table'
+        assert index['INDEX_NAME'] == 'num_idx'
+        assert index['COLUMN_NAME'] == 'num'
+        # the cardinality reported by MySQL is an estimate, so only a range is checked
+        assert 1 < index['CARDINALITY'] < 5
+
+    def test_low_cardinality_index(self):
+        reports = list(check_low_cardinality_index(self.connection))
+
+        # check the length first, so that an empty result fails with a clear assertion
+        assert len(reports) == 1
+        print(reports, reports[0].context)
+
+        assert str(reports[0]) == '0020_big_table: "num_idx" index on "num" column ' \
+                                  'has low cardinality, check if it is needed'
+        assert reports[0].table_name == '0020_big_table'
+        assert reports[0].context['column_name'] == 'num'
+        assert reports[0].context['index_name'] == 'num_idx'
+        assert isinstance(reports[0].context['index_cardinality'], int)
+        assert int(reports[0].context['value_usage']) == 33
diff --git a/sql/0020-big-table.sql b/sql/0020-big-table.sql
@@ -6,6 +6,8 @@ CREATE TABLE `0020_big_table` (
 	`item_id` int(9) NOT NULL AUTO_INCREMENT,
 	`val` int(9) NOT NULL,
 	`text` char(5) NOT NULL,
+	`num` int(3) NOT NULL,
 	PRIMARY KEY (`item_id`),
-	KEY text_idx (`text`)
+	KEY text_idx (`text`),
+	KEY num_idx (`num`) -- low cardinality (#31)
 ) CHARSET=utf8;