Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
Developer notes
===============

## Testing locally with various versions of MySQL

Assume that you want to test `index-digest` locally against MySQL v5.5:

```
docker pull mysql:5.5
sudo service mysql stop
docker run -e MYSQL_ALLOW_EMPTY_PASSWORD=yes -d -p 3306:3306 mysql:5.5
```

Wait for the MySQL instance to start up. Then, from the repository's main directory, run:

```
mysql --protocol=tcp -u root -v < setup.sql
./sql/populate.sh
make sql-console
```
24 changes: 23 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ Outputs YML file with results and metadata.

You can select which checks should be reported by the tool by using `--checks` command line option. Certain checks can also be skipped via `--skip-checks` option. Refer to `index_digest --help` for examples.

> **Number of checks**: 23
> **Number of checks**: 24

* `redundant_indices`: reports indices that are redundant and covered by other
* `non_utf_columns`: reports text columns that have character encoding set to `latin1` (utf is the way to go)
Expand All @@ -157,6 +157,7 @@ You can select which checks should be reported by the tool by using `--checks` c
* `empty_tables`: reports tables with no rows
* `generic_primary_key`: reports tables with [a primary key on `id` column](https://github.com/jarulraj/sqlcheck/blob/master/docs/logical/1004.md) (a more meaningful name should be used)
* `use_innodb`: reports tables using storage engines other than `InnoDB` (a default for MySQL 5.5+ and MariaDB 10.2+)
* `low_cardinality_index`: reports [indices with low cardinality](https://github.com/macbre/index-digest/issues/31)

### Additional checks performed on SQL log

Expand Down Expand Up @@ -414,6 +415,27 @@ having_clause → table affected: sales

(...)

------------------------------------------------------------
low_cardinality_index → table affected: 0020_big_table

✗ "num_idx" index on "num" column has low cardinality, check if it is needed

- column_name: num
- index_name: num_idx
- index_cardinality: 2
- schema: CREATE TABLE `0020_big_table` (
`item_id` int(9) NOT NULL AUTO_INCREMENT,
`val` int(9) NOT NULL,
`text` char(5) NOT NULL,
`num` int(3) NOT NULL,
PRIMARY KEY (`item_id`),
KEY `text_idx` (`text`),
KEY `num_idx` (`num`)
) ENGINE=InnoDB AUTO_INCREMENT=100001 DEFAULT CHARSET=utf8
- value_usage: 33.24788541334185

(...)

------------------------------------------------------------
data_too_old → table affected: 0028_data_too_old

Expand Down
4 changes: 3 additions & 1 deletion indexdigest/cli/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@
check_generic_primary_key, \
check_high_offset_selects, \
check_use_innodb, \
check_empty_database
check_empty_database, \
check_low_cardinality_index


def get_reports(database, sql_log=None, analyze_data=False, check_empty_databases=False):
Expand Down Expand Up @@ -99,6 +100,7 @@ def get_reports(database, sql_log=None, analyze_data=False, check_empty_database
check_empty_tables(database),
check_generic_primary_key(database),
check_use_innodb(database),
check_low_cardinality_index(database),
)

# checks that use SQL log
Expand Down
1 change: 1 addition & 0 deletions indexdigest/linters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,4 @@
from .linter_0094_generic_primary_key import check_generic_primary_key
from .linter_0118_high_offset_selects import check_high_offset_selects
from .linter_0164_empty_database import check_empty_database
from .linter_0031_low_cardinality_index import check_low_cardinality_index
81 changes: 81 additions & 0 deletions indexdigest/linters/linter_0031_low_cardinality_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
"""
This linter reports indices with low cardinality, i.e. indices on columns that hold only a few distinct values.
"""
from collections import OrderedDict

from indexdigest.utils import LinterEntry

# skip small tables: tables with an estimated rows count below this value are not checked
ROWS_COUNT_THRESHOLD = 1000

# cardinality threshold: indices with a cardinality above this value are not reported
INDEX_CARDINALITY_THRESHOLD = 5

# percentage of table rows; when the least frequent value of an indexed column
# is used by fewer rows than this, the index is considered useful and is not reported
INDEX_VALUE_PERCENTAGE_THRESHOLD = 20


def get_low_cardinality_indices(database):
    """
    Yield low cardinality indices found in the tables of the given database.

    Tables with an estimated rows count below ROWS_COUNT_THRESHOLD are skipped.
    For the remaining tables each row of INFORMATION_SCHEMA.STATISTICS with
    a cardinality not greater than INDEX_CARDINALITY_THRESHOLD is yielded as
    a (table_name, rows_count, index) tuple, where index is the statistics row
    (TABLE_NAME, INDEX_NAME, COLUMN_NAME and CARDINALITY keys).

    Rows with no cardinality reported (NULL) are skipped.

    :type database indexdigest.database.Database
    :rtype: collections.Iterable[tuple[str, int, dict]]
    """
    for table_name in database.get_tables():
        rows_count = database.get_table_rows_estimate(table_name)
        if rows_count < ROWS_COUNT_THRESHOLD:
            continue

        # get table indices statistics
        # @see https://dev.mysql.com/doc/refman/5.7/en/show-index.html
        # @see https://www.percona.com/blog/2007/08/28/do-you-always-need-index-on-where-column/
        indices = database.query_dict_rows(
            "select TABLE_NAME, INDEX_NAME, COLUMN_NAME, CARDINALITY from"
            " INFORMATION_SCHEMA.STATISTICS where"
            " TABLE_NAME = '{table_name}' AND TABLE_SCHEMA = '{database_name}'".format(
                table_name=table_name, database_name=database.db_name)
        )

        for index in indices:
            cardinality = index['CARDINALITY']

            # the CARDINALITY column is nullable; comparing None with an integer
            # raises TypeError on Python 3, so skip indices with no statistics
            if cardinality is None:
                continue

            # the cardinality is too high
            if cardinality > INDEX_CARDINALITY_THRESHOLD:
                continue

            yield (table_name, rows_count, index)


def check_low_cardinality_index(database):
    """
    Report low cardinality indices where even the least frequent value is common.

    For each index yielded by get_low_cardinality_indices() the least frequent
    value of the indexed column is looked up. The index is reported only when
    that value is used by at least INDEX_VALUE_PERCENTAGE_THRESHOLD percent of
    the rows, calculated against the table's estimated rows count.

    :type database indexdigest.database.Database
    :rtype: collections.Iterable[LinterEntry]
    """
    for table_name, rows_count, index in get_low_cardinality_indices(database):
        # the least frequent value should be used in up to 20% of rows
        # https://www.percona.com/blog/2007/08/28/do-you-always-need-index-on-where-column/
        #
        # both identifiers are quoted with backticks, so that columns named
        # using reserved words do not break the query
        row = database.query_dict_row(
            'SELECT `{column}` AS value, COUNT(*) AS cnt FROM `{table}` '
            'GROUP BY 1 ORDER BY 2 ASC LIMIT 1'.format(
                column=index['COLUMN_NAME'], table=index['TABLE_NAME']
            )
        )

        # rows_count is an estimate, hence the percentage is approximate
        value_usage = 100. * row['cnt'] / rows_count

        # the least frequent value is quite rare - it makes sense to have an index here
        if value_usage < INDEX_VALUE_PERCENTAGE_THRESHOLD:
            continue

        context = OrderedDict()
        context['column_name'] = index['COLUMN_NAME']
        context['index_name'] = index['INDEX_NAME']
        context['index_cardinality'] = int(index['CARDINALITY'])
        context['schema'] = database.get_table_schema(table_name)
        context['value_usage'] = value_usage

        yield LinterEntry(linter_type='low_cardinality_index', table_name=table_name,
                          message='"{}" index on "{}" column has low cardinality, '
                                  'check if it is needed'.
                          format(index['INDEX_NAME'], index['COLUMN_NAME']),
                          context=context)
10 changes: 8 additions & 2 deletions indexdigest/test/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def _insert_values(cursor, values):
return

# @see https://dev.mysql.com/doc/refman/5.7/en/insert.html
cursor.executemany('INSERT INTO 0020_big_table(item_id,val,text) VALUES(%s,%s,%s)', values)
cursor.executemany('INSERT INTO 0020_big_table(item_id,val,text,num) VALUES(%s,%s,%s,%s)', values)
# print(values[0], cursor.lastrowid)

def _prepare_big_table(self):
Expand All @@ -84,7 +84,10 @@ def _prepare_big_table(self):

# no? populate it
for row in self._rows():
values.append((row, val, '{:05x}'.format(row)[:5]))
# Report low cardinality indices, use only a few distinct values (#31)
num = row % 3

values.append((row, val, '{:05x}'.format(row)[:5], num))

if row % 5 == 0:
val += 1
Expand All @@ -101,6 +104,9 @@ def _prepare_big_table(self):

cursor.close()

# update key distribution statistics (#31)
self.connection.query('ANALYZE TABLE 0020_big_table')

def table_populated(self):
"""
:rtype: bool
Expand Down
38 changes: 38 additions & 0 deletions indexdigest/test/linters/test_0031_low_cardinality_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from __future__ import print_function

from unittest import TestCase

from indexdigest.linters.linter_0031_low_cardinality_index import \
check_low_cardinality_index, get_low_cardinality_indices
from indexdigest.test import DatabaseTestMixin


class TestLinter(TestCase, DatabaseTestMixin):
    """
    Tests of the low cardinality index linter.

    Both tests pass self.connection to the linter functions
    (presumably provided by DatabaseTestMixin - verify against the mixin).
    """

    def test_get_low_cardinality_indices(self):
        indices = list(get_low_cardinality_indices(self.connection))

        print(indices)

        assert len(indices) == 1
        assert indices[0][0] == '0020_big_table'
        assert indices[0][2]['INDEX_NAME'] == 'num_idx'
        assert indices[0][2]['COLUMN_NAME'] == 'num'
        assert indices[0][2]['CARDINALITY'] > 1
        assert indices[0][2]['CARDINALITY'] < 5

    def test_low_cardinality_index(self):
        reports = list(check_low_cardinality_index(self.connection))

        # check the size before reports[0] is accessed, so that an empty result
        # fails with a clear assertion error instead of an IndexError
        assert len(reports) == 1

        print(reports, reports[0].context)

        assert str(reports[0]) == '0020_big_table: "num_idx" index on "num" column ' \
                                  'has low cardinality, check if it is needed'
        assert reports[0].table_name == '0020_big_table'

        assert reports[0].context['column_name'] == 'num'
        assert reports[0].context['index_name'] == 'num_idx'
        assert isinstance(reports[0].context['index_cardinality'], int)
        assert int(reports[0].context['value_usage']) == 33
4 changes: 3 additions & 1 deletion sql/0020-big-table.sql
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ CREATE TABLE `0020_big_table` (
`item_id` int(9) NOT NULL AUTO_INCREMENT,
`val` int(9) NOT NULL,
`text` char(5) NOT NULL,
`num` int(3) NOT NULL,
PRIMARY KEY (`item_id`),
KEY text_idx (`text`)
KEY text_idx (`text`),
KEY num_idx (`num`) -- low cardinality (#31)
) CHARSET=utf8;