diff --git a/performance/compression-review/compression-review.py b/performance/compression-review/compression-review.py index eaec577..4eb2d86 100644 --- a/performance/compression-review/compression-review.py +++ b/performance/compression-review/compression-review.py @@ -47,7 +47,7 @@ def getData(appConfig): logFileHandle.write("\n") # output header to csv - logFileHandle.write("{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}\n".format('dbName','collName','numDocs','avgDocSize','sizeGB','storageGB','compRatio','compEnabled','minSample','maxSample','avgSample','minComp','maxComp','avgComp','compRatio','exceptions','compTime(ms)')) + logFileHandle.write("{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}\n".format('dbName','collName','numDocs','avgDocSize','sizeGB','storageGB','existingCompRatio','compEnabled','minSample','maxSample','avgSample','minComp','maxComp','avgComp','projectedCompRatio','exceptions','compTime(ms)')) # get databases - filter out admin, config, local, and system dbDict = client.admin.command("listDatabases",nameOnly=True,filter={"name":{"$nin":['admin','config','local','system']}})['databases'] diff --git a/sizing-tool/README.md b/sizing-tool/README.md new file mode 100644 index 0000000..5d23b3d --- /dev/null +++ b/sizing-tool/README.md @@ -0,0 +1,129 @@ +# Amazon DocumentDB Sizing Tool + +The sizing tool analyzes your MongoDB database and generates a CSV file for use with the [DocumentDB Cost Estimator](https://aws.improving.com/documentdb/cost-estimator/). The tool automatically measures compression ratios using zstd-3-dict (matching Amazon DocumentDB 8.0), collects database statistics, and produces a properly formatted CSV file ready for upload to the cost estimator. 
+
+**Note:** The tool automatically excludes:
+- System databases: `admin`, `config`, `local`, and `system`
+- Views (only collections are analyzed)
+- The `system.profile` collection
+- Collections with no documents
+
+# Requirements
+ - Python 3.7+
+ - pymongo Python package
+ - MongoDB 2.6 - 3.4 | pymongo 3.10 - 3.12
+ - MongoDB 3.6 - 5.0 | pymongo 3.12 - 4.0
+ - MongoDB 5.1+ | pymongo 4.0+
+ - DocumentDB | pymongo 3.10+
+ - If not installed - "$ pip3 install pymongo"
+ - lz4 Python package
+ - If not installed - "$ pip3 install lz4"
+ - zstandard Python package
+ - If not installed - "$ pip3 install zstandard"
+ - compression-review.py script (/performance/compression-review/compression-review.py)
+
+**Quick Install**: `pip3 install -r requirements.txt`
+
+## Using the Sizing Tool
+`python3 sizing.py --uri <uri>`
+
+- Automatically uses zstd-3-dict compression (matching DocumentDB 8.0)
+- Samples 1000 documents per collection by default
+- Run on any instance in the replica set
+- Creates a single CSV file per execution: `sizing-<timestamp>.csv`
+- The \<uri\> options can be found at https://www.mongodb.com/docs/manual/reference/connection-string/
+  - If your URI contains ampersand (&) characters they must be escaped with a backslash, or enclose your URI in double quotes
+- For DocumentDB use either the cluster endpoint or any of the instance endpoints
+
+### Optional Parameters
+
+| Parameter | Default | Description |
+| ----------- | ----------- | ----------- |
+| --sample-size | 1000 | Number of documents to sample per collection |
+| --dictionary-sample-size | 100 | Number of documents for dictionary creation |
+
+### Example Usage
+
+Localhost (no authentication):
+```
+python3 sizing.py --uri "mongodb://localhost:27017"
+```
+
+Remote server with authentication:
+```
+python3 sizing.py --uri "mongodb://username:password@hostname:27017"
+```
+
+With custom sample size:
+```
+python3 sizing.py --uri "mongodb://username:password@hostname:27017" --sample-size 2000
+```
+
+## 
Output + +The tool generates a CSV file named: `sizing-.csv` in your current working directory (where you run the command). + +Example: `sizing-20260204123045.csv` + +### CSV Columns +- **SLNo** - Serial number +- **Database_Name** - Name of the database +- **Collection_Name** - Name of the collection +- **Document_Count** - Number of documents +- **Average_Document_Size** - Average document size (bytes) +- **Total_Indexes** - Number of indexes +- **Index_Size** - Total index size (GB) +- **Index_Working_Set** - Percentage of indexes in memory (%) +- **Data_Working_Set** - Percentage of data in memory (%) +- **Inserts_Per_Day** - Daily insert operations (count) +- **Updates_Per_Day** - Daily update operations (count) +- **Deletes_Per_Day** - Daily delete operations (count) +- **Reads_Per_Day** - Daily read operations (count) +- **Compression_Ratio** - Compression ratio + +### Important Note: Manual Updates Required + +The generated CSV includes default placeholder values for workload metrics that **MUST be manually updated** in a text editor: + +| Field | Default Value | Description | +|-------|---------------|-------------| +| **Index_Working_Set** | 100 | Percentage of indexes that need to be in memory | +| **Data_Working_Set** | 10 | Percentage of data that needs to be in memory | +| **Inserts_Per_Day** | 0 | Number of insert operations per day | +| **Updates_Per_Day** | 0 | Number of update operations per day | +| **Deletes_Per_Day** | 0 | Number of delete operations per day | +| **Reads_Per_Day** | 0 | Number of read operations per day | + +**Why manual updates are required:** +- These statistics cannot be calculated automatically from database metadata +- They require knowledge of your application's workload patterns +- Accurate values are critical for proper instance sizing and cost estimation + +**How to update:** +1. Locate the generated CSV file in your current working directory (where you ran the command) +2. 
Open the CSV file in a text editor (not Excel, which may corrupt the format) +3. Locate the columns for the fields above +4. Update each row with values based on your workload knowledge +5. Save the file +6. Upload to the [DocumentDB Cost Estimator](https://aws.improving.com/documentdb/cost-estimator/) + +**Tips for determining values:** +- **Working Sets**: Use MongoDB monitoring tools or `db.serverStatus()` to understand memory usage patterns +- **Daily Operations**: Check application logs, MongoDB profiler, or monitoring dashboards for operation counts +- **Conservative estimates**: If unsure, use higher working set percentages and operation counts for safer sizing + +## How It Works +1. Runs compression-review.py to analyze compression ratios using zstd-3-dict +2. Connects to MongoDB to gather collection statistics (document counts, sizes, indexes) +3. Combines compression data with collection metadata +4. Generates a CSV file formatted for the [DocumentDB Cost Estimator](https://aws.improving.com/documentdb/cost-estimator/) +5. Cleans up temporary files + +## Next Steps +1. Run the sizing tool to generate your CSV file +2. Open the CSV and update workload metrics (working sets and daily operations) with your actual values +3. Upload the CSV to the [DocumentDB Cost Estimator](https://aws.improving.com/documentdb/cost-estimator/) +4. Review the sizing recommendations + +## License +This tool is licensed under the Apache 2.0 License. 
diff --git a/sizing-tool/requirements.txt b/sizing-tool/requirements.txt new file mode 100644 index 0000000..220379f --- /dev/null +++ b/sizing-tool/requirements.txt @@ -0,0 +1,3 @@ +pymongo +lz4 +zstandard diff --git a/sizing-tool/sizing.py b/sizing-tool/sizing.py new file mode 100644 index 0000000..9775850 --- /dev/null +++ b/sizing-tool/sizing.py @@ -0,0 +1,395 @@ +import argparse +import sys +import csv +import glob +import os +import datetime as dt +import pymongo +import importlib.util + +# Compressor to use for compression analysis +# zstd-3-dict matches Amazon DocumentDB 8.0 dictionary-based compression +COMPRESSOR = 'zstd-3-dict' + +# Fixed dictionary size in Amazon DocumentDB 8.0 dictionary-based compression +DICTIONARY_SIZE_BYTES = 4096 + +# Server alias base for output file naming +SERVER_ALIAS_BASE = 'temp' + + +def load_compression_module(): + """ + Load the compression-review.py module dynamically. + + Returns: + module: The loaded compression_review module + + Raises: + RuntimeError: If the compression-review.py file does not exist or cannot be loaded + """ + script_dir = os.path.dirname(os.path.abspath(__file__)) + compression_script = os.path.join( + script_dir, '..', 'performance', 'compression-review', 'compression-review.py' + ) + + # Check if the file exists + if not os.path.exists(compression_script): + raise RuntimeError( + f"Compression module not found at: {compression_script}\n" + f"Expected location: ../performance/compression-review/compression-review.py\n" + f"Please ensure the compression-review tool is available in the correct directory." + ) + + # Check if it's a file (not a directory) + if not os.path.isfile(compression_script): + raise RuntimeError( + f"Path exists but is not a file: {compression_script}\n" + f"Expected a Python script at this location." 
+ ) + + try: + spec = importlib.util.spec_from_file_location("compression_review", compression_script) + if spec is None or spec.loader is None: + raise RuntimeError( + f"Failed to create module spec for: {compression_script}\n" + f"The file may not be a valid Python module." + ) + + compression_module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(compression_module) + + # Verify the module has the required getData function + if not hasattr(compression_module, 'getData'): + raise RuntimeError( + f"Compression module loaded but missing required 'getData' function.\n" + f"The compression-review.py file may be corrupted or outdated." + ) + + return compression_module + + except Exception as e: + if isinstance(e, RuntimeError): + raise + raise RuntimeError( + f"Error loading compression module from {compression_script}: {e}" + ) + + +def cleanup_csv_files(csv_files): + """ + Remove CSV files and log any errors. + + Args: + csv_files: Iterable of CSV file paths to remove + """ + for csv_file in csv_files: + try: + os.remove(csv_file) + print(f"Cleaned up partial file: {csv_file}", file=sys.stderr) + except Exception as e: + print(f"Warning: Could not remove file {csv_file}: {e}", file=sys.stderr) + + +def run_compression_and_get_output(uri, sample_size, dictionary_sample_size): + """ + Run compression analysis and return the path to the generated CSV file. 
+ + Args: + uri: MongoDB connection URI + sample_size: Number of documents to sample per collection + dictionary_sample_size: Number of documents for dictionary creation + + Returns: + str: Path to the generated compression CSV file + + Raises: + RuntimeError: If compression analysis fails or no CSV file is created + """ + print("Running compression analysis...") + + # Load the compression module + compression_module = load_compression_module() + + # Create server alias with PID for concurrency safety + server_alias = f"{SERVER_ALIAS_BASE}-{os.getpid()}" + + # Get list of existing CSV files before running compression analysis + csv_pattern = f"{server_alias}-*-compression-review.csv" + existing_csv_files = set(glob.glob(csv_pattern)) + + # Configure and run compression analysis + app_config = { + 'uri': uri, + 'serverAlias': server_alias, + 'sampleSize': sample_size, + 'compressor': COMPRESSOR, + 'dictionarySampleSize': dictionary_sample_size, + 'dictionarySize': DICTIONARY_SIZE_BYTES + } + + try: + compression_module.getData(app_config) + except Exception as e: + # Clean up any partial CSV files that may have been created + current_csv_files = set(glob.glob(csv_pattern)) + new_csv_files = current_csv_files - existing_csv_files + if new_csv_files: + cleanup_csv_files(new_csv_files) + raise RuntimeError(f"Error running compression analysis: {e}") + + # Find the newly created CSV file by comparing before and after + current_csv_files = set(glob.glob(csv_pattern)) + new_csv_files = current_csv_files - existing_csv_files + + if not new_csv_files: + raise RuntimeError(f"No new CSV file created. 
Expected pattern: {csv_pattern}") + + if len(new_csv_files) > 1: + print(f"Warning: Multiple new CSV files found: {new_csv_files}", file=sys.stderr) + # Use the most recent one + latest_csv = max(new_csv_files, key=os.path.getmtime) + else: + latest_csv = new_csv_files.pop() + + print(f"Parsing results from: {latest_csv}") + return latest_csv + + +def parse_compression_csv(csv_filepath): + """ + Parse compression review CSV and extract collection data. + + Args: + csv_filepath: Path to the compression review CSV file + + Returns: + dict: Dictionary mapping 'db.collection' to compression data + + Raises: + RuntimeError: If CSV header cannot be found or file is invalid + """ + comp_data = {} + + with open(csv_filepath, 'r') as f: + # Read all lines to find where the actual data starts + lines = f.readlines() + + # Find the header line (starts with dbName) + header_idx = None + for i, line in enumerate(lines): + if line.startswith('dbName'): + header_idx = i + break + + if header_idx is None: + raise RuntimeError("Could not find data header in CSV") + + # Use DictReader for named column access + reader = csv.DictReader(lines[header_idx:]) + + for row in reader: + try: + # Access columns by name instead of index + db_name = row['dbName'] + coll_name = row['collName'] + num_docs = int(row['numDocs']) + avg_doc_size = int(row['avgDocSize']) + comp_ratio = float(row['projectedCompRatio']) + + key = f"{db_name}.{coll_name}" + comp_data[key] = { + 'db_name': db_name, + 'coll_name': coll_name, + 'num_docs': num_docs, + 'avg_doc_size': avg_doc_size, + 'comp_ratio': comp_ratio + } + except (KeyError, ValueError) as e: + # Skip rows with missing columns or invalid data + print(f"Warning: Skipping row due to error: {e}", file=sys.stderr) + continue + + return comp_data + + +def generate_sizing_csv(comp_data, uri): + """ + Generate cost estimator CSV by combining compression data with MongoDB stats. 
+ + Args: + comp_data: Dictionary of compression data from parse_compression_csv() + uri: MongoDB connection URI + + Returns: + str: Path to the generated sizing CSV file + """ + print("Connecting to MongoDB to gather additional stats...") + + # Create output CSV file + log_timestamp = dt.datetime.now(dt.timezone.utc).strftime('%Y%m%d%H%M%S') + output_filename = f"sizing-{log_timestamp}.csv" + + with pymongo.MongoClient(host=uri, appname='workload-calc', serverSelectionTimeoutMS=5000) as client: + with open(output_filename, 'w', newline='') as csvfile: + csvwriter = csv.writer(csvfile) + + # Write header + csvwriter.writerow([ + 'SLNo', 'Database_Name', 'Collection_Name', 'Document_Count', + 'Average_Document_Size', 'Total_Indexes', 'Index_Size', + 'Index_Working_Set', 'Data_Working_Set', 'Inserts_Per_Day', + 'Updates_Per_Day', 'Deletes_Per_Day', 'Reads_Per_Day', + 'Compression_Ratio' + ]) + + sl_no = 1 + + # Iterate through collections from compression data + for key, data in comp_data.items(): + db_name = data['db_name'] + coll_name = data['coll_name'] + + try: + # Get collection stats from MongoDB + stats = client[db_name].command("collStats", coll_name) + + doc_count = data['num_docs'] + avg_doc_size = data['avg_doc_size'] + total_indexes = stats.get('nindexes', 0) + index_size_bytes = stats.get('totalIndexSize', 0) + index_size_gb = index_size_bytes / (1024 * 1024 * 1024) + comp_ratio = data['comp_ratio'] + + # Default estimates for workload metrics + index_working_set = 100 + data_working_set = 10 + inserts_per_day = 0 + updates_per_day = 0 + deletes_per_day = 0 + reads_per_day = 0 + + # Write row + csvwriter.writerow([ + sl_no, + db_name, + coll_name, + doc_count, + avg_doc_size, + total_indexes, + f"{index_size_gb:.4f}", + index_working_set, + data_working_set, + inserts_per_day, + updates_per_day, + deletes_per_day, + reads_per_day, + f"{comp_ratio:.4f}" + ]) + + sl_no += 1 + + except Exception as e: + print(f"Error processing {db_name}.{coll_name}: {e}", 
file=sys.stderr) + continue + + return output_filename + + +def validate_args(args): + """ + Validate command-line arguments. + + Args: + args: Parsed arguments from argparse + + Raises: + ValueError: If any argument is invalid + """ + # Validate URI format + if not args.uri: + raise ValueError("MongoDB URI cannot be empty") + + if not (args.uri.startswith('mongodb://') or args.uri.startswith('mongodb+srv://')): + raise ValueError("MongoDB URI must start with 'mongodb://' or 'mongodb+srv://'") + + # Validate sample size (only check lower bound) + if args.sample_size <= 0: + raise ValueError(f"Sample size must be positive, got: {args.sample_size}") + + # Validate dictionary sample size (only check lower bound) + if args.dictionary_sample_size <= 0: + raise ValueError(f"Dictionary sample size must be positive, got: {args.dictionary_sample_size}") + + +def main(): + parser = argparse.ArgumentParser(description='Run compression review and analyze results') + + parser.add_argument('--uri', + required=True, + type=str, + help='MongoDB Connection URI') + + parser.add_argument('--sample-size', + required=False, + type=int, + default=1000, + help='Number of documents to sample in each collection, default 1000') + + parser.add_argument('--dictionary-sample-size', + required=False, + type=int, + default=100, + help='Number of documents to sample for dictionary creation') + + args = parser.parse_args() + + # Validate arguments + try: + validate_args(args) + except ValueError as e: + parser.error(str(e)) + + compression_csv = None # Initialize to handle cleanup in finally + + try: + # Run compression analysis and get the output CSV file + compression_csv = run_compression_and_get_output( + uri=args.uri, + sample_size=args.sample_size, + dictionary_sample_size=args.dictionary_sample_size + ) + + # Parse compression CSV to extract collection data + comp_data = parse_compression_csv(compression_csv) + + # Generate sizing CSV by combining compression data with MongoDB stats + 
output_filename = generate_sizing_csv(comp_data, args.uri) + + print(f"\nSizing CSV generated: {output_filename}") + print("\n" + "="*80) + print("IMPORTANT: Manual Updates Required") + print("="*80) + print("\nThe following fields have been set to default values and MUST be updated") + print("manually in a text editor based on your workload knowledge:\n") + print(" • Index_Working_Set (default: 100) - Percentage of indexes in memory") + print(" • Data_Working_Set (default: 10) - Percentage of data in memory") + print(" • Inserts_Per_Day (default: 0) - Daily insert operations") + print(" • Updates_Per_Day (default: 0) - Daily update operations") + print(" • Deletes_Per_Day (default: 0) - Daily delete operations") + print(" • Reads_Per_Day (default: 0) - Daily read operations") + print("\nThese statistics cannot be calculated automatically and require knowledge") + print("of your existing workload patterns. Open the CSV file in a text editor") + print("and update these values for accurate sizing recommendations.") + print("="*80 + "\n") + + except RuntimeError as e: + print(str(e), file=sys.stderr) + sys.exit(1) + finally: + # Clean up the compression-review CSV file if it was created + if compression_csv is not None: + cleanup_csv_files([compression_csv]) + +if __name__ == "__main__": + main() diff --git a/sizing-tool/test/README.md b/sizing-tool/test/README.md new file mode 100644 index 0000000..ea992d3 --- /dev/null +++ b/sizing-tool/test/README.md @@ -0,0 +1,77 @@ +# Sizing Tool Tests + +This directory contains unit tests for the sizing tool. 
+ +## Prerequisites + +- Python 3.7+ +- No external dependencies required (tests use `unittest.mock` for all external calls) +- Tests do not require MongoDB connection or the compression-review.py script + +## Running Tests + +### Run all tests +```bash +# From the test directory +python -m unittest test_sizing + +# With verbose output +python -m unittest test_sizing -v +``` + +### Run specific test class +```bash +python -m unittest test_sizing.TestValidateArgs +``` + +### Run specific test +```bash +python -m unittest test_sizing.TestValidateArgs.test_valid_args +``` + +## Test Coverage + +The test suite includes unit tests for: + +- **Argument validation** - URI format, sample sizes, parameter bounds +- **CSV parsing** - Valid data, missing headers, invalid rows, empty files +- **Compression module loading** - File existence, module validation, error handling +- **Compression execution** - Successful runs, file creation, error scenarios, cleanup +- **Sizing CSV generation** - MongoDB stats collection, multiple collections, error handling + +## Test Structure + +All tests use mocks to avoid external dependencies: +- MongoDB connections are mocked using `unittest.mock` +- File system operations use temporary files +- The compression-review.py module is mocked for isolation + +This ensures tests run quickly and don't require any external services or configuration. + +## Adding New Tests + +When adding new functionality to sizing.py: + +1. Create a new test class or add to an existing one +2. Use descriptive test names that explain what is being tested +3. Mock all external dependencies (MongoDB, file system, external modules) +4. Test both success and failure scenarios +5. 
Include edge cases and boundary conditions
+
+Example test structure:
+```python
+class TestNewFeature(unittest.TestCase):
+    """Tests for new_feature function"""
+
+    @patch('sizing.external_dependency')
+    def test_success_case(self, mock_dependency):
+        """Test successful execution"""
+        # Setup mocks
+        mock_dependency.return_value = expected_value
+
+        # Execute
+        result = new_feature()
+
+        # Assert
+        self.assertEqual(result, expected_value)
+```
diff --git a/sizing-tool/test/test_sizing.py b/sizing-tool/test/test_sizing.py
new file mode 100644
index 0000000..1cbe26d
--- /dev/null
+++ b/sizing-tool/test/test_sizing.py
@@ -0,0 +1,534 @@
+import unittest
+import os
+import csv
+import tempfile
+from unittest.mock import Mock, patch, MagicMock
+from argparse import Namespace
+import sys
+
+# Import functions from sizing.py (parent directory)
+sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
+from sizing import (
+    validate_args,
+    parse_compression_csv,
+    run_compression_and_get_output,
+    generate_sizing_csv,
+    load_compression_module
+)
+
+
+class TestValidateArgs(unittest.TestCase):
+    """Tests for validate_args function"""
+
+    def test_valid_args(self):
+        """Test that valid arguments pass validation"""
+        args = Namespace(
+            uri='mongodb://localhost:27017',
+            sample_size=1000,
+            dictionary_sample_size=100
+        )
+        # Should not raise any exception
+        validate_args(args)
+
+    def test_valid_args_with_srv(self):
+        """Test that mongodb+srv:// URI is valid"""
+        args = Namespace(
+            uri='mongodb+srv://cluster.mongodb.net',
+            sample_size=1000,
+            dictionary_sample_size=100
+        )
+        validate_args(args)
+
+    def test_empty_uri(self):
+        """Test that empty URI raises ValueError"""
+        args = Namespace(
+            uri='',
+            sample_size=1000,
+            dictionary_sample_size=100
+        )
+        with self.assertRaisesRegex(ValueError, "MongoDB URI cannot be empty"):
+            validate_args(args)
+
+    def test_invalid_uri_format(self):
+        """Test that invalid URI format raises ValueError"""
+        args = Namespace(
+            uri='http://localhost:27017',
+            sample_size=1000,
+            dictionary_sample_size=100
+        )
+        with self.assertRaisesRegex(ValueError, "must start with 'mongodb://' or 'mongodb\\+srv://'"):
+            validate_args(args)
+
+    def test_negative_sample_size(self):
+        """Test that negative sample size raises ValueError"""
+        args = Namespace(
+            uri='mongodb://localhost:27017',
+            sample_size=-100,
+            dictionary_sample_size=100
+        )
+        with self.assertRaisesRegex(ValueError, "Sample size must be positive"):
+            validate_args(args)
+
+    def test_zero_sample_size(self):
+        """Test that zero sample size raises ValueError"""
+        args = Namespace(
+            uri='mongodb://localhost:27017',
+            sample_size=0,
+            dictionary_sample_size=100
+        )
+        with self.assertRaisesRegex(ValueError, "Sample size must be positive"):
+            validate_args(args)
+
+    def test_negative_dictionary_sample_size(self):
+        """Test that negative dictionary sample size raises ValueError"""
+        args = Namespace(
+            uri='mongodb://localhost:27017',
+            sample_size=1000,
+            dictionary_sample_size=-10
+        )
+        with self.assertRaisesRegex(ValueError, "Dictionary sample size must be positive"):
+            validate_args(args)
+
+    def test_large_values_accepted(self):
+        """Test that large values are accepted (no upper limits)"""
+        args = Namespace(
+            uri='mongodb://localhost:27017',
+            sample_size=10000000,  # 10 million
+            dictionary_sample_size=5000000  # 5 million
+        )
+        # Should not raise any exception
+        validate_args(args)
+
+
+class TestParseCompressionCsv(unittest.TestCase):
+    """Tests for parse_compression_csv function"""
+
+    def test_parse_valid_csv(self):
+        """Test parsing a valid compression CSV"""
+        # The preamble lines mimic the compression-review summary header that
+        # parse_compression_csv must skip before the 'dbName' data header.
+        csv_content = """compressor,docsSampled,dictDocsSampled,dictBytes
+zstd-3-dict,1000,100,4096
+
+dbName,collName,numDocs,avgDocSize,sizeGB,storageGB,existingCompRatio,compEnabled,minSample,maxSample,avgSample,minComp,maxComp,avgComp,projectedCompRatio,exceptions,compTime(ms)
+testdb,users,10000,512,5.0,2.5,2.0,Y/1024,256,1024,512,128,512,256,2.0,0,123.45
+testdb,orders,5000,1024,5.0,2.0,2.5,Y/1024,512,2048,1024,256,1024,512,2.0,0,234.56
+"""
+        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.csv') as f:
+            f.write(csv_content)
+            temp_file = f.name
+
+        try:
+            result = parse_compression_csv(temp_file)
+
+            self.assertEqual(len(result), 2)
+            self.assertIn('testdb.users', result)
+            self.assertIn('testdb.orders', result)
+
+            users_data = result['testdb.users']
+            self.assertEqual(users_data['db_name'], 'testdb')
+            self.assertEqual(users_data['coll_name'], 'users')
+            self.assertEqual(users_data['num_docs'], 10000)
+            self.assertEqual(users_data['avg_doc_size'], 512)
+            self.assertEqual(users_data['comp_ratio'], 2.0)
+
+            orders_data = result['testdb.orders']
+            self.assertEqual(orders_data['db_name'], 'testdb')
+            self.assertEqual(orders_data['coll_name'], 'orders')
+            self.assertEqual(orders_data['num_docs'], 5000)
+        finally:
+            os.unlink(temp_file)
+
+    def test_parse_csv_missing_header(self):
+        """Test that missing header raises RuntimeError"""
+        csv_content = """compressor,docsSampled,dictDocsSampled,dictBytes
+zstd-3-dict,1000,100,4096
+
+testdb,users,10000,512,5.0,2.5,2.0,Y/1024,256,1024,512,128,512,256,2.0,0,123.45
+"""
+        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.csv') as f:
+            f.write(csv_content)
+            temp_file = f.name
+
+        try:
+            with self.assertRaisesRegex(RuntimeError, "Could not find data header in CSV"):
+                parse_compression_csv(temp_file)
+        finally:
+            os.unlink(temp_file)
+
+    def test_parse_csv_with_invalid_row(self):
+        """Test that invalid rows are skipped with warning"""
+        csv_content = """compressor,docsSampled,dictDocsSampled,dictBytes
+zstd-3-dict,1000,100,4096
+
+dbName,collName,numDocs,avgDocSize,sizeGB,storageGB,existingCompRatio,compEnabled,minSample,maxSample,avgSample,minComp,maxComp,avgComp,projectedCompRatio,exceptions,compTime(ms)
+testdb,users,10000,512,5.0,2.5,2.0,Y/1024,256,1024,512,128,512,256,2.0,0,123.45
+testdb,invalid,not_a_number,512,5.0,2.5,2.0,Y/1024,256,1024,512,128,512,256,2.0,0,123.45
+testdb,orders,5000,1024,5.0,2.0,2.5,Y/1024,512,2048,1024,256,1024,512,2.0,0,234.56
+"""
+        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.csv') as f:
+            f.write(csv_content)
+            temp_file = f.name
+
+        try:
+            result = parse_compression_csv(temp_file)
+
+            # Should have 2 valid rows (invalid row skipped)
+            self.assertEqual(len(result), 2)
+            self.assertIn('testdb.users', result)
+            self.assertIn('testdb.orders', result)
+            self.assertNotIn('testdb.invalid', result)
+        finally:
+            os.unlink(temp_file)
+
+    def test_parse_empty_csv(self):
+        """Test parsing an empty CSV"""
+        csv_content = """compressor,docsSampled,dictDocsSampled,dictBytes
+zstd-3-dict,1000,100,4096
+
+dbName,collName,numDocs,avgDocSize,sizeGB,storageGB,existingCompRatio,compEnabled,minSample,maxSample,avgSample,minComp,maxComp,avgComp,projectedCompRatio,exceptions,compTime(ms)
+"""
+        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.csv') as f:
+            f.write(csv_content)
+            temp_file = f.name
+
+        try:
+            result = parse_compression_csv(temp_file)
+            self.assertEqual(len(result), 0)
+        finally:
+            os.unlink(temp_file)
+
+
+class TestLoadCompressionModule(unittest.TestCase):
+    """Tests for load_compression_module function"""
+
+    def test_load_module_file_not_found(self):
+        """Test that missing compression module raises RuntimeError"""
+        with patch('sizing.os.path.exists', return_value=False):
+            with self.assertRaisesRegex(RuntimeError, "Compression module not found"):
+                load_compression_module()
+
+    def test_load_module_path_is_directory(self):
+        """Test that directory path raises RuntimeError"""
+        with patch('sizing.os.path.exists', return_value=True):
+            with patch('sizing.os.path.isfile', return_value=False):
+                with self.assertRaisesRegex(RuntimeError, "Path exists but is not a file"):
+                    load_compression_module()
+
+    def test_load_module_invalid_spec(self):
+        """Test that invalid module spec raises RuntimeError"""
+        with patch('sizing.os.path.exists', return_value=True):
+            with patch('sizing.os.path.isfile', return_value=True):
+                with patch('sizing.importlib.util.spec_from_file_location', return_value=None):
+                    with self.assertRaisesRegex(RuntimeError, "Failed to create module spec"):
+                        load_compression_module()
+
+    def test_load_module_missing_getdata_function(self):
+        """Test that module without getData function raises RuntimeError"""
+        mock_module = MagicMock()
+        del mock_module.getData  # Remove the getData attribute
+
+        with patch('sizing.os.path.exists', return_value=True):
+            with patch('sizing.os.path.isfile', return_value=True):
+                with patch('sizing.importlib.util.spec_from_file_location') as mock_spec_from_file:
+                    mock_spec = MagicMock()
+                    mock_spec_from_file.return_value = mock_spec
+                    with patch('sizing.importlib.util.module_from_spec', return_value=mock_module):
+                        with self.assertRaisesRegex(RuntimeError, "missing required 'getData' function"):
+                            load_compression_module()
+
+    def test_load_module_success(self):
+        """Test successful module loading"""
+        mock_module = MagicMock()
+        mock_module.getData = MagicMock()
+
+        with patch('sizing.os.path.exists', return_value=True):
+            with patch('sizing.os.path.isfile', return_value=True):
+                with patch('sizing.importlib.util.spec_from_file_location') as mock_spec_from_file:
+                    mock_spec = MagicMock()
+                    mock_spec_from_file.return_value = mock_spec
+                    with patch('sizing.importlib.util.module_from_spec', return_value=mock_module):
+                        result = load_compression_module()
+                        self.assertEqual(result, mock_module)
+                        self.assertTrue(hasattr(result, 'getData'))
+
+
+class TestRunCompressionAndGetOutput(unittest.TestCase):
+    """Tests for run_compression_and_get_output function"""
+
+    @patch('sizing.load_compression_module')
+    @patch('sizing.glob.glob')
+    def test_successful_compression_run(self, mock_glob, mock_load_compression):
+        """Test successful compression analysis run"""
+        # Setup mocks
+        mock_compression_module = MagicMock()
+        mock_load_compression.return_value = mock_compression_module
+
+        # glob is called once before getData and once after; side_effect
+        # supplies the before/after snapshots in order.
+        mock_glob.side_effect = [
+            [],  # No existing files
+            ['temp-20260209120000-compression-review.csv']  # New file created
+        ]
+
+        result = run_compression_and_get_output(
+            uri='mongodb://localhost:27017',
+            sample_size=1000,
+            dictionary_sample_size=100
+        )
+
+        self.assertEqual(result, 'temp-20260209120000-compression-review.csv')
+        mock_compression_module.getData.assert_called_once()
+        mock_load_compression.assert_called_once()
+
+    @patch('sizing.load_compression_module')
+    @patch('sizing.glob.glob')
+    def test_compression_run_with_existing_files(self, mock_glob, mock_load_compression):
+        """Test compression run when old files exist"""
+        # Setup mocks
+        mock_compression_module = MagicMock()
+        mock_load_compression.return_value = mock_compression_module
+
+        mock_glob.side_effect = [
+            ['temp-20260209110000-compression-review.csv'],  # Existing file
+            [
+                'temp-20260209110000-compression-review.csv',
+                'temp-20260209120000-compression-review.csv'
+            ]  # Old + new file
+        ]
+
+        result = run_compression_and_get_output(
+            uri='mongodb://localhost:27017',
+            sample_size=1000,
+            dictionary_sample_size=100
+        )
+
+        self.assertEqual(result, 'temp-20260209120000-compression-review.csv')
+
+    @patch('sizing.load_compression_module')
+    @patch('sizing.glob.glob')
+    def test_compression_run_no_file_created(self, mock_glob, mock_load_compression):
+        """Test error when no CSV file is created"""
+        # Setup mocks
+        mock_compression_module = MagicMock()
+        mock_load_compression.return_value = mock_compression_module
+
+        mock_glob.side_effect = [[], []]
+
+        with self.assertRaisesRegex(RuntimeError, "No new CSV file created"):
+            run_compression_and_get_output(
+                uri='mongodb://localhost:27017',
+                sample_size=1000,
+                dictionary_sample_size=100
+            )
+
+    @patch('sizing.load_compression_module')
+    @patch('sizing.glob.glob')
+    def test_compression_run_failure(self, mock_glob, mock_load_compression):
+        """Test error handling when compression analysis fails"""
+        mock_compression_module = MagicMock()
+        mock_compression_module.getData.side_effect = Exception("Connection failed")
+        mock_load_compression.return_value = mock_compression_module
+
+        mock_glob.return_value = []
+
+        with self.assertRaisesRegex(RuntimeError, "Error running compression analysis"):
+            run_compression_and_get_output(
+                uri='mongodb://localhost:27017',
+                sample_size=1000,
+                dictionary_sample_size=100
+            )
+
+    @patch('sizing.load_compression_module')
+    @patch('sizing.glob.glob')
+    @patch('sizing.os.path.getmtime')
+    def test_multiple_new_files_created(self, mock_getmtime, mock_glob, mock_load_compression):
+        """Test handling when multiple new files are created"""
+        # Setup mocks
+        mock_compression_module = MagicMock()
+        mock_load_compression.return_value = mock_compression_module
+
+        mock_glob.side_effect = [
+            [],  # No existing files
+            [
+                'temp-20260209120000-compression-review.csv',
+                'temp-20260209120001-compression-review.csv'
+            ]  # Two new files
+        ]
+
+        # Mock getmtime to return different times based on filename
+        def getmtime_side_effect(filename):
+            if '120001' in filename:
+                return 2000  # Newer file
+            else:
+                return 1000  # Older file
+
+        mock_getmtime.side_effect = getmtime_side_effect
+
+        result = run_compression_and_get_output(
+            uri='mongodb://localhost:27017',
+            sample_size=1000,
+            dictionary_sample_size=100
+        )
+
+        # Should return the most recent file
+        self.assertEqual(result, 'temp-20260209120001-compression-review.csv')
+
+    @patch('sizing.load_compression_module')
+    def test_compression_module_load_failure(self, mock_load_compression):
+        """Test error handling when compression module fails to load"""
+        mock_load_compression.side_effect = RuntimeError("Compression module not found")
+
+        with self.assertRaisesRegex(RuntimeError, "Compression module not found"):
+            run_compression_and_get_output(
+                uri='mongodb://localhost:27017',
+                sample_size=1000,
+                dictionary_sample_size=100
+            )
+
+
+class TestGenerateSizingCsv(unittest.TestCase):
+    """Tests for generate_sizing_csv function"""
+
+    @patch('sizing.pymongo.MongoClient')
+    @patch('sizing.dt.datetime')
+    def test_generate_sizing_csv_success(self, mock_datetime, mock_mongo_client):
+        """Test successful sizing CSV generation"""
+        # Setup mocks
+        mock_datetime.now.return_value.strftime.return_value = '20260209120000'
+
+        mock_client = MagicMock()
+        mock_mongo_client.return_value.__enter__.return_value = mock_client
+
+        # Mock MongoDB collStats response
+        mock_client.__getitem__.return_value.command.return_value = {
+            'nindexes': 3,
+            'totalIndexSize': 1073741824  # 1GB
+        }
+
+        comp_data = {
+            'testdb.users': {
+                'db_name': 'testdb',
+                'coll_name': 'users',
+                'num_docs': 10000,
+                'avg_doc_size': 512,
+                'comp_ratio': 2.0
+            }
+        }
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # NOTE(review): cwd is changed here and never restored, so later
+            # tests run inside a deleted tmpdir — consider registering
+            # self.addCleanup(os.chdir, original_cwd) to restore it.
+            os.chdir(tmpdir)
+
+            result = generate_sizing_csv(
+                comp_data=comp_data,
+                uri='mongodb://localhost:27017'
+            )
+
+            self.assertEqual(result, 'sizing-20260209120000.csv')
+            self.assertTrue(os.path.exists(result))
+
+            # Verify CSV content
+            with open(result, 'r') as f:
+                reader = csv.reader(f)
+                rows = list(reader)
+
+            # Check header
+            self.assertEqual(rows[0][0], 'SLNo')
+            self.assertEqual(rows[0][1], 'Database_Name')
+
+            # Check data row
+            self.assertEqual(rows[1][0], '1')
+            self.assertEqual(rows[1][1], 'testdb')
+            self.assertEqual(rows[1][2], 'users')
+            self.assertEqual(rows[1][3], '10000')
+
+    @patch('sizing.pymongo.MongoClient')
+    @patch('sizing.dt.datetime')
+    def test_generate_sizing_csv_with_error(self, mock_datetime, mock_mongo_client):
+        """Test sizing CSV generation with collection error"""
+        # Setup mocks
+        mock_datetime.now.return_value.strftime.return_value = '20260209120000'
+
+        mock_client = MagicMock()
+        mock_mongo_client.return_value.__enter__.return_value = mock_client
+
+        # Mock MongoDB collStats to raise exception
+        mock_client.__getitem__.return_value.command.side_effect = Exception("Collection not found")
+
comp_data = { + 'testdb.users': { + 'db_name': 'testdb', + 'coll_name': 'users', + 'num_docs': 10000, + 'avg_doc_size': 512, + 'comp_ratio': 2.0 + } + } + + with tempfile.TemporaryDirectory() as tmpdir: + os.chdir(tmpdir) + + result = generate_sizing_csv( + comp_data=comp_data, + uri='mongodb://localhost:27017' + ) + + # Should still create file, but with no data rows + self.assertTrue(os.path.exists(result)) + + with open(result, 'r') as f: + reader = csv.reader(f) + rows = list(reader) + + # Only header, no data rows + self.assertEqual(len(rows), 1) + + @patch('sizing.pymongo.MongoClient') + @patch('sizing.dt.datetime') + def test_generate_sizing_csv_multiple_collections(self, mock_datetime, mock_mongo_client): + """Test sizing CSV generation with multiple collections""" + # Setup mocks + mock_datetime.now.return_value.strftime.return_value = '20260209120000' + + mock_client = MagicMock() + mock_mongo_client.return_value.__enter__.return_value = mock_client + + # Mock MongoDB collStats response + mock_client.__getitem__.return_value.command.return_value = { + 'nindexes': 2, + 'totalIndexSize': 536870912 # 512MB + } + + comp_data = { + 'testdb.users': { + 'db_name': 'testdb', + 'coll_name': 'users', + 'num_docs': 10000, + 'avg_doc_size': 512, + 'comp_ratio': 2.0 + }, + 'testdb.orders': { + 'db_name': 'testdb', + 'coll_name': 'orders', + 'num_docs': 5000, + 'avg_doc_size': 1024, + 'comp_ratio': 2.5 + } + } + + with tempfile.TemporaryDirectory() as tmpdir: + os.chdir(tmpdir) + + result = generate_sizing_csv( + comp_data=comp_data, + uri='mongodb://localhost:27017' + ) + + with open(result, 'r') as f: + reader = csv.reader(f) + rows = list(reader) + + # Header + 2 data rows + self.assertEqual(len(rows), 3) + self.assertEqual(rows[1][2], 'users') + self.assertEqual(rows[2][2], 'orders') + + +if __name__ == '__main__': + unittest.main()