diff --git a/.gitignore b/.gitignore
index 6dc74e01..4b5c8f58 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,6 +45,9 @@ nosetests.xml
 coverage.xml
 *.cover
 .hypothesis/
+.pytest_cache/
+*.pytest_cache
+.pytest_cache/*
 
 # Translations
 *.mo
@@ -119,3 +122,13 @@ credentials.json
 
 # Default work dir
 work
+
+# Poetry
+poetry.lock
+
+# Claude settings
+.claude/*
+
+# Testing artifacts
+test-results/
+.benchmarks/
diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..370cf566
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,17 @@
+.PHONY: test tests coverage clean
+
+test:
+	poetry run pytest
+
+tests:
+	poetry run pytest
+
+coverage:
+	poetry run pytest --cov-report=term-missing --cov-report=html
+
+clean:
+	rm -rf .pytest_cache
+	rm -rf htmlcov
+	rm -f coverage.xml
+	rm -f .coverage
+	find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..3f1fde02
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,168 @@
+[tool.poetry]
+name = "soweego"
+version = "1.0.0"
+description = "A Wikidata bot for entity linking"
+authors = ["Marco Fossati "]
+license = "GPL-3.0"
+readme = "README.md"
+repository = "https://github.com/Wikidata/soweego"
+keywords = ["wikidata", "entity-linking", "record-linkage"]
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Topic :: Scientific/Engineering :: Information Analysis",
+]
+packages = [{include = "soweego"}]
+
+[tool.poetry.dependencies]
+python = "^3.8.1"
+click = "^8.0.0"
+jellyfish = "^0.9.0"
+joblib = "^1.0.0"
+keras = "^2.0.0"
+lxml = "^4.0.0"
+mlens = "^0.2.0"
+numpy = "^1.0.0"
+pandas = "^1.0.0"
+pywikibot = "^7.0.0"
+recordlinkage = "^0.15.0"
+regex = "^2022.0.0"
+requests = "^2.0.0"
+scikit-learn = "^1.0.0"
+sqlalchemy = "^1.4.0"
+tensorflow = "^2.0.0"
+tqdm = "^4.0.0"
+urllib3 = "^1.26.0"
+matplotlib = "^3.0.0"
+seaborn = "^0.11.0"
+
+[tool.poetry.group.dev.dependencies]
+pytest = "^7.4.0"
+pytest-cov = "^4.1.0"
+pytest-mock = "^3.11.0"
+black = "^23.0.0"
+isort = "^5.12.0"
+flake8 = "^6.0.0"
+mypy = "^1.4.0"
+pre-commit = "^3.3.0"
+
+[tool.poetry.scripts]
+soweego = "soweego.cli:cli"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.pytest.ini_options]
+minversion = "7.0"
+testpaths = ["tests"]
+python_files = ["test_*.py", "*_test.py"]
+python_classes = ["Test*"]
+python_functions = ["test_*"]
+addopts = [
+    "-ra",
+    "--strict-markers",
+    "--strict-config",
+    "--cov=soweego",
+    "--cov-branch",
+    "--cov-report=term-missing:skip-covered",
+    "--cov-report=html:htmlcov",
+    "--cov-report=xml:coverage.xml",
+    # "--cov-fail-under=80",  # Uncomment when actual tests are written
+    "-vv"
+]
+markers = [
+    "unit: Unit tests",
+    "integration: Integration tests",
+    "slow: Slow running tests"
+]
+filterwarnings = [
+    "ignore::DeprecationWarning",
+    "ignore::PendingDeprecationWarning"
+]
+
+[tool.coverage.run]
+source = ["soweego"]
+branch = true
+parallel = true
+omit = [
+    "*/tests/*",
+    "*/test_*.py",
+    "*/__pycache__/*",
+    "*/site-packages/*",
+    "*/distutils/*",
+    "*/.venv/*",
+    "*/venv/*"
+]
+
+[tool.coverage.report]
+precision = 2
+show_missing = true
+skip_covered = true
+# fail_under = 80  # Uncomment when actual tests are written
+exclude_lines = [
+    "pragma: no cover",
+    "def __repr__",
+    "def __str__",
+    "raise AssertionError",
+    "raise NotImplementedError",
+    "if __name__ == .__main__.:",
+    "if TYPE_CHECKING:",
+    "if typing.TYPE_CHECKING:",
+    "@abstractmethod",
+    "@abc.abstractmethod",
+    "except ImportError:",
+    "pass"
+]
+
+[tool.coverage.html]
+directory = "htmlcov"
+
+[tool.coverage.xml]
+output = "coverage.xml"
+
+[tool.isort]
+profile = "black"
+line_length = 88
+multi_line_output = 3
+include_trailing_comma = true
+force_grid_wrap = 0
+use_parentheses = true
+ensure_newline_before_comments = true
+
+[tool.black]
+line-length = 88
+target-version = ['py38', 'py39', 'py310']
+include = '\.pyi?$'
+exclude = '''
+/(
+    \.eggs
+  | \.git
+  | \.hg
+  | \.mypy_cache
+  | \.tox
+  | \.venv
+  | _build
+  | buck-out
+  | build
+  | dist
+  | tests/fixtures
+)/
+'''
+
+[tool.mypy]
+python_version = "3.8"
+warn_return_any = true
+warn_unused_configs = true
+disallow_untyped_defs = false
+disallow_any_generics = false
+ignore_missing_imports = true
+no_implicit_optional = true
+warn_redundant_casts = true
+warn_unused_ignores = true
+warn_unreachable = true
+strict_equality = true
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 00000000..b0dea0b4
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,288 @@
+"""Shared pytest fixtures and configuration for all tests."""
+
+import os
+import tempfile
+from pathlib import Path
+from typing import Dict, Generator, List
+from unittest.mock import MagicMock, Mock
+
+import pytest
+from click.testing import CliRunner
+
+
+@pytest.fixture
+def temp_dir() -> Generator[Path, None, None]:
+    """Create a temporary directory for test files.
+
+    Yields:
+        Path: Path to the temporary directory.
+    """
+    with tempfile.TemporaryDirectory() as tmpdir:
+        yield Path(tmpdir)
+
+
+@pytest.fixture
+def temp_file(temp_dir: Path) -> Generator[Path, None, None]:
+    """Create a temporary file for testing.
+
+    Args:
+        temp_dir: The temporary directory fixture.
+
+    Yields:
+        Path: Path to the temporary file.
+    """
+    temp_path = temp_dir / "test_file.txt"
+    temp_path.write_text("test content")
+    yield temp_path
+
+
+@pytest.fixture
+def mock_config() -> Dict[str, str]:
+    """Provide a mock configuration dictionary.
+
+    Returns:
+        Dict[str, str]: A dictionary with test configuration values.
+    """
+    return {
+        "database_url": "sqlite:///:memory:",
+        "api_key": "test_api_key",
+        "api_secret": "test_api_secret",
+        "batch_size": "100",
+        "timeout": "30",
+        "debug": "true",
+        "log_level": "DEBUG",
+        "output_dir": "/tmp/test_output",
+    }
+
+
+@pytest.fixture
+def mock_database_session():
+    """Create a mock database session.
+
+    Returns:
+        MagicMock: A mock SQLAlchemy session object.
+    """
+    session = MagicMock()
+    session.query.return_value.filter.return_value.first.return_value = None
+    session.query.return_value.filter.return_value.all.return_value = []
+    session.query.return_value.count.return_value = 0
+    session.add = MagicMock()
+    session.commit = MagicMock()
+    session.rollback = MagicMock()
+    session.close = MagicMock()
+    return session
+
+
+@pytest.fixture
+def mock_http_client():
+    """Create a mock HTTP client for API testing.
+
+    Returns:
+        Mock: A mock requests-like object.
+    """
+    client = Mock()
+    response = Mock()
+    response.status_code = 200
+    response.json.return_value = {"status": "success", "data": []}
+    response.text = '{"status": "success", "data": []}'
+    response.headers = {"Content-Type": "application/json"}
+    client.get.return_value = response
+    client.post.return_value = response
+    client.put.return_value = response
+    client.delete.return_value = response
+    return client
+
+
+@pytest.fixture
+def sample_entity_data() -> List[Dict]:
+    """Provide sample entity data for testing.
+
+    Returns:
+        List[Dict]: A list of sample entity dictionaries.
+    """
+    return [
+        {
+            "id": "Q1",
+            "label": "Test Entity 1",
+            "description": "A test entity for unit tests",
+            "aliases": ["TE1", "Entity One"],
+            "properties": {
+                "P31": "Q5",  # instance of human
+                "P569": "1990-01-01",  # date of birth
+            },
+        },
+        {
+            "id": "Q2",
+            "label": "Test Entity 2",
+            "description": "Another test entity",
+            "aliases": ["TE2", "Entity Two"],
+            "properties": {
+                "P31": "Q5",
+                "P569": "1985-06-15",
+            },
+        },
+    ]
+
+
+@pytest.fixture
+def cli_runner() -> CliRunner:
+    """Create a Click CLI test runner.
+
+    Returns:
+        CliRunner: A Click test runner instance.
+    """
+    return CliRunner()
+
+
+@pytest.fixture
+def mock_wikidata_api():
+    """Mock Wikidata API responses.
+
+    Returns:
+        Mock: A mock object simulating Wikidata API.
+    """
+    api = Mock()
+    api.get_entity.return_value = {
+        "id": "Q42",
+        "labels": {"en": {"value": "Douglas Adams"}},
+        "descriptions": {"en": {"value": "English writer"}},
+        "claims": {},
+    }
+    api.search.return_value = {
+        "search": [
+            {"id": "Q42", "label": "Douglas Adams"},
+            {"id": "Q43", "label": "Another Result"},
+        ]
+    }
+    api.create_claim.return_value = {"success": True, "claim": {"id": "test_claim_id"}}
+    return api
+
+
+@pytest.fixture
+def sample_csv_data(temp_dir: Path) -> Path:
+    """Create a sample CSV file for testing.
+
+    Args:
+        temp_dir: The temporary directory fixture.
+
+    Returns:
+        Path: Path to the created CSV file.
+    """
+    csv_path = temp_dir / "test_data.csv"
+    csv_content = """id,name,birth_date,occupation
+1,John Doe,1990-01-01,Engineer
+2,Jane Smith,1985-06-15,Scientist
+3,Bob Johnson,1978-03-22,Artist
+"""
+    csv_path.write_text(csv_content)
+    return csv_path
+
+
+@pytest.fixture
+def sample_json_data(temp_dir: Path) -> Path:
+    """Create a sample JSON file for testing.
+
+    Args:
+        temp_dir: The temporary directory fixture.
+
+    Returns:
+        Path: Path to the created JSON file.
+    """
+    import json
+
+    json_path = temp_dir / "test_data.json"
+    json_data = {
+        "entities": [
+            {"id": 1, "name": "Test 1", "type": "person"},
+            {"id": 2, "name": "Test 2", "type": "organization"},
+        ],
+        "metadata": {
+            "version": "1.0",
+            "created": "2024-01-01",
+        },
+    }
+    json_path.write_text(json.dumps(json_data, indent=2))
+    return json_path
+
+
+@pytest.fixture(autouse=True)
+def reset_environment():
+    """Reset environment variables before each test.
+
+    This fixture automatically runs before each test to ensure
+    a clean environment state.
+    """
+    original_env = os.environ.copy()
+    yield
+    os.environ.clear()
+    os.environ.update(original_env)
+
+
+@pytest.fixture
+def mock_logger():
+    """Create a mock logger for testing logging behavior.
+
+    Returns:
+        Mock: A mock logger object.
+    """
+    logger = Mock()
+    logger.debug = Mock()
+    logger.info = Mock()
+    logger.warning = Mock()
+    logger.error = Mock()
+    logger.critical = Mock()
+    return logger
+
+
+@pytest.fixture
+def isolated_filesystem(tmp_path: Path, monkeypatch) -> Path:
+    """Create an isolated filesystem for testing.
+
+    Args:
+        tmp_path: pytest's built-in tmp_path fixture.
+        monkeypatch: pytest's monkeypatch fixture.
+
+    Returns:
+        Path: Path to the isolated directory.
+    """
+    monkeypatch.chdir(tmp_path)
+    return tmp_path
+
+
+@pytest.fixture
+def mock_sparql_results():
+    """Mock SPARQL query results.
+
+    Returns:
+        Dict: A dictionary simulating SPARQL results.
+    """
+    return {
+        "head": {"vars": ["item", "itemLabel", "value"]},
+        "results": {
+            "bindings": [
+                {
+                    "item": {"type": "uri", "value": "http://www.wikidata.org/entity/Q42"},
+                    "itemLabel": {"type": "literal", "value": "Douglas Adams"},
+                    "value": {"type": "literal", "value": "42"},
+                },
+                {
+                    "item": {"type": "uri", "value": "http://www.wikidata.org/entity/Q43"},
+                    "itemLabel": {"type": "literal", "value": "Test Item"},
+                    "value": {"type": "literal", "value": "123"},
+                },
+            ]
+        },
+    }
+
+
+def pytest_configure(config):
+    """Configure pytest with custom settings."""
+    config.addinivalue_line(
+        "markers", "network: mark test as requiring network access"
+    )
+    config.addinivalue_line(
+        "markers", "database: mark test as requiring database access"
+    )
+    config.addinivalue_line(
+        "markers", "wikidata: mark test as requiring Wikidata API access"
+    )
\ No newline at end of file
diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_infrastructure_validation.py b/tests/test_infrastructure_validation.py
new file mode 100644
index 00000000..1894775d
--- /dev/null
+++ b/tests/test_infrastructure_validation.py
@@ -0,0 +1,215 @@
+"""Validation tests to verify the testing infrastructure is properly configured."""
+
+import sys
+from pathlib import Path
+
+import pytest
+
+
+class TestInfrastructureValidation:
+    """Test suite to validate the testing infrastructure setup."""
+
+    def test_pytest_installed(self):
+        """Verify pytest is installed and importable."""
+        import pytest
+        assert pytest is not None
+        assert hasattr(pytest, '__version__')
+
+    def test_pytest_cov_installed(self):
+        """Verify pytest-cov is installed and importable."""
+        import pytest_cov
+        assert pytest_cov is not None
+
+    def test_pytest_mock_installed(self):
+        """Verify pytest-mock is installed and importable."""
+        import pytest_mock
+        assert pytest_mock is not None
+
+    def test_project_structure_exists(self):
+        """Verify the project structure is correctly set up."""
+        project_root = Path(__file__).parent.parent
+
+        # Check main package exists
+        assert (project_root / "soweego").exists()
+        assert (project_root / "soweego" / "__init__.py").exists()
+
+        # Check test directories exist
+        assert (project_root / "tests").exists()
+        assert (project_root / "tests" / "__init__.py").exists()
+        assert (project_root / "tests" / "unit").exists()
+        assert (project_root / "tests" / "unit" / "__init__.py").exists()
+        assert (project_root / "tests" / "integration").exists()
+        assert (project_root / "tests" / "integration" / "__init__.py").exists()
+
+        # Check configuration files exist
+        assert (project_root / "pyproject.toml").exists()
+
+    def test_conftest_fixtures_available(self, temp_dir, mock_config, cli_runner):
+        """Verify conftest fixtures are available and working."""
+        # Test temp_dir fixture
+        assert temp_dir.exists()
+        assert temp_dir.is_dir()
+
+        # Test mock_config fixture
+        assert isinstance(mock_config, dict)
+        assert "database_url" in mock_config
+        assert mock_config["database_url"] == "sqlite:///:memory:"
+
+        # Test cli_runner fixture
+        assert cli_runner is not None
+        from click.testing import CliRunner
+        assert isinstance(cli_runner, CliRunner)
+
+    def test_sample_data_fixtures(self, sample_entity_data, sample_csv_data, sample_json_data):
+        """Verify sample data fixtures are working correctly."""
+        # Test entity data
+        assert isinstance(sample_entity_data, list)
+        assert len(sample_entity_data) == 2
+        assert sample_entity_data[0]["id"] == "Q1"
+
+        # Test CSV file creation
+        assert sample_csv_data.exists()
+        assert sample_csv_data.suffix == ".csv"
+        content = sample_csv_data.read_text()
+        assert "John Doe" in content
+
+        # Test JSON file creation
+        assert sample_json_data.exists()
+        assert sample_json_data.suffix == ".json"
+        import json
+        data = json.loads(sample_json_data.read_text())
+        assert "entities" in data
+        assert len(data["entities"]) == 2
+
+    def test_mock_fixtures(self, mock_database_session, mock_http_client, mock_wikidata_api):
+        """Verify mock fixtures are properly configured."""
+        # Test database session mock
+        assert hasattr(mock_database_session, 'query')
+        assert hasattr(mock_database_session, 'commit')
+        mock_database_session.commit()  # Should not raise
+
+        # Test HTTP client mock
+        response = mock_http_client.get("http://example.com")
+        assert response.status_code == 200
+        assert response.json() == {"status": "success", "data": []}
+
+        # Test Wikidata API mock
+        entity = mock_wikidata_api.get_entity("Q42")
+        assert entity["id"] == "Q42"
+        assert "labels" in entity
+
+    @pytest.mark.unit
+    def test_unit_marker(self):
+        """Test that unit test marker is properly configured."""
+        assert True
+
+    @pytest.mark.integration
+    def test_integration_marker(self):
+        """Test that integration test marker is properly configured."""
+        assert True
+
+    @pytest.mark.slow
+    def test_slow_marker(self):
+        """Test that slow test marker is properly configured."""
+        assert True
+
+    def test_python_path_includes_project(self):
+        """Verify the project root is in Python path for imports."""
+        project_root = str(Path(__file__).parent.parent)
+        assert any(project_root in path for path in sys.path)
+
+    def test_coverage_configuration(self):
+        """Verify coverage is properly configured."""
+        from pathlib import Path
+        project_root = Path(__file__).parent.parent
+        pyproject = project_root / "pyproject.toml"
+
+        assert pyproject.exists()
+        content = pyproject.read_text()
+
+        # Check coverage configuration exists
+        assert "[tool.coverage.run]" in content
+        assert "[tool.coverage.report]" in content
+        assert "fail_under = 80" in content
+
+    def test_isolated_filesystem_fixture(self, isolated_filesystem):
+        """Test the isolated filesystem fixture."""
+        # Should be in a temporary directory
+        assert isolated_filesystem.exists()
+        assert isolated_filesystem.is_dir()
+
+        # Create a test file
+        test_file = isolated_filesystem / "test.txt"
+        test_file.write_text("test content")
+        assert test_file.exists()
+
+    def test_mock_logger_fixture(self, mock_logger):
+        """Test the mock logger fixture."""
+        # Test all log levels
+        mock_logger.debug("debug message")
+        mock_logger.info("info message")
+        mock_logger.warning("warning message")
+        mock_logger.error("error message")
+        mock_logger.critical("critical message")
+
+        # Verify calls were made
+        mock_logger.debug.assert_called_once_with("debug message")
+        mock_logger.info.assert_called_once_with("info message")
+
+    def test_mock_sparql_results_fixture(self, mock_sparql_results):
+        """Test the SPARQL results mock fixture."""
+        assert "head" in mock_sparql_results
+        assert "results" in mock_sparql_results
+
+        bindings = mock_sparql_results["results"]["bindings"]
+        assert len(bindings) == 2
+        assert bindings[0]["itemLabel"]["value"] == "Douglas Adams"
+
+    def test_environment_reset_fixture(self):
+        """Test that environment is properly reset between tests."""
+        import os
+
+        # Set a test environment variable
+        os.environ["TEST_VAR"] = "test_value"
+        assert os.environ.get("TEST_VAR") == "test_value"
+
+        # The reset_environment fixture should clean this up after the test
+
+
+class TestPytestConfiguration:
+    """Tests to verify pytest configuration is correct."""
+
+    def test_pytest_ini_options(self):
+        """Verify pytest.ini options are properly set in pyproject.toml."""
+        from pathlib import Path
+
+        project_root = Path(__file__).parent.parent
+        pyproject = project_root / "pyproject.toml"
+        content = pyproject.read_text()
+
+        # Check test paths
+        assert 'testpaths = ["tests"]' in content
+
+        # Check test discovery patterns
+        assert 'python_files = ["test_*.py", "*_test.py"]' in content
+        assert 'python_classes = ["Test*"]' in content
+        assert 'python_functions = ["test_*"]' in content
+
+        # Check coverage options
+        assert "--cov=soweego" in content
+        assert "--cov-branch" in content
+        assert "--cov-report=html:htmlcov" in content
+        assert "--cov-report=xml:coverage.xml" in content
+
+    def test_custom_markers_registered(self):
+        """Verify custom markers are properly registered."""
+        from pathlib import Path
+
+        project_root = Path(__file__).parent.parent
+        pyproject = project_root / "pyproject.toml"
+        content = pyproject.read_text()
+
+        # Check markers are defined
+        assert '"unit: Unit tests"' in content
+        assert '"integration: Integration tests"' in content
+        assert '"slow: Slow running tests"' in content
\ No newline at end of file
diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py
new file mode 100644
index 00000000..e69de29b