diff --git a/README.md b/README.md index e8ff6115..3c5b6dbd 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ The following source and target combinations can be used: ### Validate -This command runs a validation against a metadata csv file. It generates a STAC object on the fly for each metadata and run a JSON schema validation. It outputs the errors and their recurrences grouped by JSON schemas as: +This command runs a validation against a metadata csv file. It generates the corresponding STAC objects on the fly for each metadata and runs a JSON schema validation (using [jsonschema-rs](https://github.com/Stranger6667/jsonschema-rs)) for the `Items` and `Collections`. It outputs the errors and their recurrences grouped by JSON schemas as: ```json "errors": {"https://linz.github.io/stac/v0.0.7/aerial-photo/schema.json": {"'aerial-photo:run' is a required property": 4, "'aerial-photo:sequence_number' is a required property": 10} ``` @@ -78,6 +78,16 @@ This command runs a validation against a metadata csv file. 
It generates a STAC ./validate --source metadata_file.csv ``` +```shell +# Run against the `Items` only: +./validate --source metadata_file.csv --item +``` + +```shell +# Run against the `Collections` only: +./validate --source metadata_file.csv --collection +``` + ```shell # For help: ./validate --help diff --git a/topo_processor/cli/tests/main_test.py b/topo_processor/cli/tests/upload_test.py similarity index 100% rename from topo_processor/cli/tests/main_test.py rename to topo_processor/cli/tests/upload_test.py diff --git a/topo_processor/cli/main.py b/topo_processor/cli/upload.py similarity index 100% rename from topo_processor/cli/main.py rename to topo_processor/cli/upload.py diff --git a/topo_processor/cli/validate.py b/topo_processor/cli/validate.py index 8498daf9..edac0b67 100644 --- a/topo_processor/cli/validate.py +++ b/topo_processor/cli/validate.py @@ -4,7 +4,7 @@ from linz_logger import LogLevel, get_log, set_level from topo_processor.file_system.get_fs import is_s3_path -from topo_processor.stac.item_factory import process_metadata +from topo_processor.stac.validation import validate_stac from topo_processor.util import time_in_ms @@ -19,9 +19,21 @@ "-v", "--verbose", is_flag=True, - help="Use verbose to display trace logs", + help="Use verbose to display trace logs (it might be slower)", ) -def main(source, verbose): +@click.option( + "-i", + "--item", + is_flag=True, + help="Use item to validate items only", +) +@click.option( + "-c", + "--collection", + is_flag=True, + help="Use collection to validate collections only", +) +def main(source, verbose, item, collection): if verbose: set_level(LogLevel.trace) else: @@ -32,7 +44,11 @@ def main(source, verbose): if not is_s3_path(source): source = os.path.abspath(source) - process_metadata(source) + if item == collection: + validate_stac(source) + else: + validate_stac(source, item, collection) + get_log().debug( "validate completed", file=source, diff --git 
a/topo_processor/metadata/metadata_validators/metadata_validator_stac.py b/topo_processor/metadata/metadata_validators/metadata_validator_stac.py index f5fce31f..cb0ff67d 100644 --- a/topo_processor/metadata/metadata_validators/metadata_validator_stac.py +++ b/topo_processor/metadata/metadata_validators/metadata_validator_stac.py @@ -2,16 +2,15 @@ import json import urllib -from typing import TYPE_CHECKING, Any, Dict +from typing import TYPE_CHECKING, Any, Dict, Union import jsonschema_rs from linz_logger import get_log from pystac.errors import STACValidationError -from .metadata_validator import MetadataValidator +from topo_processor.stac import Collection, Item -if TYPE_CHECKING: - from topo_processor.stac import Item +from .metadata_validator import MetadataValidator class MetadataValidatorStac(MetadataValidator): @@ -28,7 +27,7 @@ def get_validator_from_uri(self, schema_uri: str) -> Any: return validator - def is_applicable(self, item: Item) -> bool: + def is_applicable(self, stac_object: Union[Item, Collection]) -> bool: return True def validate_metadata(self, item: Item) -> None: @@ -38,23 +37,27 @@ def validate_metadata(self, item: Item) -> None: except STACValidationError as e: raise STACValidationError(message=f"Not valid STAC: {e}") - def validate_metadata_with_report(self, item: Item) -> Dict[str, list[str]]: + def validate_metadata_with_report(self, stac_object: Union[Item, Collection]) -> Dict[str, list[str]]: + """Validate the STAC object (Item or Collection) against the core json schema and its extensions. 
+ Return an error report [{schemaURI, [errors]}] + """ errors_report: Dict[str, list[str]] = {} - stac_item = item.create_stac().to_dict(include_self_link=False) - schema_uris: list[str] = [item.schema] + stac_item["stac_extensions"] + stac_dict = stac_object.create_stac().to_dict(include_self_link=False) + # FIXME: Work around pystac `to_dict` serialization issue with links (https://github.com/stac-utils/pystac/issues/652) + if stac_dict["type"] == "Collection": + stac_dict["links"] = json.loads(json.dumps(stac_dict["links"])) - get_log().debug(f"{self.name}:validate_metadata_with_report", itemId=stac_item["id"]) + schema_uris: list[str] = [stac_object.schema] + stac_dict["stac_extensions"] for schema_uri in schema_uris: - get_log().trace(f"{self.name}:validate_metadata_with_report", schema=schema_uri) + get_log().trace(f"{self.name}:validate_metadata_with_report", stacId=stac_dict["id"], schema=schema_uri) current_errors = [] v = self.get_validator_from_uri(schema_uri) - - errors = v.iter_errors(stac_item) + errors = v.iter_errors(stac_dict) for error in errors: current_errors.append(error.message) - get_log().warn(f"{self.name}:validate_metadata_with_report", itemId=stac_item["id"], error=error.message) + get_log().warn(f"{self.name}:validate_metadata_with_report", stacId=stac_dict["id"], error=error.message) if current_errors: errors_report[schema_uri] = current_errors diff --git a/topo_processor/metadata/metadata_validators/tests/metadata_validator_stac_test.py b/topo_processor/metadata/metadata_validators/tests/metadata_validator_stac_test.py index 01acc44f..ff745e00 100644 --- a/topo_processor/metadata/metadata_validators/tests/metadata_validator_stac_test.py +++ b/topo_processor/metadata/metadata_validators/tests/metadata_validator_stac_test.py @@ -7,6 +7,7 @@ import topo_processor.stac as stac from topo_processor.metadata.metadata_validators.metadata_validator_stac import MetadataValidatorStac +from topo_processor.stac.validate_report import ValidateReport 
def test_validate_metadata_with_report_collection():
    """Check that validating a minimal Collection yields a report without errors."""
    validate_report: ValidateReport = ValidateReport()
    collection = stac.Collection("title_col")
    collection.description = "desc"
    collection.license = "lic"
    validator = MetadataValidatorStac()
    assert validator.is_applicable(collection)
    validate_report.add_errors(validator.validate_metadata_with_report(collection))
    # Exactly one STAC object was reported, and no schema raised an error for it.
    assert validate_report.total == 1
    assert not validate_report.report_per_error_type
+ self.id = str(ulid.ULID()) self.title = title self.items = {} + self.schema = DefaultSchemaUriMap().get_object_schema_uri(pystac.STACObjectType.COLLECTION, pystac.get_stac_version()) self.stac_extensions = set([]) def add_item(self, item: Item): @@ -95,7 +101,7 @@ def delete_temp_dir(self): def create_stac(self) -> pystac.Collection: stac = pystac.Collection( - id=str(ulid.ULID()), + id=self.id, description=self.description, extent=pystac.Extent( pystac.SpatialExtent(bboxes=self.get_bounding_boxes()), diff --git a/topo_processor/stac/item_factory.py b/topo_processor/stac/item_factory.py index b95a55a3..b03b0152 100644 --- a/topo_processor/stac/item_factory.py +++ b/topo_processor/stac/item_factory.py @@ -1,12 +1,10 @@ -from typing import Dict - from linz_logger import get_log from topo_processor.data.data_transformers import data_transformer_repo from topo_processor.file_system.get_fs import get_fs from topo_processor.file_system.get_path_with_protocol import get_path_with_protocol -from topo_processor.metadata.metadata_loaders import metadata_loader_imagery_historic, metadata_loader_repo -from topo_processor.metadata.metadata_validators import metadata_validator_repo, metadata_validator_stac +from topo_processor.metadata.metadata_loaders import metadata_loader_repo +from topo_processor.metadata.metadata_validators import metadata_validator_repo from topo_processor.stac.store import get_asset, item_store from topo_processor.util import time_in_ms @@ -19,42 +17,6 @@ def process_directory(source_dir: str) -> None: get_log().debug("Items Created", source_dir=source_dir, duration=time_in_ms() - start_time) -def process_metadata(metadata_file: str) -> None: - start_time = time_in_ms() - errors_per_item: Dict[str, Dict[str, list]] = {} - errors_per_type: Dict[str, Dict[str, int]] = {} - total_items_processed = 0 - - # Load metadata from metadata csv file - metadata_loader_imagery_historic.load_all_metadata(metadata_file) - get_log().debug("Metadata Loaded", 
def test_increment_error():
    """Check that `increment_error` keeps a separate count per schema and per error message."""
    error_report: ValidateReport = ValidateReport()
    # First occurrence of an error creates the schema entry with a count of 1.
    error_report.increment_error("schema_a", "error_1")
    assert error_report.report_per_error_type["schema_a"]["error_1"] == 1
    # Same schema and same error: the existing counter is incremented.
    error_report.increment_error("schema_a", "error_1")
    assert error_report.report_per_error_type["schema_a"]["error_1"] == 2
    # Same error message under another schema is counted independently.
    error_report.increment_error("schema_b", "error_1")
    assert error_report.report_per_error_type["schema_b"]["error_1"] == 1
    # A different error under an already known schema starts its own counter.
    error_report.increment_error("schema_a", "error_2")
    assert error_report.report_per_error_type["schema_a"]["error_2"] == 1
class ValidateReport:
    """Aggregate the schema validation errors reported for a set of STAC objects."""

    # Number of times `add_errors` has been called (one call per reported STAC object).
    total: int
    # {schema URI: {error message: number of occurrences}}
    report_per_error_type: Dict[str, Dict[str, int]]

    def __init__(self) -> None:
        self.total = 0
        self.report_per_error_type = {}

    def add_errors(self, errors_per_schema: Dict[str, List[str]]) -> None:
        """Merge the errors of one STAC object into the report.

        `errors_per_schema` maps a schema URI to the list of error messages
        raised against that schema. `total` is incremented once per call,
        even when `errors_per_schema` is empty.
        """
        for schema_uri, errors in errors_per_schema.items():
            for error in errors:
                self.increment_error(schema_uri, error)
        self.total += 1

    def increment_error(self, schema: str, error: str) -> None:
        """Add one occurrence of `error` under `schema`, creating the entries as needed."""
        counts = self.report_per_error_type.setdefault(schema, {})
        counts[error] = counts.get(error, 0) + 1


def validate_stac(metadata_file: str, validate_item: bool = True, validate_collection: bool = True) -> None:
    """Load the metadata file, validate the stored STAC objects and log a report.

    `validate_item` / `validate_collection` select which stores are validated.
    A store that is not validated keeps an empty report, so its count is logged
    as 0 and its errors as an empty dict.
    """
    start_time = time_in_ms()
    item_report = ValidateReport()
    collection_report = ValidateReport()

    # Load metadata from metadata csv file
    metadata_loader_imagery_historic.load_all_metadata(metadata_file)
    get_log().debug("Metadata Loaded", metadata_file=metadata_file, duration=time_in_ms() - start_time)

    # Validate metadata from stored STAC objects
    if validate_item:
        item_report = validate_store(item_store)
    if validate_collection:
        collection_report = validate_store(collection_store)

    # Print report
    get_log().info(
        "Metadata Validated",
        metadata_file=metadata_file,
        nbItemsValidated=item_report.total,
        nbCollectionsValidated=collection_report.total,
        duration=time_in_ms() - start_time,
        itemErrors=item_report.report_per_error_type,
        collectionErrors=collection_report.report_per_error_type,
    )


def validate_store(store: "Dict[str, Union[Item, Collection]]") -> ValidateReport:
    """Validate every valid STAC object held in `store` and return the aggregated report.

    `store` is iterated through `.values()`, so it must be a mapping (not a list).
    Objects whose `is_valid()` returns a falsy value are skipped and are not
    counted in the report total.
    """
    validate_report = ValidateReport()

    for stac_object in store.values():
        if stac_object.is_valid():
            validate_report.add_errors(metadata_validator_stac.validate_metadata_with_report(stac_object))

    return validate_report