Skip to content
This repository was archived by the owner on Sep 25, 2023. It is now read-only.
12 changes: 11 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ The following source and target combinations can be used:

### Validate

This command runs a validation against a metadata csv file. It generates a STAC object on the fly for each metadata and run a JSON schema validation. It outputs the errors and their recurrences grouped by JSON schemas as:
This command runs a validation against a metadata csv file. It generates the corresponding STAC objects on the fly for each metadata record and runs a JSON schema validation (using [jsonschema-rs](https://github.com/Stranger6667/jsonschema-rs)) for the `Items` and `Collections`. It outputs the errors and their recurrences grouped by JSON schemas as:
```json
"errors": {"https://linz.github.io/stac/v0.0.7/aerial-photo/schema.json": {"'aerial-photo:run' is a required property": 4, "'aerial-photo:sequence_number' is a required property": 10}
```
Expand All @@ -78,6 +78,16 @@ This command runs a validation against a metadata csv file. It generates a STAC
./validate --source metadata_file.csv
```

```shell
# Run against the `Items` only:
./validate --source metadata_file.csv --item
```

```shell
# Run against the `Collections` only:
./validate --source metadata_file.csv --collection
```

```shell
# For help:
./validate --help
Expand Down
24 changes: 20 additions & 4 deletions topo_processor/cli/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from linz_logger import LogLevel, get_log, set_level

from topo_processor.file_system.get_fs import is_s3_path
from topo_processor.stac.item_factory import process_metadata
from topo_processor.stac.validation import validate_stac
from topo_processor.util import time_in_ms


Expand All @@ -19,9 +19,21 @@
"-v",
"--verbose",
is_flag=True,
help="Use verbose to display trace logs",
help="Use verbose to display trace logs (it might be slower)",
)
def main(source, verbose):
@click.option(
"-i",
"--item",
is_flag=True,
help="Use item to validate items only",
)
@click.option(
"-c",
"--collection",
is_flag=True,
help="Use collection to validate collections only",
)
def main(source, verbose, item, collection):
if verbose:
set_level(LogLevel.trace)
else:
Expand All @@ -32,7 +44,11 @@ def main(source, verbose):
if not is_s3_path(source):
source = os.path.abspath(source)

process_metadata(source)
if item == collection:
validate_stac(source)
else:
validate_stac(source, item, collection)

get_log().debug(
"validate completed",
file=source,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,15 @@

import json
import urllib
from typing import TYPE_CHECKING, Any, Dict
from typing import TYPE_CHECKING, Any, Dict, Union

import jsonschema_rs
from linz_logger import get_log
from pystac.errors import STACValidationError

from .metadata_validator import MetadataValidator
from topo_processor.stac import Collection, Item

if TYPE_CHECKING:
from topo_processor.stac import Item
from .metadata_validator import MetadataValidator


class MetadataValidatorStac(MetadataValidator):
Expand All @@ -28,7 +27,7 @@ def get_validator_from_uri(self, schema_uri: str) -> Any:

return validator

def is_applicable(self, item: Item) -> bool:
def is_applicable(self, stac_object: Union[Item, Collection]) -> bool:
return True

def validate_metadata(self, item: Item) -> None:
Expand All @@ -38,23 +37,27 @@ def validate_metadata(self, item: Item) -> None:
except STACValidationError as e:
raise STACValidationError(message=f"Not valid STAC: {e}")

def validate_metadata_with_report(self, item: Item) -> Dict[str, list[str]]:
def validate_metadata_with_report(self, stac_object: Union[Item, Collection]) -> Dict[str, list[str]]:
"""Validate the STAC object (Item or Collection) against the core json schema and its extensions.
Return an error report [{schemaURI, [errors]}]
"""
errors_report: Dict[str, list[str]] = {}
stac_item = item.create_stac().to_dict(include_self_link=False)
schema_uris: list[str] = [item.schema] + stac_item["stac_extensions"]
stac_dict = stac_object.create_stac().to_dict(include_self_link=False)
# FIXME: Work around pystac `to_dict` serialization issue with links (https://github.com/stac-utils/pystac/issues/652)
if stac_dict["type"] == "Collection":
stac_dict["links"] = json.loads(json.dumps(stac_dict["links"]))

get_log().debug(f"{self.name}:validate_metadata_with_report", itemId=stac_item["id"])
schema_uris: list[str] = [stac_object.schema] + stac_dict["stac_extensions"]

for schema_uri in schema_uris:
get_log().trace(f"{self.name}:validate_metadata_with_report", schema=schema_uri)
get_log().trace(f"{self.name}:validate_metadata_with_report", stacId=stac_dict["id"], schema=schema_uri)
current_errors = []
v = self.get_validator_from_uri(schema_uri)

errors = v.iter_errors(stac_item)
errors = v.iter_errors(stac_dict)

for error in errors:
current_errors.append(error.message)
get_log().warn(f"{self.name}:validate_metadata_with_report", itemId=stac_item["id"], error=error.message)
get_log().warn(f"{self.name}:validate_metadata_with_report", stacId=stac_dict["id"], error=error.message)

if current_errors:
errors_report[schema_uri] = current_errors
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import topo_processor.stac as stac
from topo_processor.metadata.metadata_validators.metadata_validator_stac import MetadataValidatorStac
from topo_processor.stac.validate_report import ValidateReport


def test_check_validity_camera_extension():
Expand Down Expand Up @@ -89,9 +90,9 @@ def test_check_validity_scanning_extension():
validator.validate_metadata(item)


def test_validate_metadata_with_report():
"""check that the method return a report of the errors"""
errors_report: Dict[str, list] = {}
def test_validate_metadata_with_report_item():
"""check that the method return a report of the errors for an item validation"""
validate_report: ValidateReport = ValidateReport()
source_path = os.path.join(os.getcwd(), "test_data", "tiffs", "SURVEY_1", "CONTROL.tiff")
asset = stac.Asset(source_path)
item = stac.Item("item_id")
Expand All @@ -107,6 +108,23 @@ def test_validate_metadata_with_report():
item.add_extension(stac.StacExtensions.aerial_photo.value)
validator = MetadataValidatorStac()
assert validator.is_applicable(item)
errors_report = validator.validate_metadata_with_report(item)
assert '"string" is not of type "integer"' in errors_report[stac.StacExtensions.film.value]
assert '"aerial-photo:run" is a required property' in errors_report[stac.StacExtensions.aerial_photo.value]
validate_report.add_errors(validator.validate_metadata_with_report(item))
assert '"string" is not of type "integer"' in validate_report.report_per_error_type[stac.StacExtensions.film.value]
assert (
'"aerial-photo:run" is a required property'
in validate_report.report_per_error_type[stac.StacExtensions.aerial_photo.value]
)


def test_validate_metadata_with_report_collection():
    """check that the method returns a report of the errors for a collection validation"""
    validate_report: ValidateReport = ValidateReport()
    collection = stac.Collection("title_col")
    collection.description = "desc"
    collection.license = "lic"
    validator = MetadataValidatorStac()
    assert validator.is_applicable(collection)
    validate_report.add_errors(validator.validate_metadata_with_report(collection))
    # Exactly one collection was processed and a minimal collection is expected
    # to validate cleanly against the core collection schema.
    assert validate_report.total == 1
    assert not validate_report.report_per_error_type
2 changes: 2 additions & 0 deletions topo_processor/stac/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@
from .item_factory import process_directory
from .stac_extensions import StacExtensions
from .store import collection_store
from .validate_report import ValidateReport
from .validation import validate_stac
10 changes: 8 additions & 2 deletions topo_processor/stac/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
from tempfile import mkdtemp
from typing import TYPE_CHECKING, Dict, List

import pystac
import ulid
from linz_logger import get_log
from pystac import pystac
from pystac.validation.schema_uri_map import DefaultSchemaUriMap
from shapely.ops import unary_union

from topo_processor.util import Validity
Expand All @@ -21,17 +22,22 @@


class Collection(Validity):
id: str
title: str
description: str
license: str
items: Dict[str, "Item"]
providers: List[pystac.Provider]
schema: str
stac_extensions: set

def __init__(self, title: str):
super().__init__()
# FIXME: Do we want to generate this id like this?
self.id = str(ulid.ULID())
self.title = title
self.items = {}
self.schema = DefaultSchemaUriMap().get_object_schema_uri(pystac.STACObjectType.COLLECTION, pystac.get_stac_version())
self.stac_extensions = set([])

def add_item(self, item: Item):
Expand Down Expand Up @@ -95,7 +101,7 @@ def delete_temp_dir(self):

def create_stac(self) -> pystac.Collection:
stac = pystac.Collection(
id=str(ulid.ULID()),
id=self.id,
description=self.description,
extent=pystac.Extent(
pystac.SpatialExtent(bboxes=self.get_bounding_boxes()),
Expand Down
42 changes: 2 additions & 40 deletions topo_processor/stac/item_factory.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
from typing import Dict

from linz_logger import get_log

from topo_processor.data.data_transformers import data_transformer_repo
from topo_processor.file_system.get_fs import get_fs
from topo_processor.file_system.get_path_with_protocol import get_path_with_protocol
from topo_processor.metadata.metadata_loaders import metadata_loader_imagery_historic, metadata_loader_repo
from topo_processor.metadata.metadata_validators import metadata_validator_repo, metadata_validator_stac
from topo_processor.metadata.metadata_loaders import metadata_loader_repo
from topo_processor.metadata.metadata_validators import metadata_validator_repo
from topo_processor.stac.store import get_asset, item_store
from topo_processor.util import time_in_ms

Expand All @@ -19,42 +17,6 @@ def process_directory(source_dir: str) -> None:
get_log().debug("Items Created", source_dir=source_dir, duration=time_in_ms() - start_time)


def process_metadata(metadata_file: str) -> None:
    """Load items from a metadata csv file, validate each valid item against
    its STAC schemas, and log an aggregated error report (error message
    occurrence counts grouped by schema URI).
    """
    start_time = time_in_ms()
    errors_per_item: Dict[str, Dict[str, list]] = {}
    errors_per_type: Dict[str, Dict[str, int]] = {}
    total_items_processed = 0

    # Populate the item store from the metadata csv file.
    metadata_loader_imagery_historic.load_all_metadata(metadata_file)
    get_log().debug("Metadata Loaded", metadata_file=metadata_file, duration=time_in_ms() - start_time)

    # Validate every item that loaded cleanly, collecting its errors per schema.
    for item in item_store.values():
        if item.is_valid():
            errors_per_item[item.id] = metadata_validator_stac.validate_metadata_with_report(item)
            total_items_processed += 1

    # Aggregate: count identical error messages per schema URI across all items.
    for item_errors in errors_per_item.values():
        for schema_uri, messages in item_errors.items():
            schema_counts = errors_per_type.setdefault(schema_uri, {})
            for message in messages:
                schema_counts[message] = schema_counts.get(message, 0) + 1

    get_log().info(
        "Metadata Validated",
        metadata_file=metadata_file,
        nbItemsProcessed=total_items_processed,
        duration=time_in_ms() - start_time,
        errors=errors_per_type,
    )


def _create_assets(source_dir: str) -> None:
fs = get_fs(source_dir)
for (path, _, files) in fs.walk(source_dir):
Expand Down
19 changes: 19 additions & 0 deletions topo_processor/stac/tests/validate_report_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import os
from logging import error

import pytest

from topo_processor.stac.validate_report import ValidateReport


def test_increment_error():
    """check that increment_error creates a counter on first occurrence and
    increments it on subsequent occurrences, keyed by (schema, error)."""
    error_report: ValidateReport = ValidateReport()
    error_report.increment_error("schema_a", "error_1")
    assert error_report.report_per_error_type["schema_a"]["error_1"] == 1
    error_report.increment_error("schema_a", "error_1")
    assert error_report.report_per_error_type["schema_a"]["error_1"] == 2
    error_report.increment_error("schema_b", "error_1")
    assert error_report.report_per_error_type["schema_b"]["error_1"] == 1
    error_report.increment_error("schema_a", "error_2")
    assert error_report.report_per_error_type["schema_a"]["error_2"] == 1
22 changes: 22 additions & 0 deletions topo_processor/stac/validate_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from typing import Dict, List


class ValidateReport:
    """Accumulates STAC validation results: how many objects were validated
    and how often each error message occurred, grouped by schema URI."""

    # Number of STAC objects validated (with or without errors).
    total: int
    # Maps schema URI -> {error message -> occurrence count}.
    report_per_error_type: Dict[str, Dict[str, int]]

    def __init__(self):
        self.total = 0
        self.report_per_error_type = {}

    def add_errors(self, errors_per_schema: Dict[str, List[str]]) -> None:
        """Record one validated object and fold its per-schema errors into the report."""
        for schema_uri, messages in errors_per_schema.items():
            for message in messages:
                self.increment_error(schema_uri, message)
        self.total += 1

    def increment_error(self, schema: str, error: str) -> None:
        """Bump the occurrence counter for `error` under `schema`."""
        per_schema = self.report_per_error_type.setdefault(schema, {})
        per_schema[error] = per_schema.get(error, 0) + 1
49 changes: 49 additions & 0 deletions topo_processor/stac/validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from typing import Any, Dict, List, Tuple, Union

from linz_logger import get_log

from topo_processor.metadata.metadata_loaders import metadata_loader_imagery_historic
from topo_processor.metadata.metadata_validators import metadata_validator_stac
from topo_processor.stac.validate_report import ValidateReport
from topo_processor.util import time_in_ms

from .collection import Collection
from .item import Item
from .store import collection_store, item_store


def validate_stac(metadata_file: str, validate_item: bool = True, validate_collection: bool = True) -> None:
    """Load metadata from a csv file, build the STAC objects, and validate them.

    Both Items and Collections are validated by default; restrict the run by
    passing only one of `validate_item` / `validate_collection` as True.
    Logs a summary (counts and errors grouped by schema URI) when done.
    """
    start_time = time_in_ms()
    item_report = ValidateReport()
    collection_report = ValidateReport()

    # Populate the item/collection stores from the metadata csv file.
    metadata_loader_imagery_historic.load_all_metadata(metadata_file)
    get_log().debug("Metadata Loaded", metadata_file=metadata_file, duration=time_in_ms() - start_time)

    # Run validation only over the requested stores; the other report stays empty.
    if validate_item:
        item_report = validate_store(item_store)
    if validate_collection:
        collection_report = validate_store(collection_store)

    # Emit the aggregated report.
    get_log().info(
        "Metadata Validated",
        metadata_file=metadata_file,
        nbItemsValidated=item_report.total,
        nbCollectionsValidated=collection_report.total,
        duration=time_in_ms() - start_time,
        itemErrors=item_report.report_per_error_type,
        collectionErrors=collection_report.report_per_error_type,
    )


def validate_store(store: Dict[str, Union[Item, Collection]]) -> ValidateReport:
    """Validate every valid STAC object in `store` and return the error report.

    `store` maps id -> Item/Collection: the body iterates `store.values()`,
    so the parameter is a mapping, not a List (fixes the previous annotation).
    Objects flagged invalid during metadata loading are skipped.
    """
    validate_report: ValidateReport = ValidateReport()

    for stac_object in store.values():
        if stac_object.is_valid():
            validate_report.add_errors(metadata_validator_stac.validate_metadata_with_report(stac_object))

    return validate_report
4 changes: 2 additions & 2 deletions upload
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python3

from topo_processor.cli import main
from topo_processor.cli import upload

main.main()
upload.main()