Skip to content
This repository was archived by the owner on Sep 25, 2023. It is now read-only.

Commit 19a8c1a

Browse files
authored
feat(validate): validate STAC Collection (TDE-170) (#416)
1 parent 3b2c726 commit 19a8c1a

13 files changed

Lines changed: 175 additions & 68 deletions

File tree

README.md

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ The following source and target combinations can be used:
6868

6969
### Validate
7070

71-
This command runs a validation against a metadata csv file. It generates a STAC object on the fly for each metadata and run a JSON schema validation. It outputs the errors and their recurrences grouped by JSON schemas as:
71+
This command runs a validation against a metadata CSV file. It generates the corresponding STAC objects on the fly for each metadata entry and runs a JSON schema validation (using [jsonschema-rs](https://github.com/Stranger6667/jsonschema-rs)) against the `Items` and `Collections`. It outputs the errors and their number of occurrences, grouped by JSON schema, as:
7272
```json
7373
"errors": {"https://linz.github.io/stac/v0.0.7/aerial-photo/schema.json": {"'aerial-photo:run' is a required property": 4, "'aerial-photo:sequence_number' is a required property": 10}}
7474
```
@@ -78,6 +78,16 @@ This command runs a validation against a metadata csv file. It generates a STAC
7878
./validate --source metadata_file.csv
7979
```
8080

81+
```shell
82+
# Run against the `Items` only:
83+
./validate --source metadata_file.csv --item
84+
```
85+
86+
```shell
87+
# Run against the `Collections` only:
88+
./validate --source metadata_file.csv --collection
89+
```
90+
8191
```shell
8292
# For help:
8393
./validate --help

topo_processor/cli/validate.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from linz_logger import LogLevel, get_log, set_level
55

66
from topo_processor.file_system.get_fs import is_s3_path
7-
from topo_processor.stac.item_factory import process_metadata
7+
from topo_processor.stac.validation import validate_stac
88
from topo_processor.util import time_in_ms
99

1010

@@ -19,9 +19,21 @@
1919
"-v",
2020
"--verbose",
2121
is_flag=True,
22-
help="Use verbose to display trace logs",
22+
help="Use verbose to display trace logs (it might be slower)",
2323
)
24-
def main(source, verbose):
24+
@click.option(
25+
"-i",
26+
"--item",
27+
is_flag=True,
28+
help="Use item to validate items only",
29+
)
30+
@click.option(
31+
"-c",
32+
"--collection",
33+
is_flag=True,
34+
help="Use collection to validate collections only",
35+
)
36+
def main(source, verbose, item, collection):
2537
if verbose:
2638
set_level(LogLevel.trace)
2739
else:
@@ -32,7 +44,11 @@ def main(source, verbose):
3244
if not is_s3_path(source):
3345
source = os.path.abspath(source)
3446

35-
process_metadata(source)
47+
if item == collection:
48+
validate_stac(source)
49+
else:
50+
validate_stac(source, item, collection)
51+
3652
get_log().debug(
3753
"validate completed",
3854
file=source,

topo_processor/metadata/metadata_validators/metadata_validator_stac.py

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,15 @@
22

33
import json
44
import urllib
5-
from typing import TYPE_CHECKING, Any, Dict
5+
from typing import TYPE_CHECKING, Any, Dict, Union
66

77
import jsonschema_rs
88
from linz_logger import get_log
99
from pystac.errors import STACValidationError
1010

11-
from .metadata_validator import MetadataValidator
11+
from topo_processor.stac import Collection, Item
1212

13-
if TYPE_CHECKING:
14-
from topo_processor.stac import Item
13+
from .metadata_validator import MetadataValidator
1514

1615

1716
class MetadataValidatorStac(MetadataValidator):
@@ -28,7 +27,7 @@ def get_validator_from_uri(self, schema_uri: str) -> Any:
2827

2928
return validator
3029

31-
def is_applicable(self, item: Item) -> bool:
30+
def is_applicable(self, stac_object: Union[Item, Collection]) -> bool:
3231
return True
3332

3433
def validate_metadata(self, item: Item) -> None:
@@ -38,23 +37,27 @@ def validate_metadata(self, item: Item) -> None:
3837
except STACValidationError as e:
3938
raise STACValidationError(message=f"Not valid STAC: {e}")
4039

41-
def validate_metadata_with_report(self, item: Item) -> Dict[str, list[str]]:
40+
def validate_metadata_with_report(self, stac_object: Union[Item, Collection]) -> Dict[str, list[str]]:
41+
"""Validate the STAC object (Item or Collection) against the core json schema and its extensions.
42+
Return an error report [{schemaURI, [errors]}]
43+
"""
4244
errors_report: Dict[str, list[str]] = {}
43-
stac_item = item.create_stac().to_dict(include_self_link=False)
44-
schema_uris: list[str] = [item.schema] + stac_item["stac_extensions"]
45+
stac_dict = stac_object.create_stac().to_dict(include_self_link=False)
46+
# FIXME: Work around pystac `to_dict` serialization issue with links (https://github.com/stac-utils/pystac/issues/652)
47+
if stac_dict["type"] == "Collection":
48+
stac_dict["links"] = json.loads(json.dumps(stac_dict["links"]))
4549

46-
get_log().debug(f"{self.name}:validate_metadata_with_report", itemId=stac_item["id"])
50+
schema_uris: list[str] = [stac_object.schema] + stac_dict["stac_extensions"]
4751

4852
for schema_uri in schema_uris:
49-
get_log().trace(f"{self.name}:validate_metadata_with_report", schema=schema_uri)
53+
get_log().trace(f"{self.name}:validate_metadata_with_report", stacId=stac_dict["id"], schema=schema_uri)
5054
current_errors = []
5155
v = self.get_validator_from_uri(schema_uri)
52-
53-
errors = v.iter_errors(stac_item)
56+
errors = v.iter_errors(stac_dict)
5457

5558
for error in errors:
5659
current_errors.append(error.message)
57-
get_log().warn(f"{self.name}:validate_metadata_with_report", itemId=stac_item["id"], error=error.message)
60+
get_log().warn(f"{self.name}:validate_metadata_with_report", stacId=stac_dict["id"], error=error.message)
5861

5962
if current_errors:
6063
errors_report[schema_uri] = current_errors

topo_processor/metadata/metadata_validators/tests/metadata_validator_stac_test.py

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import topo_processor.stac as stac
99
from topo_processor.metadata.metadata_validators.metadata_validator_stac import MetadataValidatorStac
10+
from topo_processor.stac.validate_report import ValidateReport
1011

1112

1213
def test_check_validity_camera_extension():
@@ -89,9 +90,9 @@ def test_check_validity_scanning_extension():
8990
validator.validate_metadata(item)
9091

9192

92-
def test_validate_metadata_with_report():
93-
"""check that the method return a report of the errors"""
94-
errors_report: Dict[str, list] = {}
93+
def test_validate_metadata_with_report_item():
94+
"""check that the method return a report of the errors for an item validation"""
95+
validate_report: ValidateReport = ValidateReport()
9596
source_path = os.path.join(os.getcwd(), "test_data", "tiffs", "SURVEY_1", "CONTROL.tiff")
9697
asset = stac.Asset(source_path)
9798
item = stac.Item("item_id")
@@ -107,6 +108,23 @@ def test_validate_metadata_with_report():
107108
item.add_extension(stac.StacExtensions.aerial_photo.value)
108109
validator = MetadataValidatorStac()
109110
assert validator.is_applicable(item)
110-
errors_report = validator.validate_metadata_with_report(item)
111-
assert '"string" is not of type "integer"' in errors_report[stac.StacExtensions.film.value]
112-
assert '"aerial-photo:run" is a required property' in errors_report[stac.StacExtensions.aerial_photo.value]
111+
validate_report.add_errors(validator.validate_metadata_with_report(item))
112+
assert '"string" is not of type "integer"' in validate_report.report_per_error_type[stac.StacExtensions.film.value]
113+
assert (
114+
'"aerial-photo:run" is a required property'
115+
in validate_report.report_per_error_type[stac.StacExtensions.aerial_photo.value]
116+
)
117+
118+
119+
def test_validate_metadata_with_report_collection():
120+
"""check that the method return a report of the errors for a collection validation"""
121+
"""check that the method return a report of the errors for an item validation"""
122+
validate_report: ValidateReport = ValidateReport()
123+
collection = stac.Collection("title_col")
124+
collection.description = "desc"
125+
collection.license = "lic"
126+
validator = MetadataValidatorStac()
127+
assert validator.is_applicable(collection)
128+
validate_report.add_errors(validator.validate_metadata_with_report(collection))
129+
assert validate_report.total == 1
130+
assert not validate_report.report_per_error_type

topo_processor/stac/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,5 @@
55
from .item_factory import process_directory
66
from .stac_extensions import StacExtensions
77
from .store import collection_store
8+
from .validate_report import ValidateReport
9+
from .validation import validate_stac

topo_processor/stac/collection.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@
66
from tempfile import mkdtemp
77
from typing import TYPE_CHECKING, Dict, List
88

9-
import pystac
109
import ulid
1110
from linz_logger import get_log
11+
from pystac import pystac
12+
from pystac.validation.schema_uri_map import DefaultSchemaUriMap
1213
from shapely.ops import unary_union
1314

1415
from topo_processor.util import Validity
@@ -21,17 +22,22 @@
2122

2223

2324
class Collection(Validity):
25+
id: str
2426
title: str
2527
description: str
2628
license: str
2729
items: Dict[str, "Item"]
2830
providers: List[pystac.Provider]
31+
schema: str
2932
stac_extensions: set
3033

3134
def __init__(self, title: str):
3235
super().__init__()
36+
# FIXME: Do we want to generate this id like this?
37+
self.id = str(ulid.ULID())
3338
self.title = title
3439
self.items = {}
40+
self.schema = DefaultSchemaUriMap().get_object_schema_uri(pystac.STACObjectType.COLLECTION, pystac.get_stac_version())
3541
self.stac_extensions = set([])
3642

3743
def add_item(self, item: Item):
@@ -95,7 +101,7 @@ def delete_temp_dir(self):
95101

96102
def create_stac(self) -> pystac.Collection:
97103
stac = pystac.Collection(
98-
id=str(ulid.ULID()),
104+
id=self.id,
99105
description=self.description,
100106
extent=pystac.Extent(
101107
pystac.SpatialExtent(bboxes=self.get_bounding_boxes()),

topo_processor/stac/item_factory.py

Lines changed: 2 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
1-
from typing import Dict
2-
31
from linz_logger import get_log
42

53
from topo_processor.data.data_transformers import data_transformer_repo
64
from topo_processor.file_system.get_fs import get_fs
75
from topo_processor.file_system.get_path_with_protocol import get_path_with_protocol
8-
from topo_processor.metadata.metadata_loaders import metadata_loader_imagery_historic, metadata_loader_repo
9-
from topo_processor.metadata.metadata_validators import metadata_validator_repo, metadata_validator_stac
6+
from topo_processor.metadata.metadata_loaders import metadata_loader_repo
7+
from topo_processor.metadata.metadata_validators import metadata_validator_repo
108
from topo_processor.stac.store import get_asset, item_store
119
from topo_processor.util import time_in_ms
1210

@@ -19,42 +17,6 @@ def process_directory(source_dir: str) -> None:
1917
get_log().debug("Items Created", source_dir=source_dir, duration=time_in_ms() - start_time)
2018

2119

22-
def process_metadata(metadata_file: str) -> None:
23-
start_time = time_in_ms()
24-
errors_per_item: Dict[str, Dict[str, list]] = {}
25-
errors_per_type: Dict[str, Dict[str, int]] = {}
26-
total_items_processed = 0
27-
28-
# Load metadata from metadata csv file
29-
metadata_loader_imagery_historic.load_all_metadata(metadata_file)
30-
get_log().debug("Metadata Loaded", metadata_file=metadata_file, duration=time_in_ms() - start_time)
31-
32-
# Validate item against schema
33-
for item in item_store.values():
34-
if item.is_valid():
35-
errors_per_item[item.id] = metadata_validator_stac.validate_metadata_with_report(item)
36-
total_items_processed = total_items_processed + 1
37-
38-
# Build errors report
39-
for errors_item in errors_per_item.values():
40-
for schema_uri in errors_item:
41-
if schema_uri not in errors_per_type:
42-
errors_per_type[schema_uri] = {}
43-
for error in errors_item[schema_uri]:
44-
if error in errors_per_type[schema_uri]:
45-
errors_per_type[schema_uri][error] = errors_per_type[schema_uri][error] + 1
46-
else:
47-
errors_per_type[schema_uri][error] = 1
48-
49-
get_log().info(
50-
"Metadata Validated",
51-
metadata_file=metadata_file,
52-
nbItemsProcessed=total_items_processed,
53-
duration=time_in_ms() - start_time,
54-
errors=errors_per_type,
55-
)
56-
57-
5820
def _create_assets(source_dir: str) -> None:
5921
fs = get_fs(source_dir)
6022
for (path, _, files) in fs.walk(source_dir):
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import os
2+
from logging import error
3+
4+
import pytest
5+
6+
from topo_processor.stac.validate_report import ValidateReport
7+
8+
9+
def test_increment_error():
10+
""""""
11+
error_report: ValidateReport = ValidateReport()
12+
error_report.increment_error("schema_a", "error_1")
13+
assert error_report.report_per_error_type["schema_a"]["error_1"] == 1
14+
error_report.increment_error("schema_a", "error_1")
15+
assert error_report.report_per_error_type["schema_a"]["error_1"] == 2
16+
error_report.increment_error("schema_b", "error_1")
17+
assert error_report.report_per_error_type["schema_b"]["error_1"] == 1
18+
error_report.increment_error("schema_a", "error_2")
19+
assert error_report.report_per_error_type["schema_a"]["error_2"] == 1

0 commit comments

Comments
 (0)