diff --git a/rocrate_validator/cli/commands/validate.py b/rocrate_validator/cli/commands/validate.py index 87697745..be058b0b 100644 --- a/rocrate_validator/cli/commands/validate.py +++ b/rocrate_validator/cli/commands/validate.py @@ -135,6 +135,13 @@ def get_single_char(console: Optional[Console] = None, end: str = "\n", @cli.command("validate") @click.argument("rocrate-uri", callback=validate_uri, default=".") +@click.option( + '-rr', + '--relative-root-path', + help="Use root-relative paths for all file references in the RO-Crate", + default=None, + show_default=True +) @click.option( '-ff', '--fail-fast', @@ -269,6 +276,7 @@ def validate(ctx, requirement_severity_only: bool = False, skip_checks: list[str] = None, rocrate_uri: Path = ".", + relative_root_path: Optional[Path] = None, fail_fast: bool = False, no_paging: bool = False, verbose: bool = False, @@ -330,6 +338,7 @@ def validate(ctx, "requirement_severity_only": requirement_severity_only, "enable_profile_inheritance": not disable_profile_inheritance, "rocrate_uri": rocrate_uri, + "rocrate_relative_root_path": relative_root_path, "abort_on_first": fail_fast, "skip_checks": skip_checks_list } @@ -838,7 +847,7 @@ def show_validation_details(self, pager: Pager, enable_pager: bool = True): Padding("\n[bold]The following requirements have not meet: [/bold]", (0, 2)), style="white") for requirement in sorted(result.failed_requirements, key=lambda x: x.identifier): console.print( - Align(f"\n[profile: [magenta bold]{requirement.profile.name }[/magenta bold]]", align="right") + Align(f"\n[profile: [magenta bold]{requirement.profile.name}[/magenta bold]]", align="right") ) console.print( Padding( diff --git a/rocrate_validator/models.py b/rocrate_validator/models.py index 3e02080e..afae370f 100644 --- a/rocrate_validator/models.py +++ b/rocrate_validator/models.py @@ -1573,10 +1573,10 @@ def get_failed_checks_by_requirement_and_severity( and check.severity == severity] def __str__(self) -> str: - return f"Validation result: passed={len(self.failed_checks)==0}, {len(self._issues)} issues" + return f"Validation result: passed={len(self.failed_checks) == 0}, {len(self._issues)} issues" def __repr__(self): - return f"ValidationResult(passed={len(self.failed_checks)==0},issues={self._issues})" + return f"ValidationResult(passed={len(self.failed_checks) == 0},issues={self._issues})" def __eq__(self, other: object) -> bool: if not isinstance(other, ValidationResult): @@ -1641,6 +1641,8 @@ class ValidationSettings: """ #: The URI of the RO-Crate rocrate_uri: URI + #: The relative root path of the RO-Crate + rocrate_relative_root_path: Optional[Path] = None # Profile settings #: The path to the profiles profiles_path: Path = DEFAULT_PROFILES_PATH @@ -1824,7 +1826,7 @@ def __str__(self) -> str: def __repr__(self) -> str: return f"RequirementCheckValidationEvent(event_type={self.event_type}, " \ - f"requirement_check={self.requirement_check})" + f"requirement_check={self.requirement_check})" def __eq__(self, other: object) -> bool: if not isinstance(other, RequirementCheckValidationEvent): @@ -1993,7 +1995,8 @@ def __init__(self, validator: Validator, settings: ValidationSettings): self._properties = {} # initialize the ROCrate object - self._rocrate = ROCrate.new_instance(settings.rocrate_uri) + self._rocrate = ROCrate.new_instance(settings.rocrate_uri, + relative_root_path=settings.rocrate_relative_root_path) assert isinstance(self._rocrate, ROCrate), "Invalid RO-Crate instance" @property diff --git a/rocrate_validator/rocrate.py b/rocrate_validator/rocrate.py index ba05057b..88c98347 100644 --- a/rocrate_validator/rocrate.py +++ b/rocrate_validator/rocrate.py @@ -16,6 +16,7 @@ import io import json +import re import struct import zipfile from abc import ABC, abstractmethod @@ -47,6 +48,12 @@ def id(self) -> str: def type(self) -> Union[str, list[str]]: return self._raw_data.get('@type') + def is_dataset(self) -> bool: + return self.has_type('Dataset') + + def is_file(self) -> bool: + return self.has_type('File') + @property def name(self) -> str: return self._raw_data.get('name') @@ -64,14 +71,23 @@ def is_remote(self) -> bool: @classmethod def get_id_as_path(cls, entity_id: str, ro_crate: Optional[ROCrate] = None) -> Path: - return cls.get_path_from_identifier(entity_id, ro_crate.uri.as_path() if ro_crate else None) + result = cls.get_path_from_identifier( + entity_id, + ro_crate.uri.as_path() + if ro_crate and ro_crate.uri.is_local_resource() + else None, + ) + return result @staticmethod - def get_path_from_identifier(identifier: str, rocrate_path: Optional[Union[str, Path]] = None) -> Path: + def get_path_from_identifier( + identifier: str, + rocrate_path: Optional[Union[str, Path]] = None, + decode: bool = None, + ) -> Path: """ Get the path from an identifier. - :param identifier: the identifier of the entity :type identifier: str @@ -82,9 +98,10 @@ def get_path_from_identifier(identifier: str, rocrate_path: Optional[Union[str, :rtype: Path """ + def __define_path__(path: str, decode: bool = False) -> Path: # ensure the path is a string and remove the file:// prefix - path = str(path).replace('file://', '') + path = str(path).replace("file://", "") # Decode the path if required if decode: path = unquote(path) @@ -97,21 +114,21 @@ def __define_path__(path: str, decode: bool = False) -> Path: # set the base path base_path = rocrate_path if base_path is None: - base_path = Path('./') + base_path = Path("./") elif not isinstance(base_path, Path): base_path = Path(base_path) # Check if the path if the root of the RO-Crate - if path == Path('./'): + if path == Path("./"): return base_path # if the path is relative, try to resolve it return base_path / path.relative_to(base_path) except ValueError: # if the path cannot be resolved, return the absolute path return base_path / path + # Define the path based on the identifier - path = __define_path__(identifier) - if not path.exists(): - path = __define_path__(identifier, decode=True) + path = __define_path__(identifier, decode=decode) + logger.debug("Defined path '%s' from identifier '%s'", path, identifier) return path @property @@ -123,6 +140,9 @@ def get_id_as_uri(cls, entity_id: str, ro_crate: ROCrate) -> URI: assert entity_id, "Entity ID cannot be None" if entity_id.startswith("http"): return URI(entity_id) + if ro_crate.uri.is_remote_resource(): + if entity_id.startswith("./"): + return URI(f"{ro_crate.uri}/{entity_id[2:]}") return URI(cls.get_id_as_path(entity_id, ro_crate)) @property @@ -136,11 +156,16 @@ def has_relative_path(self) -> bool: return not self.has_absolute_path() def has_local_identifier(self) -> bool: - has_local_id = self.id.startswith('#') or \ - f"{self.ro_crate.uri}/#" in self.id or \ - f"file://{self.ro_crate.uri}/#" in self.id - logger.debug("Identifier '%s' is %s a local identifier", self.id, - "" if has_local_id else " not") + has_local_id = ( + self.id.startswith("#") + or f"{self.ro_crate.uri}/#" in self.id + or f"file://{self.ro_crate.uri}/#" in self.id + ) + logger.debug( + "Identifier '%s' is %s a local identifier", + self.id, + "" if has_local_id else " not", + ) return has_local_id def has_type(self, entity_type: str) -> bool: @@ -159,8 +184,8 @@ def has_types(self, entity_types: list[str], all_types: bool = False) -> bool: return any([t in e_types for t in entity_types]) def __process_property__(self, name: str, data: object) -> object: - if isinstance(data, dict) and '@id' in data: - entity = self.metadata.get_entity(data['@id']) + if isinstance(data, dict) and "@id" in data: + entity = self.metadata.get_entity(data["@id"]) if entity is None: return ROCrateEntity(self, data) return entity @@ -185,34 +210,54 @@ def is_available(self) -> bool: try: # check if the entity points to an external file if self.id.startswith("http"): - return ROCrate.get_external_file_size(self.id) > 0 + logger.debug("Checking the availability of a remote entity") + return self.ro_crate.get_external_file_size(self.id) > 0 # check if the entity is part of the local RO-Crate if self.ro_crate.uri.is_local_resource(): # check if the file exists in the local file system if isinstance(self.ro_crate, ROCrateLocalFolder): - logger.debug("Checking the availability of a local entity in a local folder") - return self.ro_crate.has_file(self.id_as_path) \ - or self.ro_crate.has_directory(self.id_as_path) + logger.debug( + "Checking the availability of a local entity in a local folder" + ) + return self.ro_crate.has_file( + self.id_as_path + ) or self.ro_crate.has_directory(self.id_as_path) # check if the file exists in the local zip file if isinstance(self.ro_crate, ROCrateLocalZip): - logger.debug("Checking the availability of a local entity in a local zip file") + logger.debug( + "Checking the availability of a local entity in a local zip file" + ) # Skip the check for the root of a ZIP archive if self.id == "./": - logger.debug("Skipping the check for the presence of the Data Entity '%s' within the RO-Crate " - "as it is the root of a ZIP archive", self.id) + logger.debug( + "Skipping the check for the presence of the Data Entity '%s' within the RO-Crate " + "as it is the root of a ZIP archive", + self.id, + ) return True - return self.ro_crate.get_entry(str(self.id)) is not None + return self.ro_crate.has_directory( + unquote(str(self.id)) + ) or self.ro_crate.has_file(unquote(str(self.id))) # check if the entity is part of the remote RO-Crate + logger.debug( + "Checking the availability of a remote entity in a remote RO-Crate" + ) if self.ro_crate.uri.is_remote_resource(): - return self.ro_crate.get_file_size(Path(self.id)) > 0 + if self.id == "./": + return self.ro_crate.get_file_size(Path(self.id_as_uri())) > 0 + return self.ro_crate.has_directory( + unquote(str(self.id)) + ) or self.ro_crate.has_file(unquote(str(self.id))) except Exception as e: if logger.isEnabledFor(logging.DEBUG): logger.exception(e) return False - raise ROCrateInvalidURIError(uri=self.id, message="Could not determine the availability of the entity") + raise ROCrateInvalidURIError( + uri=self.id, message="Could not determine the availability of the entity" + ) def get_size(self) -> int: try: @@ -235,8 +280,7 @@ def __eq__(self, other: ROCrateEntity) -> bool: class ROCrateMetadata: - - METADATA_FILE_DESCRIPTOR = 'ro-crate-metadata.json' + METADATA_FILE_DESCRIPTOR = "ro-crate-metadata.json" def __init__(self, ro_crate: ROCrate) -> None: self._ro_crate = ro_crate @@ -264,7 +308,7 @@ def get_file_descriptor_entity(self) -> ROCrateEntity: def get_root_data_entity(self) -> ROCrateEntity: metadata_file_descriptor = self.get_file_descriptor_entity() - main_entity = metadata_file_descriptor.get_property('about') + main_entity = metadata_file_descriptor.get_property("about") if not main_entity: raise ValueError("no main entity in metadata file descriptor") return main_entity @@ -272,7 +316,7 @@ def get_root_data_entity(self) -> ROCrateEntity: def get_root_data_entity_conforms_to(self) -> Optional[list[str]]: try: root_data_entity = self.get_root_data_entity() - result = root_data_entity.get_property('conformsTo', []) + result = root_data_entity.get_property("conformsTo", []) if result is None: return None if not isinstance(result, list): @@ -285,24 +329,26 @@ def get_root_data_entity_conforms_to(self) -> Optional[list[str]]: def get_main_workflow(self) -> ROCrateEntity: root_data_entity = self.get_root_data_entity() - main_workflow = root_data_entity.get_property('mainEntity') + main_workflow = root_data_entity.get_property("mainEntity") if not main_workflow: raise ValueError("no main workflow in metadata file descriptor") return main_workflow def get_entity(self, entity_id: str) -> ROCrateEntity: - for entity in self.as_dict().get('@graph', []): - if entity.get('@id') == entity_id: + for entity in self.as_dict().get("@graph", []): + if entity.get("@id") == entity_id: return ROCrateEntity(self, entity) return None def get_entities(self) -> list[ROCrateEntity]: entities = [] - for entity in self.as_dict().get('@graph', []): + for entity in self.as_dict().get("@graph", []): entities.append(ROCrateEntity(self, entity)) return entities - def get_entities_by_type(self, entity_type: Union[str, list[str]]) -> list[ROCrateEntity]: + def get_entities_by_type( + self, entity_type: Union[str, list[str]] + ) -> list[ROCrateEntity]: entities = [] for e in self.get_entities(): if e.has_types(entity_type): @@ -310,21 +356,26 @@ def get_entities_by_type(self, entity_type: Union[str, list[str]]) -> list[ROCra return entities def get_dataset_entities(self) -> list[ROCrateEntity]: - return self.get_entities_by_type('Dataset') + return self.get_entities_by_type("Dataset") def get_file_entities(self) -> list[ROCrateEntity]: - return self.get_entities_by_type('File') + return self.get_entities_by_type("File") - def get_data_entities(self, exclude_web_data_entities: bool = False) -> list[ROCrateEntity]: + def get_data_entities( + self, exclude_web_data_entities: bool = False + ) -> list[ROCrateEntity]: if not exclude_web_data_entities: - return self.get_entities_by_type(['Dataset', 'File']) - return [e for e in self.get_entities_by_type(['Dataset', 'File']) - if not e.is_remote()] + return self.get_entities_by_type(["Dataset", "File"]) + return [ + e + for e in self.get_entities_by_type(["Dataset", "File"]) + if not e.is_remote() + ] def get_web_data_entities(self) -> list[ROCrateEntity]: entities = [] for entity in self.get_entities(): - if entity.has_type('File') or entity.has_type('Dataset'): + if entity.has_type("File") or entity.has_type("Dataset"): if entity.is_remote(): entities.append(entity) return entities @@ -332,7 +383,7 @@ def get_web_data_entities(self) -> list[ROCrateEntity]: def get_conforms_to(self) -> Optional[list[str]]: try: file_descriptor = self.get_file_descriptor_entity() - result = file_descriptor.get_property('conformsTo', []) + result = file_descriptor.get_property("conformsTo", []) if result is None: return None if not isinstance(result, list): @@ -346,7 +397,8 @@ def get_conforms_to(self) -> Optional[list[str]]: def as_json(self) -> str: if not self._json: self._json = self.ro_crate.get_file_content( - Path(self.METADATA_FILE_DESCRIPTOR), binary_mode=False) + Path(self.METADATA_FILE_DESCRIPTOR), binary_mode=False + ) return self._json def as_dict(self) -> dict: @@ -359,7 +411,7 @@ def as_graph(self, publicID: str = None) -> Graph: if not self._graph: # if the graph is not cached, load it self._graph = Graph(base=publicID or self.ro_crate.uri) - self._graph.parse(data=self.as_json, format='json-ld') + self._graph.parse(data=self.as_json, format="json-ld") return self._graph def __str__(self) -> str: @@ -375,12 +427,37 @@ def __eq__(self, other: ROCrateMetadata) -> bool: class ROCrate(ABC): - """ Base class for representing and interacting with a Research Object Crate (RO-Crate). """ - def __init__(self, uri: Union[str, Path, URI]): + def __new__(cls, uri: Union[str, Path, URI], relative_root_path: Path = None): + """ + Factory method to create the appropriate ROCrate subclass instance. + + :param uri: the URI of the RO-Crate + :type uri: Union[str, Path, URI] + + :param relative_root_path: the relative root path inside the RO-Crate + :type relative_root_path: Path + + :return: an instance of the appropriate ROCrate subclass + :rtype: ROCrate + + :raises ROCrateInvalidURIError: if the URI is invalid + """ + if cls is not ROCrate: + # If called on a subclass, use normal instantiation + instance = super(ROCrate, cls).__new__(cls) + return instance + + # If called on ROCrate directly, use factory logic + instance = cls.new_instance(uri) + if relative_root_path: + instance.relative_root_path = relative_root_path + return instance + + def __init__(self, uri: Union[str, Path, URI], relative_root_path: Path = None) -> None: """ Initialize the RO-Crate. @@ -393,6 +470,9 @@ def __init__(self, uri: Union[str, Path, URI]): # store the path to the crate self._uri = URI(uri) + # the relative root path inside the RO-Crate + self.relative_root_path = relative_root_path + # cache the list of files self._files = None @@ -442,9 +522,80 @@ def list_files(self) -> list[Path]: """ pass + def __get_search_path__(self, path: Path) -> tuple[Path, Path]: + """ + Get the search path relative to the RO-Crate root path. + + :param path: the path to resolve + :type path: Path + :return: the search path + :rtype: Path + """ + assert path, "Path cannot be None" + # Identify the root path of the RO-Crate + root_path = ( + self.uri.as_path() + if self.uri.is_local_resource() and isinstance(path, Path) + else Path("./") + ) + # Extract the search path relative to the root of the RO-Crate root path + try: + search_path = path.relative_to(root_path) + except Exception: + search_path = path + return search_path, root_path + + def __check_search_path__(self, path) -> tuple[Optional[Path], Optional[Path]]: + """ " + Extract the search path if it does not contain the relative root path. + + :param path: the path to resolve + :type path: Path + :return: the search path if valid, None otherwise + :rtype: Path or None + """ + if not self.relative_root_path: + return None, None + + search_path, root_path = self.__get_search_path__(path) + # Check if the path has the substring 'relative_root_path/' in it + has_sub_data_path = re.search(self.relative_root_path, str(search_path)) + if not has_sub_data_path: + return search_path, root_path + return None, None + def __parse_path__(self, path: Path) -> Path: + """ " + Parse the given path to resolve it within the RO-Crate. + :param path: the path to resolve + :type path: Path + :return: the resolved path + :rtype: Path + """ assert path, "Path cannot be None" - return ROCrateEntity.get_path_from_identifier(str(path), rocrate_path=self.uri.as_path()) + + # Resolve the path based on the RO-Crate location + rocrate_path = self.uri.as_path() if self.uri.is_local_resource() else None + rocrate_path_arg = ( + rocrate_path if not str(rocrate_path).endswith(".zip") else None + ) + paths_to_try = [path] + unquoted_path = Path(unquote(str(path))) + if str(path) != str(unquoted_path): + paths_to_try.append(unquoted_path) + for p in paths_to_try: + path_identifier = ROCrateEntity.get_path_from_identifier( + str(p), rocrate_path=rocrate_path_arg, decode=False + ) + search_path, base_path = self.__check_search_path__(path_identifier) + if search_path and base_path: + if self.relative_root_path: + path_identifier = base_path / self.relative_root_path / search_path + else: + path_identifier = base_path / search_path + if path_identifier.exists(): + return path_identifier + return path_identifier def has_descriptor(self) -> bool: """ @@ -453,7 +604,9 @@ def has_descriptor(self) -> bool: :return: `True` if the RO-Crate has a metadata descriptor file, `False` otherwise :rtype: bool """ - return (self.uri.as_path().absolute() / self.metadata.METADATA_FILE_DESCRIPTOR).is_file() + path = self.__parse_path__(Path(self.metadata.METADATA_FILE_DESCRIPTOR)) + logger.debug("Checking for metadata descriptor at path: %s", path) + return self.has_file(path) def has_file(self, path: Path) -> bool: """ @@ -503,7 +656,9 @@ def get_file_size(self, path: Path) -> int: pass @abstractmethod - def get_file_content(self, path: Path, binary_mode: bool = True) -> Union[str, bytes]: + def get_file_content( + self, path: Path, binary_mode: bool = True + ) -> Union[str, bytes]: """ Get the content of a file in the RO-Crate. @@ -519,7 +674,9 @@ def get_file_content(self, path: Path, binary_mode: bool = True) -> Union[str, b pass @staticmethod - def get_external_file_content(uri: str, binary_mode: bool = True) -> Union[str, bytes]: + def get_external_file_content( + uri: str, binary_mode: bool = True + ) -> Union[str, bytes]: """ Get the content of an external file. @@ -551,10 +708,12 @@ def get_external_file_size(uri: str) -> int: """ response = HttpRequester().head(str(uri)) response.raise_for_status() - return int(response.headers.get('Content-Length')) + return int(response.headers.get("Content-Length")) @staticmethod - def new_instance(uri: Union[str, Path, URI]) -> 'ROCrate': + def new_instance( + uri: Union[str, Path, URI], relative_root_path: Optional[Path] = None + ) -> "ROCrate": """ Create a new instance of the RO-Crate based on the URI. @@ -571,23 +730,41 @@ def new_instance(uri: Union[str, Path, URI]) -> 'ROCrate': # create a new instance based on the URI if not isinstance(uri, URI): uri = URI(uri) + # check if the URI is a BagIt-wrapped crate + is_bagit_crate = BagitROCrate.is_bagit_wrapping_crate(uri) + # check if the URI is a local directory if uri.is_local_directory(): - return ROCrateLocalFolder(uri) + return ( + ROCrateBagitLocalFolder(uri, relative_root_path=relative_root_path) + if is_bagit_crate + else ROCrateLocalFolder(uri, relative_root_path=relative_root_path) + ) # check if the URI is a local zip file if uri.is_local_file(): - return ROCrateLocalZip(uri) + return ( + ROCrateBagitLocalZip(uri, relative_root_path=relative_root_path) + if is_bagit_crate + else ROCrateLocalZip(uri, relative_root_path=relative_root_path) + ) # check if the URI is a remote zip file if uri.is_remote_resource(): - return ROCrateRemoteZip(uri) + return ( + ROCrateBagitRemoteZip(uri, relative_root_path=relative_root_path) + if is_bagit_crate + else ROCrateRemoteZip(uri, relative_root_path=relative_root_path) + ) # if the URI is not supported, raise an error raise ROCrateInvalidURIError(uri=uri, message="Unsupported RO-Crate URI") class ROCrateLocalFolder(ROCrate): + """ + Class representing an RO-Crate stored in a local folder. + """ - def __init__(self, path: Union[str, Path, URI]): - super().__init__(path) + def __init__(self, path: Union[str, Path, URI], relative_root_path: Path = None): + super().__init__(path, relative_root_path=relative_root_path) # cache the list of files self._files = None @@ -604,7 +781,7 @@ def list_files(self) -> list[Path]: if not self._files: self._files = [] base_path = self.uri.as_path() - for file in base_path.rglob('*'): + for file in base_path.rglob("*"): if file.is_file(): self._files.append(base_path / file) return self._files @@ -615,7 +792,9 @@ def get_file_size(self, path: Path) -> int: raise FileNotFoundError(f"File not found: {path}") return path.stat().st_size - def get_file_content(self, path: Path, binary_mode: bool = True) -> Union[str, bytes]: + def get_file_content( + self, path: Path, binary_mode: bool = True + ) -> Union[str, bytes]: path = self.__parse_path__(path) if not self.has_file(path): raise FileNotFoundError(f"File not found: {path}") @@ -623,9 +802,13 @@ def get_file_content(self, path: Path, binary_mode: bool = True) -> Union[str, b class ROCrateLocalZip(ROCrate): - - def __init__(self, path: Union[str, Path, URI], init_zip: bool = True): - super().__init__(path) + def __init__( + self, + path: Union[str, Path, URI], + relative_root_path: Path = None, + init_zip: bool = True, + ): + super().__init__(path, relative_root_path=relative_root_path) # initialize the zip reference self._zipref = None @@ -636,9 +819,18 @@ def __init__(self, path: Union[str, Path, URI], init_zip: bool = True): self._files = None def __del__(self): - if self._zipref and self._zipref.fp is not None: - self._zipref.close() - del self._zipref + try: + if self._zipref and self._zipref.fp is not None: + self._zipref.close() + del self._zipref + except Exception as e: + if logger.isEnabledFor(logging.DEBUG): + logger.exception(e) + + def __parse_path__(self, path): + assert path, "Path cannot be None" + # If the RO-Crate is a zip file, the path should be changed + return path @property def size(self) -> int: @@ -650,27 +842,41 @@ def __init_zip_reference__(self): if not self.uri.as_path().is_file(): raise ROCrateInvalidURIError(uri=path) # check if the file is a zip file - if not self.uri.as_path().suffix == '.zip': + if not self.uri.as_path().suffix == ".zip": raise ROCrateInvalidURIError(uri=path) self._zipref = zipfile.ZipFile(path) logger.debug("Initialized zip reference: %s", self._zipref) def __get_file_info__(self, path: Path) -> zipfile.ZipInfo: - return self._zipref.getinfo(str(path)) + try: + return self._zipref.getinfo(str(path)) + except KeyError: + logger.error("File not found in zip: %s", path) + raise FileNotFoundError(f"File not found in zip: {path}") def has_descriptor(self) -> bool: - return ROCrateMetadata.METADATA_FILE_DESCRIPTOR in [str(_.name) for _ in self.list_files()] + """ + Check if the RO-Crate has a metadata descriptor file. + :rtype: bool + """ + path = self.__parse_path__(Path(self.metadata.METADATA_FILE_DESCRIPTOR)) + return str(path) in [str(_) for _ in self.list_files()] def has_file(self, path: Path) -> bool: - if path in self.list_files(): - info = self.__get_file_info__(path) - return not info.is_dir() + path = self.__parse_path__(path) + for p in self.list_files(): + if str(path) == str(p): + info = self.__get_file_info__(path) + return not info.is_dir() return False def has_directory(self, path: Path) -> bool: - if path in self.list_files(): - info = self.__get_file_info__(path) - return info.is_dir() + assert path, "Path cannot be None" + for px in (path, self.__parse_path__(path)): + for p in self._zipref.namelist(): + if f"{str(px)}/" == str(p) or str(px) == str(p): + info = self.__get_file_info__(p) + return info.is_dir() return False def list_files(self) -> list[Path]: @@ -687,24 +893,25 @@ def get_entry(self, path: Path) -> zipfile.ZipInfo: """ Return the ZipInfo object for the specified path. """ - return self.__get_file_info__(path) + return self.__get_file_info__(self.__parse_path__(path)) def get_file_size(self, path: Path) -> int: - return self._zipref.getinfo(str(path)).file_size + return self._zipref.getinfo(str(self.__parse_path__(path))).file_size - def get_file_content(self, path: Path, binary_mode: bool = True) -> Union[str, bytes]: + def get_file_content( + self, path: Path, binary_mode: bool = True + ) -> Union[str, bytes]: + path = self.__parse_path__(path) if not self.has_file(path): raise FileNotFoundError(f"File not found: {path}") data = self._zipref.read(str(path)) - return data if binary_mode else data.decode('utf-8') + return data if binary_mode else data.decode("utf-8") class ROCrateRemoteZip(ROCrateLocalZip): - def __init__(self, path: Union[str, Path, URI]): - super().__init__(path, init_zip=False) - - logger.debug("Size: %s", self.size) + def __init__(self, path: Union[str, Path, URI], relative_root_path: Path = None): + super().__init__(path, relative_root_path=relative_root_path, init_zip=False) # # initialize the zip reference self.__init_zip_reference__() @@ -764,3 +971,140 @@ def __parse_eocd__(data): central_directory_size = eocd[5] central_directory_offset = eocd[6] return central_directory_offset, central_directory_size + + +class BagitROCrate(ROCrate, ABC): + + def __init__(self, uri, relative_root_path=None): + super().__init__(uri, relative_root_path) + + # check if the path is a BagIt-wrapped crate + assert self.is_bagit_wrapping_crate(uri), "Not a BagIt-wrapped RO-Crate" + + @staticmethod + def is_bagit_wrapping_crate(uri: Union[str, Path, URI]) -> bool: + """ + Check if the RO-Crate is a BagIt-wrapped crate. + + :param uri: the URI of the RO-Crate + :type uri: Union[str, Path, URI] + + :return: `True` if the RO-Crate is a BagIt-wrapped crate, `False` otherwise + :rtype: bool + """ + if not isinstance(uri, URI): + uri = URI(uri) + + try: + # Check for local directory + if uri.is_local_directory(): + base_path = uri.as_path() + return (base_path / 'bagit.txt').is_file() and \ + (base_path / 'data' / 'ro-crate-metadata.json').is_file() + + # Check for local zip file + elif uri.is_local_file(): + path = uri.as_path() + if path.suffix == '.zip': + with zipfile.ZipFile(path, 'r') as zf: + namelist = zf.namelist() + return 'bagit.txt' in namelist and \ + 'data/ro-crate-metadata.json' in namelist + + # Check for remote zip file + elif uri.is_remote_resource(): + # For remote resources, we need to check if both files exist + # We'll use HTTP HEAD requests to check without downloading + base_url = str(uri).rstrip('/') + + if not base_url.endswith('.zip'): + # Check for bagit.txt + bagit_response = HttpRequester().head(f"{base_url}/bagit.txt") + if bagit_response.status_code != 200: + return False + + # Check for data/ro-crate-metadata.json + metadata_response = HttpRequester().head(f"{base_url}/data/ro-crate-metadata.json") + return metadata_response.status_code == 200 + + else: + # If it's a remote zip file, we need to download it partially + # Temporarily create instance to check + temp_crate = ROCrateRemoteZip(uri) + logger.debug("Initializing ROCrateRemoteZip for URI: %s", uri) + # ROCrate.__init__(temp_crate, uri) + # temp_crate._ROCrateRemoteZip__init_zip_reference__() + has_bagit_txt = temp_crate.has_file(Path('bagit.txt')) + logger.debug("Presence of 'bagit.txt': %s", has_bagit_txt) + has_ro_crate_metadata = temp_crate.has_file(Path('data/ro-crate-metadata.json')) + logger.debug("Presence of 'data/ro-crate-metadata.json': %s", + has_ro_crate_metadata) + result = has_bagit_txt and has_ro_crate_metadata + del temp_crate + return result + + except Exception as e: + if logger.isEnabledFor(logging.DEBUG): + logger.exception(e) + return False + + return False + + def __check_search_path__(self, path): + """ + Check if the search path is valid for a BagIt-wrapped RO-Crate, + i.e., if it points to the 'data/' directory. + + :param path: the path to resolve + :type path: Path + :return: the search path if valid, None otherwise + :rtype: Path or None + """ + search_path, root_path = super().__get_search_path__(path) + # Check if the path has the substring 'data/' in it + has_sub_data_path = re.search(r'data/', str(search_path)) + logger.debug("The search path '%s' %s the 'data/' sub-path", search_path, + "contains" if has_sub_data_path else "does not contain") + if search_path == "." or not has_sub_data_path: + return search_path, root_path + return None, None + + +class ROCrateBagitLocalFolder(BagitROCrate, ROCrateLocalFolder): + + def __init__(self, uri: Union[str, Path, URI], relative_root_path: Path = None): + # initialize the parent classes + super(ROCrateLocalFolder, self).__init__(uri, relative_root_path=relative_root_path) + # check if the path is a BagIt-wrapped crate + assert self.is_bagit_wrapping_crate(uri), "Not a BagIt-wrapped RO-Crate" + + def __parse_path__(self, path: Path) -> Path: + search_path, root_path = self.__check_search_path__(path) + # if search_path and root_path are set, adjust the path + if search_path and root_path: + path = root_path / Path("data") / search_path + if not path.exists(): + path = Path(unquote(str(path))) + return path + + +class ROCrateBagitLocalZip(BagitROCrate, ROCrateLocalZip): + """ + Class representing an RO-Crate stored in a local BagIt-wrapped zip file. + """ + + def __parse_path__(self, path: Path) -> Path: + # Extract the search path relative to the root of the RO-Crate root path + search_path, _ = super().__check_search_path__(path) + + # if search_path is set, adjust the path + if search_path: + path = Path("data") / search_path + zip_namelist = self._zipref.namelist() + if str(path) not in zip_namelist and f"{path}/" not in zip_namelist: + path = Path(unquote(str(path))) + return path + + +class ROCrateBagitRemoteZip(ROCrateBagitLocalZip, ROCrateRemoteZip): + pass diff --git a/rocrate_validator/utils.py b/rocrate_validator/utils.py index 6d1fbc38..099783fe 100644 --- a/rocrate_validator/utils.py +++ b/rocrate_validator/utils.py @@ -575,20 +575,20 @@ def __hash__(self): return hash(self._uri) -def validate_rocrate_uri(uri: Union[str, URI], silent: bool = False) -> bool: +def validate_rocrate_uri(uri: Union[str, Path, URI], silent: bool = False) -> bool: """ Validate the RO-Crate URI - :param uri: The RO-Crate URI + :param uri: The RO-Crate URI to validate. Can be a string, Path, or URI object :param silent: If True, do not raise an exception :return: True if the URI is valid, False otherwise """ try: assert uri, "The RO-Crate URI is required" - assert isinstance(uri, (str, URI)), "The RO-Crate URI must be a string or URI object" + assert isinstance(uri, (str, Path, URI)), "The RO-Crate URI must be a string, Path, or URI object" try: # parse the value to extract the scheme - uri = URI(uri) if isinstance(uri, str) else uri + uri = URI(str(uri)) if isinstance(uri, str) or isinstance(uri, Path) else uri # check if the URI is a remote resource or local directory or local file if not uri.is_remote_resource() and not uri.is_local_directory() and not uri.is_local_file(): raise errors.ROCrateInvalidURIError(uri) diff --git a/tests/data/crates/valid/bagit.zip b/tests/data/crates/valid/bagit.zip new file mode 100644 index 00000000..7c61f204 Binary files /dev/null and b/tests/data/crates/valid/bagit.zip differ diff --git a/tests/data/crates/valid/rocrate_with_custom_terms/data set2/.gitkeep b/tests/data/crates/valid/bagit/bagig-info.txt similarity index 100% rename from tests/data/crates/valid/rocrate_with_custom_terms/data set2/.gitkeep rename to tests/data/crates/valid/bagit/bagig-info.txt diff --git a/tests/data/crates/valid/bagit/bagit.txt b/tests/data/crates/valid/bagit/bagit.txt new file mode 100644 index 00000000..c6792997 --- /dev/null +++ b/tests/data/crates/valid/bagit/bagit.txt @@ -0,0 +1,2 @@ +BagIt-version: 1.0 +Tag-File-Character-Encoding: UTF-8 diff --git a/tests/data/crates/valid/rocrate_with_custom_terms/data set3/.gitkeep b/tests/data/crates/valid/bagit/data/data set2/.gitkeep similarity index 100% rename from tests/data/crates/valid/rocrate_with_custom_terms/data set3/.gitkeep rename to tests/data/crates/valid/bagit/data/data set2/.gitkeep diff --git a/tests/data/crates/valid/rocrate_with_custom_terms/data%20set/.gitkeep b/tests/data/crates/valid/bagit/data/data set3/.gitkeep similarity index 100% rename from tests/data/crates/valid/rocrate_with_custom_terms/data%20set/.gitkeep rename to tests/data/crates/valid/bagit/data/data set3/.gitkeep diff --git a/tests/data/crates/valid/rocrate_with_data_entities/data set2/.gitkeep b/tests/data/crates/valid/bagit/data/data%20set/.gitkeep similarity index 100% rename from tests/data/crates/valid/rocrate_with_data_entities/data set2/.gitkeep rename to tests/data/crates/valid/bagit/data/data%20set/.gitkeep diff --git a/tests/data/crates/valid/rocrate_with_custom_terms/pics/2017-06-11%2012.56.14.jpg b/tests/data/crates/valid/bagit/data/pics/2017-06-11%2012.56.14.jpg similarity index 100% rename from tests/data/crates/valid/rocrate_with_custom_terms/pics/2017-06-11%2012.56.14.jpg rename to tests/data/crates/valid/bagit/data/pics/2017-06-11%2012.56.14.jpg diff --git a/tests/data/crates/valid/rocrate_with_custom_terms/pics/2018-06-11 12.56.14.jpg b/tests/data/crates/valid/bagit/data/pics/2018-06-11 12.56.14.jpg similarity index 100% rename from tests/data/crates/valid/rocrate_with_custom_terms/pics/2018-06-11 12.56.14.jpg rename to tests/data/crates/valid/bagit/data/pics/2018-06-11 12.56.14.jpg diff --git a/tests/data/crates/valid/rocrate_with_custom_terms/pics/2019-06-11 12.56.14.jpg b/tests/data/crates/valid/bagit/data/pics/2019-06-11 12.56.14.jpg similarity index 100% rename from tests/data/crates/valid/rocrate_with_custom_terms/pics/2019-06-11 12.56.14.jpg rename to tests/data/crates/valid/bagit/data/pics/2019-06-11 12.56.14.jpg diff --git a/tests/data/crates/valid/rocrate_with_custom_terms/pics/sepia_fence.jpg b/tests/data/crates/valid/bagit/data/pics/sepia_fence.jpg similarity index 100% rename from tests/data/crates/valid/rocrate_with_custom_terms/pics/sepia_fence.jpg rename to tests/data/crates/valid/bagit/data/pics/sepia_fence.jpg diff --git a/tests/data/crates/valid/rocrate_with_data_entities/ro-crate-metadata.json b/tests/data/crates/valid/bagit/data/ro-crate-metadata.json similarity index 96% rename from tests/data/crates/valid/rocrate_with_data_entities/ro-crate-metadata.json rename to tests/data/crates/valid/bagit/data/ro-crate-metadata.json index 23e39780..d4aa7a6e 100644 --- a/tests/data/crates/valid/rocrate_with_data_entities/ro-crate-metadata.json +++ b/tests/data/crates/valid/bagit/data/ro-crate-metadata.json @@ -27,7 +27,7 @@ ], "hasPart": [ { - "@id": "pics/2017-06-11%2012.56.14.jpg" + "@id": "pics/2017-06-11%252012.56.14.jpg" }, { "@id": "pics/2018-06-11%2012.56.14.jpg" @@ -36,7 +36,7 @@ "@id": "pics/2019-06-11 12.56.14.jpg" }, { - "@id": "data%20set/" + "@id": "data%2520set/" }, { "@id": "data%20set2/" @@ -100,7 +100,7 @@ "@id": "https://www.imagemagick.org/" }, "object": { - "@id": "pics/2017-06-11%2012.56.14.jpg" + "@id": "pics/2017-06-11%252012.56.14.jpg" }, "result": { "@id": "pics/sepia_fence.jpg" @@ -139,7 +139,7 @@ "encodingFormat": "text/plain" }, { - "@id": "pics/2017-06-11%2012.56.14.jpg", + "@id": "pics/2017-06-11%252012.56.14.jpg", "@type": "File", "description": "Original image", "encodingFormat": "image/jpeg", @@ -170,7 +170,7 @@ "name": "2018-06-11 12.56.14.jpg (input)" }, { - "@id": "data%20set/", + "@id": "data%2520set/", "@type": "Dataset", "name": "Data set", "description": "A dataset", @@ -180,7 +180,7 @@ } }, { - "@id": "#xdata%20set/", + "@id": "#xdata%2520set/", "@type": "Dataset", "name": "Data set with a local ID", "description": "A dataset", diff --git a/tests/data/crates/valid/rocrate_with_data_entities/data set3/.gitkeep b/tests/data/crates/valid/bagit/manifest-sha512.txt similarity index 100% rename from tests/data/crates/valid/rocrate_with_data_entities/data set3/.gitkeep rename to tests/data/crates/valid/bagit/manifest-sha512.txt diff --git a/tests/data/crates/valid/rocrate_with_data_entities/data%20set/.gitkeep b/tests/data/crates/valid/rocrate-relative-root/README.md similarity index 100% rename from tests/data/crates/valid/rocrate_with_data_entities/data%20set/.gitkeep rename to tests/data/crates/valid/rocrate-relative-root/README.md diff --git a/tests/data/crates/valid/rocrate_with_data_entities/pics/2017-06-11%2012.56.14.jpg b/tests/data/crates/valid/rocrate-relative-root/custom-relative-root/data set2/.gitkeep similarity index 100% rename from tests/data/crates/valid/rocrate_with_data_entities/pics/2017-06-11%2012.56.14.jpg rename to tests/data/crates/valid/rocrate-relative-root/custom-relative-root/data set2/.gitkeep diff --git a/tests/data/crates/valid/rocrate_with_data_entities/pics/2018-06-11 12.56.14.jpg b/tests/data/crates/valid/rocrate-relative-root/custom-relative-root/data set3/.gitkeep similarity index 100% rename from tests/data/crates/valid/rocrate_with_data_entities/pics/2018-06-11 12.56.14.jpg rename to tests/data/crates/valid/rocrate-relative-root/custom-relative-root/data set3/.gitkeep diff --git a/tests/data/crates/valid/rocrate_with_data_entities/pics/2019-06-11 12.56.14.jpg b/tests/data/crates/valid/rocrate-relative-root/custom-relative-root/data%20set/.gitkeep similarity index 100% rename from tests/data/crates/valid/rocrate_with_data_entities/pics/2019-06-11 12.56.14.jpg rename to tests/data/crates/valid/rocrate-relative-root/custom-relative-root/data%20set/.gitkeep diff --git a/tests/data/crates/valid/rocrate_with_data_entities/pics/sepia_fence.jpg b/tests/data/crates/valid/rocrate-relative-root/custom-relative-root/pics/2017-06-11%2012.56.14.jpg similarity index 100% rename from tests/data/crates/valid/rocrate_with_data_entities/pics/sepia_fence.jpg rename to tests/data/crates/valid/rocrate-relative-root/custom-relative-root/pics/2017-06-11%2012.56.14.jpg diff --git a/tests/data/crates/valid/rocrate-relative-root/custom-relative-root/pics/2018-06-11 12.56.14.jpg b/tests/data/crates/valid/rocrate-relative-root/custom-relative-root/pics/2018-06-11 12.56.14.jpg new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/crates/valid/rocrate-relative-root/custom-relative-root/pics/2019-06-11 12.56.14.jpg b/tests/data/crates/valid/rocrate-relative-root/custom-relative-root/pics/2019-06-11 12.56.14.jpg new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/crates/valid/rocrate-relative-root/custom-relative-root/pics/sepia_fence.jpg b/tests/data/crates/valid/rocrate-relative-root/custom-relative-root/pics/sepia_fence.jpg new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/crates/valid/rocrate-relative-root/custom-relative-root/ro-crate-metadata.json b/tests/data/crates/valid/rocrate-relative-root/custom-relative-root/ro-crate-metadata.json new file mode 100644 index 00000000..d4aa7a6e --- /dev/null +++ b/tests/data/crates/valid/rocrate-relative-root/custom-relative-root/ro-crate-metadata.json @@ -0,0 +1,230 @@ +{ + "@context": [ + "https://w3id.org/ro/crate/1.1/context", + "https://w3id.org/ro/terms/workflow-run/context" + ], + "@graph": [ + { + "@id": "ro-crate-metadata.json", + "@type": "CreativeWork", + "conformsTo": { + "@id": "https://w3id.org/ro/crate/1.1" + }, + "about": { + "@id": "./" + } + }, + { + "@id": "./", + "@type": "Dataset", + "name": "My Pictures", + "description": "A collection of my pictures", + "datePublished": "2024-05-17T01:04:52+01:00", + "conformsTo": [ + { + "@id": "https://w3id.org/ro/crate/1.1" + } + ], + "hasPart": [ + { + "@id": "pics/2017-06-11%252012.56.14.jpg" + }, + { + "@id": "pics/2018-06-11%2012.56.14.jpg" + }, + { + "@id": "pics/2019-06-11 12.56.14.jpg" + }, + { + "@id": "data%2520set/" + }, + { + "@id": "data%20set2/" + }, + { + "@id": "data set3/" + }, + { + "@id": "pics/sepia_fence.jpg" + }, + { + "@id": "file:///tmp/test.txt" + } + ], + "isBasedOn": { + "@id": "https://doi.org/10.5281/zenodo.1009240" + }, + "license": { + "@id": "http://spdx.org/licenses/CC0-1.0" + }, + "mentions": { + "@id": "#SepiaConversion_1" + } + }, + { + "@id": "https://w3id.org/ro/wfrun/process/0.5", + "@type": "CreativeWork", + "name": "Process Run Crate", + "version": "0.5" + }, + { + "@id": "https://example.com/otherprofile/0.1", + "@type": "CreativeWork", + "name": "Other Profile", + "version": "0.1" + }, + { + "@id": "https://www.imagemagick.org/", + "@type": "SoftwareApplication", + "url": "https://www.imagemagick.org/", + "name": "ImageMagick", + "softwareVersion": "6.9.7-4", + "softwareRequirements": { + "@id": "https://example.com/foobar/1.0.0/" + } + }, + { + "@id": "https://example.com/foobar/1.0.0/", + "@type": "SoftwareApplication", + "name": "foobar", + "softwareVersion": "1.0.0" + }, + { + "@id": "#SepiaConversion_1", + "@type": "CreateAction", + "name": "Convert dog image to sepia", + "description": "convert -sepia-tone 80% pics/2017-06-11\\ 12.56.14.jpg pics/sepia_fence.jpg", + "startTime": "2024-05-17T01:04:50+01:00", + "endTime": "2024-05-17T01:04:52+01:00", + "instrument": { + "@id": "https://www.imagemagick.org/" + }, + "object": { + "@id": "pics/2017-06-11%252012.56.14.jpg" + }, + "result": { + "@id": "pics/sepia_fence.jpg" + }, + "agent": { + "@id": "https://orcid.org/0000-0001-9842-9718" + }, + "actionStatus": "http://schema.org/FailedActionStatus", + "error": "this is just to test the error property", + "environment": [ + { + "@id": "#height-limit-pv" + }, + { + "@id": "#width-limit-pv" + } + ], + "containerImage": "https://example.com/imagemagick.sif" + }, + { + "@id": "#width-limit-pv", + "@type": "PropertyValue", + "name": "MAGICK_WIDTH_LIMIT", + "value": "4096" + }, + { + "@id": "#height-limit-pv", + "@type": "PropertyValue", + "name": "MAGICK_HEIGHT_LIMIT", + "value": "3072" + }, + { + "@id": "file:///tmp/test.txt", + "@type": "File", + "description": "A test file", + "encodingFormat": "text/plain" + }, + { + "@id": "pics/2017-06-11%252012.56.14.jpg", + "@type": "File", + "description": "Original image", + "encodingFormat": "image/jpeg", + "name": "2017-06-11 12.56.14.jpg (input)", + "author": { + "@id": "https://orcid.org/0000-0002-3545-944X" + } + }, + { + "@id": "pics/2018-06-11%2012.56.14.jpg", + "@type": "File", + "description": "Original image", + "encodingFormat": "image/jpeg", + "name": "2018-06-11 12.56.14.jpg (input)" + }, + { + "@id": "#thisIsNotDataEntity", + "@type": "File", + "description": "A File type that is not a data entity", + "encodingFormat": "text/plain", + "name": "thisIsNotDataEntity.txt" + }, + { + "@id": "pics/2019-06-11 12.56.14.jpg", + "@type": "File", + "description": "Original image", + "encodingFormat": "image/jpeg", + "name": "2018-06-11 12.56.14.jpg (input)" + }, + { + "@id": "data%2520set/", + "@type": "Dataset", + "name": "Data set", + "description": "A dataset", + "datePublished": "2024-05-17T01:04:52+01:00", + "license": { + "@id": "http://spdx.org/licenses/CC0-1.0" + } + }, + { + "@id": "#xdata%2520set/", + "@type": "Dataset", + "name": "Data set with a local ID", + "description": "A dataset", + "datePublished": "2024-05-17T01:04:52+01:00", + "license": { + "@id": "http://spdx.org/licenses/CC0-1.0" + } + }, + { + "@id": "data%20set2/", + "@type": "Dataset", + "name": "Data set 2", + "description": "A dataset", + "datePublished": "2024-05-17T01:04:52+01:00", + "license": { + "@id": "http://spdx.org/licenses/CC0-1.0" + } + }, + { + "@id": "data set3/", + "@type": "Dataset", + "name": "Data set 3", + "description": "A dataset", + "datePublished": "2024-05-17T01:04:52+01:00", + "license": { + "@id": "http://spdx.org/licenses/CC0-1.0" + } + }, + { + "@id": "pics/sepia_fence.jpg", + "@type": "File", + "description": "The converted picture, now sepia-colored", + "encodingFormat": "image/jpeg", + "name": "sepia_fence (output)" + }, + { + "@id": "https://orcid.org/0000-0001-9842-9718", + "@type": "Person", + "name": "Stian Soiland-Reyes" + }, + { + "@id": "https://orcid.org/0000-0002-3545-944X", + "@type": "Person", + "name": "Peter Sefton" + } + ] +} diff --git a/tests/data/crates/valid/rocrate-with-custom-terms/data set2/.gitkeep b/tests/data/crates/valid/rocrate-with-custom-terms/data set2/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/crates/valid/rocrate-with-custom-terms/data set3/.gitkeep b/tests/data/crates/valid/rocrate-with-custom-terms/data set3/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/crates/valid/rocrate-with-custom-terms/data%20set/.gitkeep b/tests/data/crates/valid/rocrate-with-custom-terms/data%20set/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/crates/valid/rocrate-with-custom-terms/pics/2017-06-11%2012.56.14.jpg b/tests/data/crates/valid/rocrate-with-custom-terms/pics/2017-06-11%2012.56.14.jpg new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/crates/valid/rocrate-with-custom-terms/pics/2018-06-11 12.56.14.jpg b/tests/data/crates/valid/rocrate-with-custom-terms/pics/2018-06-11 12.56.14.jpg new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/crates/valid/rocrate-with-custom-terms/pics/2019-06-11 12.56.14.jpg b/tests/data/crates/valid/rocrate-with-custom-terms/pics/2019-06-11 12.56.14.jpg new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/crates/valid/rocrate-with-custom-terms/pics/sepia_fence.jpg b/tests/data/crates/valid/rocrate-with-custom-terms/pics/sepia_fence.jpg new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/crates/valid/rocrate_with_custom_terms/ro-crate-metadata.json b/tests/data/crates/valid/rocrate-with-custom-terms/ro-crate-metadata.json similarity index 96% rename from tests/data/crates/valid/rocrate_with_custom_terms/ro-crate-metadata.json rename to tests/data/crates/valid/rocrate-with-custom-terms/ro-crate-metadata.json index e6243c9b..b201b9e8 100644 --- a/tests/data/crates/valid/rocrate_with_custom_terms/ro-crate-metadata.json +++ b/tests/data/crates/valid/rocrate-with-custom-terms/ro-crate-metadata.json @@ -29,7 +29,7 @@ ], "hasPart": [ { - "@id": "pics/2017-06-11%2012.56.14.jpg" + "@id": "pics/2017-06-11%252012.56.14.jpg" }, { "@id": "pics/2018-06-11%2012.56.14.jpg" @@ -38,7 +38,7 @@ "@id": "pics/2019-06-11 12.56.14.jpg" }, { - "@id": "data%20set/" + "@id": "data%2520set/" }, { "@id": "data%20set2/" @@ -102,7 +102,7 @@ "@id": "https://www.imagemagick.org/" }, "object": { - "@id": "pics/2017-06-11%2012.56.14.jpg" + "@id": "pics/2017-06-11%252012.56.14.jpg" }, "result": { "@id": "pics/sepia_fence.jpg" @@ -141,7 +141,7 @@ "encodingFormat": "text/plain" }, { - "@id": "pics/2017-06-11%2012.56.14.jpg", + "@id": "pics/2017-06-11%252012.56.14.jpg", "@type": "File", "description": "Original image", "encodingFormat": "image/jpeg", @@ -165,7 +165,7 @@ "name": "2018-06-11 12.56.14.jpg (input)" }, { - "@id": "data%20set/", + "@id": "data%2520set/", "@type": "Dataset", "name": "Data set", "description": "A dataset", diff --git a/tests/data/crates/valid/rocrate-with-data-entities/data set2/.gitkeep b/tests/data/crates/valid/rocrate-with-data-entities/data set2/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/crates/valid/rocrate-with-data-entities/data set3/.gitkeep b/tests/data/crates/valid/rocrate-with-data-entities/data set3/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/crates/valid/rocrate-with-data-entities/data%20set/.gitkeep b/tests/data/crates/valid/rocrate-with-data-entities/data%20set/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/crates/valid/rocrate-with-data-entities/pics/2017-06-11%2012.56.14.jpg b/tests/data/crates/valid/rocrate-with-data-entities/pics/2017-06-11%2012.56.14.jpg new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/crates/valid/rocrate-with-data-entities/pics/2018-06-11 12.56.14.jpg b/tests/data/crates/valid/rocrate-with-data-entities/pics/2018-06-11 12.56.14.jpg new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/crates/valid/rocrate-with-data-entities/pics/2019-06-11 12.56.14.jpg b/tests/data/crates/valid/rocrate-with-data-entities/pics/2019-06-11 12.56.14.jpg new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/crates/valid/rocrate-with-data-entities/pics/sepia_fence.jpg b/tests/data/crates/valid/rocrate-with-data-entities/pics/sepia_fence.jpg new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/crates/valid/rocrate-with-data-entities/ro-crate-metadata.json b/tests/data/crates/valid/rocrate-with-data-entities/ro-crate-metadata.json new file mode 100644 index 00000000..d4aa7a6e --- /dev/null +++ b/tests/data/crates/valid/rocrate-with-data-entities/ro-crate-metadata.json @@ -0,0 +1,230 @@ +{ + "@context": [ + "https://w3id.org/ro/crate/1.1/context", + "https://w3id.org/ro/terms/workflow-run/context" + ], + "@graph": [ + { + "@id": "ro-crate-metadata.json", + "@type": "CreativeWork", + "conformsTo": { + "@id": "https://w3id.org/ro/crate/1.1" + }, + "about": { + "@id": "./" + } + }, + { + "@id": "./", + "@type": "Dataset", + "name": "My Pictures", + "description": "A collection of my pictures", + "datePublished": "2024-05-17T01:04:52+01:00", + "conformsTo": [ + { + "@id": "https://w3id.org/ro/crate/1.1" + } + ], + "hasPart": [ + { + "@id": "pics/2017-06-11%252012.56.14.jpg" + }, + { + "@id": "pics/2018-06-11%2012.56.14.jpg" + }, + { + "@id": "pics/2019-06-11 12.56.14.jpg" + }, + { + "@id": "data%2520set/" + }, + { + "@id": "data%20set2/" + }, + { + "@id": "data set3/" + }, + { + "@id": "pics/sepia_fence.jpg" + }, + { + "@id": "file:///tmp/test.txt" + } + ], + "isBasedOn": { + "@id": "https://doi.org/10.5281/zenodo.1009240" + }, + "license": { + "@id": "http://spdx.org/licenses/CC0-1.0" + }, + "mentions": { + "@id": "#SepiaConversion_1" + } + }, + { + "@id": "https://w3id.org/ro/wfrun/process/0.5", + "@type": "CreativeWork", + "name": "Process Run Crate", + "version": "0.5" + }, + { + "@id": "https://example.com/otherprofile/0.1", + "@type": "CreativeWork", + "name": "Other Profile", + "version": "0.1" + }, + { + "@id": "https://www.imagemagick.org/", + "@type": "SoftwareApplication", + "url": "https://www.imagemagick.org/", + "name": "ImageMagick", + "softwareVersion": "6.9.7-4", + "softwareRequirements": { + "@id": "https://example.com/foobar/1.0.0/" + } + }, + { + "@id": "https://example.com/foobar/1.0.0/", + "@type": "SoftwareApplication", + "name": "foobar", + "softwareVersion": "1.0.0" + }, + { + "@id": "#SepiaConversion_1", + "@type": "CreateAction", + "name": "Convert dog image to sepia", + "description": "convert -sepia-tone 80% pics/2017-06-11\\ 12.56.14.jpg pics/sepia_fence.jpg", + "startTime": "2024-05-17T01:04:50+01:00", + "endTime": "2024-05-17T01:04:52+01:00", + "instrument": { + "@id": "https://www.imagemagick.org/" + }, + "object": { + "@id": "pics/2017-06-11%252012.56.14.jpg" + }, + "result": { + "@id": "pics/sepia_fence.jpg" + }, + "agent": { + "@id": "https://orcid.org/0000-0001-9842-9718" + }, + "actionStatus": "http://schema.org/FailedActionStatus", + "error": "this is just to test the error property", + "environment": [ + { + "@id": "#height-limit-pv" + }, + { + "@id": "#width-limit-pv" + } + ], + "containerImage": "https://example.com/imagemagick.sif" + }, + { + "@id": "#width-limit-pv", + "@type": "PropertyValue", + "name": "MAGICK_WIDTH_LIMIT", + "value": "4096" + }, + { + "@id": "#height-limit-pv", + "@type": "PropertyValue", + "name": "MAGICK_HEIGHT_LIMIT", + "value": "3072" + }, + { + "@id": "file:///tmp/test.txt", + "@type": "File", + "description": "A test file", + "encodingFormat": "text/plain" + }, + { + "@id": "pics/2017-06-11%252012.56.14.jpg", + "@type": "File", + "description": "Original image", + "encodingFormat": "image/jpeg", + "name": "2017-06-11 12.56.14.jpg (input)", + "author": { + "@id": "https://orcid.org/0000-0002-3545-944X" + } + }, + { + "@id": "pics/2018-06-11%2012.56.14.jpg", + "@type": "File", + "description": "Original image", + "encodingFormat": "image/jpeg", + "name": "2018-06-11 12.56.14.jpg (input)" + }, + { + "@id": "#thisIsNotDataEntity", + "@type": "File", + "description": "A File type that is not a data entity", + "encodingFormat": "text/plain", + "name": "thisIsNotDataEntity.txt" + }, + { + "@id": "pics/2019-06-11 12.56.14.jpg", + "@type": "File", + "description": "Original image", + "encodingFormat": "image/jpeg", + "name": "2018-06-11 12.56.14.jpg (input)" + }, + { + "@id": "data%2520set/", + "@type": "Dataset", + "name": "Data set", + "description": "A dataset", + "datePublished": "2024-05-17T01:04:52+01:00", + "license": { + "@id": "http://spdx.org/licenses/CC0-1.0" + } + }, + { + "@id": "#xdata%2520set/", + "@type": "Dataset", + "name": "Data set with a local ID", + "description": "A dataset", + "datePublished": "2024-05-17T01:04:52+01:00", + "license": { + "@id": "http://spdx.org/licenses/CC0-1.0" + } + }, + { + "@id": "data%20set2/", + "@type": "Dataset", + "name": "Data set 2", + "description": "A dataset", + "datePublished": "2024-05-17T01:04:52+01:00", + "license": { + "@id": "http://spdx.org/licenses/CC0-1.0" + } + }, + { + "@id": "data set3/", + "@type": "Dataset", + "name": "Data set 3", + "description": "A dataset", + "datePublished": "2024-05-17T01:04:52+01:00", + "license": { + "@id": "http://spdx.org/licenses/CC0-1.0" + } + }, + { + "@id": "pics/sepia_fence.jpg", + "@type": "File", + "description": "The converted picture, now sepia-colored", + "encodingFormat": "image/jpeg", + "name": "sepia_fence (output)" + }, + { + "@id": "https://orcid.org/0000-0001-9842-9718", + "@type": "Person", + "name": "Stian Soiland-Reyes" + }, + { + "@id": "https://orcid.org/0000-0002-3545-944X", + "@type": "Person", + "name": "Peter Sefton" + } + ] +} diff --git a/tests/integration/profiles/ro-crate/test_valid_ro-crate.py b/tests/integration/profiles/ro-crate/test_valid_ro-crate.py index ffb3214c..4cd61cd3 100644 --- a/tests/integration/profiles/ro-crate/test_valid_ro-crate.py +++ b/tests/integration/profiles/ro-crate/test_valid_ro-crate.py @@ -56,3 +56,49 @@ def test_valid_roc_required_with_value_objects(): Severity.REQUIRED, True ) + + +def test_valid_roc_with_relative_root_required(): + """Test a valid RO-Crate.""" + do_entity_test( + ValidROC().rocrate_with_relative_root, + Severity.REQUIRED, + True, + rocrate_relative_root_path="custom-relative-root/" + ) + + +def test_valid_roc_remote_required(): + """Test a valid RO-Crate.""" + do_entity_test( + ValidROC().sort_and_change_remote, + Severity.REQUIRED, + True + ) + + +def test_valid_roc_bagit_required(): + """Test a valid RO-Crate.""" + do_entity_test( + ValidROC().bagit, + Severity.REQUIRED, + True + ) + + +def test_valid_roc_bagit_zip_required(): + """Test a valid RO-Crate.""" + do_entity_test( + ValidROC().bagit_zip, + Severity.REQUIRED, + True + ) + + +def test_valid_roc_remote_bagit_required(): + """Test a valid RO-Crate.""" + do_entity_test( + ValidROC().bagit_remote_zip, + Severity.REQUIRED, + True + ) diff --git a/tests/ro_crates.py b/tests/ro_crates.py index 484abd4b..d62190dc 100644 --- a/tests/ro_crates.py +++ b/tests/ro_crates.py @@ -33,11 +33,11 @@ class ValidROC: @property def rocrate_with_data_entities(self) -> Path: - return VALID_CRATES_DATA_PATH / "rocrate_with_data_entities" + return VALID_CRATES_DATA_PATH / "rocrate-with-data-entities" @property def rocrate_with_custom_terms(self) -> Path: - return VALID_CRATES_DATA_PATH / "rocrate_with_custom_terms" + return VALID_CRATES_DATA_PATH / "rocrate-with-custom-terms" @property def wrroc_paper(self) -> Path: @@ -51,6 +51,25 @@ def wrroc_paper_long_date(self) -> Path: def rocrate_with_value_objects(self) -> Path: return VALID_CRATES_DATA_PATH / "rocrate-with-value-objects" + @property + def rocrate_with_relative_root(self) -> Path: + return VALID_CRATES_DATA_PATH / "rocrate-relative-root" + + @property + def bagit(self) -> Path: + return VALID_CRATES_DATA_PATH / "bagit" + + @property + def bagit_zip(self) -> Path: + return VALID_CRATES_DATA_PATH / "bagit.zip" + + @property + def bagit_remote_zip(self) -> str: + return ( + "https://github.com/kikkomep/rocrate-validator/raw/refs/heads/" + "feat/configurable-dataroot_issue-100/tests/data/crates/valid/bagit.zip" + ) + @property def workflow_roc(self) -> Path: return VALID_CRATES_DATA_PATH / "workflow-roc" diff --git a/tests/shared.py b/tests/shared.py index 5a66ec63..1f32fa0a 100644 --- a/tests/shared.py +++ b/tests/shared.py @@ -46,7 +46,8 @@ def do_entity_test( abort_on_first: bool = False, profile_identifier: str = DEFAULT_PROFILE_IDENTIFIER, rocrate_entity_patch: Optional[dict] = None, - skip_checks: Optional[list[str]] = () + skip_checks: Optional[list[str]] = (), + rocrate_relative_root_path: Optional[str] = None ): """ Shared function to test a RO-Crate entity @@ -55,7 +56,7 @@ def do_entity_test( failed_requirements = None detected_issues = None - if not isinstance(rocrate_path, Path): + if not isinstance(rocrate_path, Path) and not rocrate_path.startswith("http"): rocrate_path = Path(rocrate_path) temp_rocrate_path = None @@ -98,7 +99,8 @@ def do_entity_test( "requirement_severity": requirement_severity, "abort_on_first": abort_on_first, "profile_identifier": profile_identifier, - "skip_checks": skip_checks + "skip_checks": skip_checks, + "rocrate_relative_root_path": rocrate_relative_root_path })) logger.debug("Expected validation result: %s", expected_validation_result) diff --git a/tests/unit/test_rocrate.py b/tests/unit/test_rocrate.py index 3e9e3234..50f2c577 100644 --- a/tests/unit/test_rocrate.py +++ b/tests/unit/test_rocrate.py @@ -18,7 +18,10 @@ from rocrate_validator import log as logging from rocrate_validator.errors import ROCrateInvalidURIError -from rocrate_validator.rocrate import (ROCrate, ROCrateEntity, +from rocrate_validator.rocrate import (BagitROCrate, ROCrate, + ROCrateBagitLocalFolder, + ROCrateBagitLocalZip, + ROCrateBagitRemoteZip, ROCrateEntity, ROCrateLocalFolder, ROCrateLocalZip, ROCrateMetadata, ROCrateRemoteZip) from tests.ro_crates import InvalidDataEntity, ValidROC @@ -40,12 +43,173 @@ def test_invalid_local_ro_crate(): ROCrateLocalFolder("/tmp/does_not_exist") +def test_is_bagit_rocrate(): + assert BagitROCrate.is_bagit_wrapping_crate(ValidROC().bagit), \ + "Should be a BagIt RO-Crate" + + assert BagitROCrate.is_bagit_wrapping_crate(ValidROC().bagit_zip), \ + "Should be a BagIt Zip RO-Crate" + + assert BagitROCrate.is_bagit_wrapping_crate(ValidROC().bagit_remote_zip), \ + "Should be a BagIt Remote Zip RO-Crate" + + assert not BagitROCrate.is_bagit_wrapping_crate(ValidROC().wrroc_paper), \ + "Should not be a BagIt RO-Crate" + + assert not BagitROCrate.is_bagit_wrapping_crate(ValidROC().sort_and_change_archive), \ + "Should not be a BagIt RO-Crate" + + assert not BagitROCrate.is_bagit_wrapping_crate(ValidROC().sort_and_change_remote), \ + "Should not be a BagIt RO-Crate" + + +def test_abstract_bagit_rocrate_instantiation(): + # Check that the base class BagItROCrate cannot be instantiated directly + with pytest.raises(TypeError, match="Can't instantiate"): + BagitROCrate(ValidROC().bagit) + + +def test_rocrate_factory(): + + logger.debug("Testing wrroc_paper: %s", ValidROC().wrroc_paper) + roc = ROCrate.new_instance(ValidROC().wrroc_paper) + assert isinstance(roc, ROCrateLocalFolder), "Should be a ROCrateLocalFolder" + + roc = ROCrate.new_instance(ValidROC().sort_and_change_archive) + assert isinstance(roc, ROCrateLocalZip), "Should be a ROCrateLocalZip" + + roc = ROCrate.new_instance(ValidROC().sort_and_change_remote) + assert isinstance(roc, ROCrateRemoteZip), "Should be a ROCrateRemoteZip" + + roc = ROCrate.new_instance(ValidROC().bagit) + assert isinstance(roc, BagitROCrate), "Should be a BagItROCrate" + assert isinstance(roc, ROCrateLocalFolder), "Should be a ROCrateLocalFolder" + assert isinstance(roc, ROCrateBagitLocalFolder), "Should be a ROCrateBagitLocalFolder" + + roc = ROCrate.new_instance(ValidROC().bagit_zip) + assert isinstance(roc, BagitROCrate), "Should be a BagItROCrate" + assert isinstance(roc, ROCrateLocalZip), "Should be a ROCrateLocalZip" + assert isinstance(roc, ROCrateBagitLocalZip), "Should be a ROCrateBagitLocalZip" + + roc = ROCrate.new_instance(ValidROC().bagit_remote_zip) + assert isinstance(roc, ROCrateRemoteZip), "Should be a ROCrateRemoteZip" + assert isinstance(roc, BagitROCrate), "Should be a BagItROCrate" + assert isinstance(roc, ROCrateBagitRemoteZip), "Should be a ROCrateBagitRemoteZip" + + +def test_rocrate_constructor(): + roc = ROCrate(ValidROC().wrroc_paper) + assert isinstance(roc, ROCrateLocalFolder), "Should be a ROCrateLocalFolder" + + roc = ROCrate(ValidROC().sort_and_change_archive) + assert isinstance(roc, ROCrateLocalZip), "Should be a ROCrateLocalZip" + + roc = ROCrate(ValidROC().sort_and_change_remote) + assert isinstance(roc, ROCrateRemoteZip), "Should be a ROCrateRemoteZip" + + roc = ROCrate(ValidROC().bagit) + assert isinstance(roc, BagitROCrate), "Should be a BagItROCrate" + assert isinstance(roc, ROCrateLocalFolder), "Should be a ROCrateLocalFolder" + assert isinstance(roc, ROCrateBagitLocalFolder), "Should be a ROCrateBagitLocalFolder" + + roc = ROCrate(ValidROC().bagit_zip) + assert isinstance(roc, BagitROCrate), "Should be a BagItROCrate" + assert isinstance(roc, ROCrateLocalZip), "Should be a ROCrateLocalZip" + assert isinstance(roc, ROCrateBagitLocalZip), "Should be a ROCrateBagitLocalZip" + + +def test_parse_path(): + roc = ROCrate.new_instance(ValidROC().bagit_zip) + assert isinstance(roc, ROCrateBagitLocalZip) + + logger.debug("Testing bagit_zip: %s", ValidROC().bagit_zip) + + # test parse_path for normal file + path = Path("file.txt") + parsed_path = roc.__parse_path__(path) + logger.debug(f"Parsed path: {parsed_path}") + assert parsed_path == Path("data/file.txt"), "Parsed path should be data/file.txt" + + # test parse_path for bagit wrapped file + path = Path("ro-crate-metadata.json") + parsed_path = roc.__parse_path__(path) + logger.debug(f"Parsed path: {parsed_path}") + assert parsed_path == Path("data/ro-crate-metadata.json"), "Parsed path should be data/ro-crate-metadata.json" + + # test parse_path for an explicit data/ path + path = Path("data/file.txt") + parsed_path = roc.__parse_path__(path) + logger.debug(f"Parsed path: {parsed_path}") + assert parsed_path == Path("data/file.txt"), "Parsed path should be data/file.txt" + + +def test_local_folder_with_relative_root(): + # set relative root path + relative_root_path = "data" + # create ROCrateBagitLocalFolder with relative root path + roc = ROCrateLocalFolder(ValidROC().bagit, relative_root_path=relative_root_path) + assert isinstance(roc, ROCrateLocalFolder) + logger.debug("Testing bagit with relative root path: %s", relative_root_path) + + # test parse_path for normal file + path = Path("file.txt") + + search_path, root_path = roc.__get_search_path__(path) + logger.debug(f"Search path: {search_path}") + logger.debug(f"Root path: {root_path}") + assert root_path == roc.uri.as_path(), "Root path should be the ro-crate path" + assert search_path == path, "Search path should be file.txt" + + parsed_path = roc.__parse_path__(path) + logger.debug(f"Parsed path: {parsed_path}") + assert parsed_path == ValidROC().bagit / relative_root_path / path, "Parsed path should be data/file.txt" + + # test has_file + assert roc.has_file("data/ro-crate-metadata.json"), "Should have ro-crate-metadata.json file" + + # test get_file_content + content = roc.get_file_content("data/ro-crate-metadata.json") + assert isinstance(content, bytes), "Content should be bytes" + + +def test_remote_bagit_rocrate(): + + bagit_crate = ValidROC().bagit_remote_zip + roc = ROCrate.new_instance(bagit_crate) + assert isinstance(roc, BagitROCrate), "Should be a BagItROCrate" + assert isinstance(roc, ROCrateRemoteZip), "Should be a ROCrateRemoteZip" + assert isinstance(roc, ROCrateBagitRemoteZip), "Should be a ROCrateBagitRemoteZip" + + # test list files + files = roc.list_files() + logger.debug(f"Files: {files}") + assert len(files) == 16, "Should have 16 files" + + # test is_file + assert roc.has_file(metadata_file_descriptor), "Should be a file" + # test file size + size = roc.get_file_size(metadata_file_descriptor) + assert size == 7321, "Size should be 7321" + + # test has directory + assert roc.has_directory(Path("data")), "Should have data/ directory" + assert roc.has_directory(Path("data/pics/")), "Should have data/pics/ directory" + assert roc.has_directory(Path("data%20set/")), "Should have data%20set/ directory" + assert roc.has_directory(Path("data set3/")), "Should have data set3/ directory" + # test has file + assert roc.has_file("pics/2018-06-11 12.56.14.jpg"), "Should have pics/2018-06-11%2012.56.14.jpg file" + + # test file availability + img_2018 = roc.metadata.get_entity("pics/2018-06-11%2012.56.14.jpg") + assert img_2018 is not None, "Should have pics/2018-06-11%2012.56.14.jpg entity" + logger.debug(f"Image 2018 entity: {img_2018}") + assert img_2018.is_available(), "pics/2018-06-11%2012.56.14.jpg entity should be available" + + def test_valid_local_rocrate(): roc = ROCrateLocalFolder(ValidROC().wrroc_paper) assert isinstance(roc, ROCrateLocalFolder) - # raise Exception("Test not implemented: %s", str(roc.uri)) - # test list files files = roc.list_files() logger.debug(f"Files: {files}") @@ -97,9 +261,48 @@ def test_valid_local_rocrate(): assert root_data_entity.is_available(), "Main entity should be available" +################################ +# ROCrateLocalFolder +def test_valid_local_folder_rocrate_with_relative_root(): + # set relative root path + relative_root_path = "custom-relative-root" + # create ROCrateLocalFolder with relative root path + roc = ROCrateLocalFolder(ValidROC().rocrate_with_relative_root, + relative_root_path=relative_root_path) + assert isinstance(roc, ROCrateLocalFolder) + logger.debug("Testing bagit with relative root path: %s", relative_root_path) + + # inspect ro-crate-metadata.json to confirm correct relative root path + assert roc.has_file("ro-crate-metadata.json"), "Should have ro-crate-metadata.json file" + + metadata_path = roc.get_file_content("ro-crate-metadata.json", binary_mode=False) + logger.debug(f"ro-crate-metadata.json content: {metadata_path}") + + # test has_file + assert roc.has_file("ro-crate-metadata.json"), "Should have ro-crate-metadata.json file" + assert roc.has_file("pics/2017-06-11%252012.56.14.jpg"), \ + "Should have pics/2017-06-11%252012.56.14.jpg file" + + # test get_file_content + content = roc.get_file_content("ro-crate-metadata.json") + assert isinstance(content, bytes), "Content should be bytes" + + # check availability + metadata = roc.metadata + assert isinstance(metadata, ROCrateMetadata), "Metadata should be ROCrateMetadata" + + # check availability + entity = metadata.get_entity("pics/2017-06-11%252012.56.14.jpg") + assert entity is not None, "Entity should be available" + logger.debug(f"Entity: {entity}") + assert entity.is_available(), "Entity should be available" + + ################################ # ROCrateLocalZip ################################ + + def test_valid_zip_rocrate(): roc = ROCrateLocalZip(ValidROC().sort_and_change_archive) assert isinstance(roc, ROCrateLocalZip) @@ -161,6 +364,116 @@ def test_valid_zip_rocrate(): assert main_entity.is_available(), "Main entity should be available" +################################ +# ROCrate Local Bagit Zip +################################ + +def test_paths_valid_bagit_rocrate(): + roc = ROCrate(ValidROC().bagit_zip) + assert isinstance(roc, ROCrateLocalZip) + + # test list files + files = roc.list_files() + logger.debug(f"Files: {files}") + assert len(files) == 16, "Should have 16 files" + + # check file paths + # assert roc.has_file(Path("ro-crate-metadata.json")), "Should have ro-crate-metadata.json file" + # # assert roc.has_file(Path("bagit.txt")), "Should have bagit.txt file" + # # assert roc.has_file(Path("data/ro-crate-metadata.json")), "Should have data/ro-crate-metadata.json file" + # assert roc.has_file(Path("pics/2017-06-11%2012.56.14.jpg") + # ), "Should have data/pics/2017-06-11 12.56.14.jpg file" + + assert roc.has_directory(Path("data")), "Should have data/ directory" + assert roc.has_directory(Path("data/pics/")), "Should have data/pics/ directory" + assert roc.has_directory(Path("data%20set/")), "Should have data%20set/ directory" + + assert roc.has_directory(Path("data set3")), "Should have data set3/ directory" + assert roc.has_directory(Path("data set3/")), "Should have data set3/ directory" + + assert roc.has_file("pics/2018-06-11 12.56.14.jpg"), "Should have pics/2018-06-11%2012.56.14.jpg file" + + dataset3 = roc.metadata.get_entity("data set3/") + assert dataset3 is not None, "Should have data set3/ entity" + logger.debug(f"Dataset3 entity: {dataset3}") + assert dataset3.is_dataset(), "data set3/ entity should be a Dataset" + assert dataset3.is_available(), "data set3/ entity should be available" + + img_2018 = roc.metadata.get_entity("pics/2018-06-11%2012.56.14.jpg") + assert img_2018 is not None, "Should have pics/2018-06-11%2012.56.14.jpg entity" + logger.debug(f"Image 2018 entity: {img_2018}") + assert img_2018.is_available(), "pics/2018-06-11%2012.56.14.jpg entity should be available" + + img_2017 = roc.metadata.get_entity("pics/2017-06-11%252012.56.14.jpg") + assert img_2017 is not None, "Should have pics/2017-06-11%252012.56.14.jpg entity" + logger.debug(f"Image 2017 entity: {img_2017}") + assert img_2017.is_available(), "pics/2017-06-11%252012.56.14.jpg entity should be available" + + +def test_valid_bagit_zip_rocrate(): + roc = ROCrate(ValidROC().bagit_zip) + assert isinstance(roc, ROCrateLocalZip) + + # test list files + files = roc.list_files() + logger.debug(f"Files: {files}") + # assert len(files) == 11, "Should have 11 files" + + # test is_file + assert roc.has_file(metadata_file_descriptor), "Should be a file" + + # test file size + size = roc.get_file_size(metadata_file_descriptor) + assert size == 7321, "Size should be 7321" + + # test crate size + assert roc.size == 4055, "Size should be 4055" + + # test get_file_content binary mode + content = roc.get_file_content(metadata_file_descriptor) + assert isinstance(content, bytes), "Content should be bytes" + + # test get_file_content text mode + content = roc.get_file_content(metadata_file_descriptor, binary_mode=False) + assert isinstance(content, str), "Content should be str" + + # test metadata + metadata = roc.metadata + assert isinstance(metadata, ROCrateMetadata), "Metadata should be ROCrateMetadata" + + # test metadata id + file_descriptor_entity = metadata.get_entity("ro-crate-metadata.json") + logger.debug(f"File descriptor entity: {file_descriptor_entity}") + assert isinstance(file_descriptor_entity, ROCrateEntity), "Entity should be ROCrateEntity" + assert file_descriptor_entity.id == "ro-crate-metadata.json", "Id should be ro-crate-metadata.json" + assert file_descriptor_entity.type == "CreativeWork", "Type should be File" + + # test root data entity + root_data_entity = metadata.get_entity("./") + logger.debug(f"Root data entity: {root_data_entity}") + assert isinstance(root_data_entity, ROCrateEntity), "Entity should be ROCrateEntity" + assert root_data_entity.id == "./", "Id should be ./" + assert root_data_entity.type == "Dataset", "Type should be Dataset" + assert root_data_entity.name == "My Pictures", "Name should be sort_and_change" + + # test subEntity mainEntity + # main_entity = root_data_entity.get_property("mainEntity") + # logger.error(f"Main entity: {main_entity}") + # assert isinstance(main_entity, ROCrateEntity), "Entity should be ROCrateEntity" + # assert main_entity.id == "sort-and-change-case.ga", "Id should be main-entity" + # assert "ComputationalWorkflow" in main_entity.type, "Type should be ComputationalWorkflow" + + # check metadata consistency + # assert main_entity.metadata == metadata, "Metadata should be the same" + # assert main_entity.metadata == roc.metadata, "Metadata should be the same" + + # check availability of 'pics/2017-06-11%2012.56.14.jpg' + # entity = metadata.get_entity("pics/2017-06-11%2012.56.14.jpg") + # assert entity.is_available(), "Entity should be available" + + # assert roc.has_directory("data%20set/"), "Should have data%20set/ directory" + + ################################ # ROCrateRemoteZip ################################ @@ -248,7 +561,8 @@ def test_entity_path_from_identifier(): # Test quoted entity id which does not exist within the ro-crate quoted_entity_id = "pics/2018-06-11%2012.56.14.jpg" - path = ROCrateEntity.get_path_from_identifier(quoted_entity_id, rocrate_path=rocrate_path) + path = ROCrateEntity.get_path_from_identifier( + quoted_entity_id, rocrate_path=rocrate_path, decode=True) logger.debug(f"Quoted Entity Path: {path}") assert str(path) == f"{rocrate_path}/pics/2018-06-11 12.56.14.jpg", \ "Path should be pics/2018-06-11 12.56.14.jpg"