diff --git a/datalad_fuse/fsspec.py b/datalad_fuse/fsspec.py index 4a7f7ce..27581aa 100644 --- a/datalad_fuse/fsspec.py +++ b/datalad_fuse/fsspec.py @@ -15,7 +15,7 @@ import methodtools from .consts import CACHE_SIZE -from .utils import is_annex_dir_or_key +from .utils import AnnexKey, is_annex_dir_or_key lgr = logging.getLogger("datalad.fuse.fsspec") @@ -51,19 +51,17 @@ def close(self) -> None: self.annex._batched.clear() @methodtools.lru_cache(maxsize=CACHE_SIZE) - def get_file_state(self, relpath: str) -> Tuple[FileState, Optional[str]]: + def get_file_state(self, relpath: str) -> Tuple[FileState, Optional[AnnexKey]]: p = self.path / relpath lgr.debug("get_file_state: %s", relpath) def handle_path_under_annex_objects(p: Path): iadok = is_annex_dir_or_key(p) - if iadok is not None and iadok[1] == "key": - assert iadok[0] == str(self.path) - key = filename2key(p.name) + if isinstance(iadok, AnnexKey): if p.exists(): - return (FileState.HAS_CONTENT, key) + return (FileState.HAS_CONTENT, iadok) else: - return (FileState.NO_CONTENT, key) + return (FileState.NO_CONTENT, iadok) else: return (FileState.NOT_ANNEXED, None) @@ -75,7 +73,7 @@ def handle_path_under_annex_objects(p: Path): if not p.is_symlink(): if p.stat().st_size < 1024 and self.annex is not None: if self.annex.is_under_annex(relpath, batch=True): - key = self.annex.get_file_key(relpath, batch=True) + key = AnnexKey.parse(self.annex.get_file_key(relpath, batch=True)) if self.annex.file_has_content(relpath, batch=True): return (FileState.HAS_CONTENT, key) else: @@ -163,7 +161,7 @@ def open( ) if fstate is FileState.NO_CONTENT: lgr.debug("%s: opening via fsspec", relpath) - for url in self.get_urls(key): + for url in self.get_urls(str(key)): try: lgr.debug("%s: Attempting to open via URL %s", relpath, url) return self.fs.open(url, mode, **kwargs) @@ -242,6 +240,12 @@ def open( ) return dsap.open(relpath, mode=mode, encoding=encoding, errors=errors) + def get_file_state( + self, filepath: Union[str, Path] + ) 
-> Tuple[FileState, Optional[AnnexKey]]: + dsap, relpath = self.resolve_dataset(filepath) + return dsap.get_file_state(relpath) + def is_under_annex(self, filepath: Union[str, Path]) -> bool: dsap, relpath = self.resolve_dataset(filepath) fstate, _ = dsap.get_file_state(relpath) @@ -254,11 +258,3 @@ def get_commit_datetime(self, filepath: Union[str, Path]) -> datetime: def is_http_url(s: str) -> bool: return s.lower().startswith(("http://", "https://")) - - -def filename2key(name: str) -> str: - # See `keyFile` and `fileKey` in `Annex/Locations.hs` in the git-annex - # source - return ( - name.replace("%", "/").replace("&c", ":").replace("&s", "%").replace("&a", "&") - ) diff --git a/datalad_fuse/fuse_.py b/datalad_fuse/fuse_.py index 6ce4076..0a37bc2 100644 --- a/datalad_fuse/fuse_.py +++ b/datalad_fuse/fuse_.py @@ -22,7 +22,7 @@ # Make it relatively small since we are aiming for metadata records ATM # Seems of no real good positive net ATM # BLOCK_SIZE = 2**20 # 1M. block size to fetch at a time. -from .utils import is_annex_dir_or_key +from .utils import AnnexDir, AnnexKey, is_annex_dir_or_key lgr = logging.getLogger("datalad.fuse") @@ -126,34 +126,48 @@ def getattr(self, path, fh=None): else: iadok = is_annex_dir_or_key(path) if iadok is not None: - topdir, dir_or_key = iadok - if dir_or_key == "key": - # needs to be open but it is a key. We will let fsspec - # to handle it - pass - elif dir_or_key == "dir": + if isinstance(iadok, AnnexKey): + if iadok.size is not None: + lgr.debug("Got size from key") + r = mkstat( + is_file=True, + size=iadok.size, + timestamp=self._adapter.get_commit_datetime(path), + ) + else: + # needs to be open but it is a key. 
We will let + # fsspec handle it + pass + elif isinstance(iadok, AnnexDir): # just return that one of the top directory # TODO: cache this since would be a frequent operation - r = self._filter_stat(os.stat(topdir)) + r = self._filter_stat(os.stat(iadok.topdir)) else: - raise AssertionError(f"Unexpected dir_or_key: {dir_or_key!r}") + raise AssertionError(f"Unexpected iadok: {iadok!r}") elif self.is_under_git(path): lgr.debug("Path under .git does not exist; raising ENOENT") raise FuseOSError(ENOENT) if r is None: + fsspec_file = None if fh and fh >= self._counter_offset: lgr.debug("File already open") fsspec_file = self._fhdict[fh] to_close = False else: - # TODO: it is expensive to open each file just for `getattr`! - # We should just fabricate stats from the key here or not even - # bother???! - lgr.debug("File not already open") - with self.rwlock: - fsspec_file = self._adapter.open(path) - to_close = True - if fsspec_file: + _, key = self._adapter.get_file_state(path) + if key.size is not None: + lgr.debug("Got size from key") + r = mkstat( + is_file=True, + size=key.size, + timestamp=self._adapter.get_commit_datetime(path), + ) + else: + lgr.debug("File not already open") + with self.rwlock: + fsspec_file = self._adapter.open(path) + to_close = True + if fsspec_file is not None: if isinstance(fsspec_file, io.BufferedIOBase): # full file was already fetched locally lgr.debug("File object is io.BufferedIOBase") @@ -166,12 +180,6 @@ def getattr(self, path, fh=None): if to_close: with self.rwlock: fsspec_file.close() - else: - # TODO: although seems to be logical -- seems to cause logging etc - # lgr.error("ENOENTing %s %s", path, fh) - # raise FuseOSError(ENOENT) - lgr.debug("File failed to open???") - r = {} # we have nothing to say. TODO: proper return/error? 
lgr.debug("Returning %r for %s", r, path) return r @@ -377,16 +385,20 @@ def file_getattr(f, timestamp: datetime): info = f.info() except FileNotFoundError: raise FuseOSError(ENOENT) + return mkstat(info["type"] == "file", info["size"], timestamp) + + +def mkstat(is_file: bool, size: int, timestamp: datetime) -> dict: # TODO Also I get UID.GID funny -- yarik, not yoh # get of the original symlink, so float it up! data = {"st_uid": os.getuid(), "st_gid": os.getgid()} - if info["type"] != "file": + if not is_file: data["st_mode"] = stat.S_IFDIR | 0o755 data["st_size"] = 0 data["st_blksize"] = 0 else: data["st_mode"] = stat.S_IFREG | 0o644 - data["st_size"] = info["size"] + data["st_size"] = size data["st_blksize"] = 5 * 2**20 data["st_nlink"] = 1 data["st_atime"] = timestamp.timestamp() diff --git a/datalad_fuse/tests/test_fuse.py b/datalad_fuse/tests/test_fuse.py index 3580e2e..d491d75 100644 --- a/datalad_fuse/tests/test_fuse.py +++ b/datalad_fuse/tests/test_fuse.py @@ -57,6 +57,7 @@ def test_fuse(tmp_path, transparent, url_dataset): with fusing(ds.path, tmp_path, transparent=transparent) as mount: assert sorted(q.name for q in mount.iterdir()) == dots + sorted(data_files) for fname, blob in data_files.items(): + assert os.path.getsize(mount / fname) == len(blob) assert (mount / fname).read_bytes() == blob diff --git a/datalad_fuse/tests/test_util.py b/datalad_fuse/tests/test_util.py index 7f3e563..4117e8c 100644 --- a/datalad_fuse/tests/test_util.py +++ b/datalad_fuse/tests/test_util.py @@ -1,32 +1,40 @@ -from typing import Optional, Tuple +from __future__ import annotations import pytest -from datalad_fuse.fsspec import filename2key -from datalad_fuse.utils import is_annex_dir_or_key +from datalad_fuse.utils import AnnexDir, AnnexKey, is_annex_dir_or_key SAMPLE_KEY = "MD5E-s1064--8804d3d11f17e33bd912f1f0947afdb9.json" URL_KEY = "URL--http&c%%127.0.0.1&c55485%binary.png" +SAMPLE_ANNEX_KEY = AnnexKey( + backend="MD5E", + size=1064, + 
name="8804d3d11f17e33bd912f1f0947afdb9", + suffix=".json", +) + +URL_ANNEX_KEY = AnnexKey(backend="URL", name="http://127.0.0.1:55485/binary.png") + @pytest.mark.parametrize( "path,expected", [ - (f".git/annex/objects/p0/4v/{SAMPLE_KEY}/{SAMPLE_KEY}", (".", "key")), - (f".git/annex/objects/p2/pX/{URL_KEY}/{URL_KEY}", (".", "key")), - (f".git/annex/objects/p0/4v/{SAMPLE_KEY}/", (".", "dir")), - (f".git/annex/objects/p0/4v/{SAMPLE_KEY}", (".", "dir")), - (".git/annex/objects/p0/4v", (".", "dir")), + (f".git/annex/objects/p0/4v/{SAMPLE_KEY}/{SAMPLE_KEY}", SAMPLE_ANNEX_KEY), + (f".git/annex/objects/p2/pX/{URL_KEY}/{URL_KEY}", URL_ANNEX_KEY), + (f".git/annex/objects/p0/4v/{SAMPLE_KEY}/", AnnexDir(".")), + (f".git/annex/objects/p0/4v/{SAMPLE_KEY}", AnnexDir(".")), + (".git/annex/objects/p0/4v", AnnexDir(".")), ( f"some/project/.git/annex/objects/p0/4v/{SAMPLE_KEY}/{SAMPLE_KEY}", - ("some/project", "key"), + SAMPLE_ANNEX_KEY, ), - ("some/project/.git/annex/objects/p0/4v", ("some/project", "dir")), + ("some/project/.git/annex/objects/p0/4v", AnnexDir("some/project")), ( f"/usr/src/project/.git/annex/objects/p0/4v/{SAMPLE_KEY}/{SAMPLE_KEY}", - ("/usr/src/project", "key"), + SAMPLE_ANNEX_KEY, ), - ("/usr/src/project/.git/annex/objects/p0/4v", ("/usr/src/project", "dir")), + ("/usr/src/project/.git/annex/objects/p0/4v", AnnexDir("/usr/src/project")), ("foo.txt", None), ("foo.git/annex/objects/p0/4v", None), ("some/project/.git/refs/heads", None), @@ -44,28 +52,13 @@ ( "some/project/.git/embedded/sub/.git/annex/objects/p0/4v/" f"{SAMPLE_KEY}/{SAMPLE_KEY}", - ("some/project/.git/embedded/sub", "key"), + SAMPLE_ANNEX_KEY, ), ( "some/project/.git/embedded/sub/.git/annex/objects/p0/4v", - ("some/project/.git/embedded/sub", "dir"), + AnnexDir("some/project/.git/embedded/sub"), ), ], ) -def test_is_annex_dir_or_key(path: str, expected: Optional[Tuple[str, str]]) -> None: +def test_is_annex_dir_or_key(path: str, expected: AnnexDir | AnnexKey | None) -> None: assert 
is_annex_dir_or_key(path) == expected - - -@pytest.mark.parametrize( - "filename,key", - [ - ( - "URL--http&c%%127.0.0.1&c35401%text.txt", - "URL--http://127.0.0.1:35401/text.txt", - ), - ("foo&ac", "foo&c"), - ("foo&a&s", "foo&%"), - ], -) -def test_filename2key(filename: str, key: str) -> None: - assert filename2key(filename) == key diff --git a/datalad_fuse/utils.py b/datalad_fuse/utils.py index feeec64..b7748d2 100644 --- a/datalad_fuse/utils.py +++ b/datalad_fuse/utils.py @@ -1,14 +1,98 @@ +from __future__ import annotations + +from dataclasses import dataclass from functools import lru_cache from pathlib import Path import re -from typing import Optional, Tuple, Union +from typing import Optional from datalad_fuse.consts import CACHE_SIZE +@dataclass +class AnnexKey: + # <https://git-annex.branchable.com/internals/key_format/> + backend: str + name: str + size: Optional[int] = None + mtime: Optional[int] = None + chunk_size: Optional[int] = None + chunk_number: Optional[int] = None + suffix: Optional[str] = None + + def __str__(self) -> str: + s = self.backend + if self.size is not None: + s += f"-s{self.size}" + if self.mtime is not None: + s += f"-m{self.mtime}" + if self.chunk_size is not None: + s += f"-S{self.chunk_size}" + if self.chunk_number is not None: + s += f"-C{self.chunk_number}" + s += f"--{self.name}" + if self.suffix is not None: + s += self.suffix + return s + + @classmethod + def parse(cls, s: str) -> AnnexKey: + m = re.fullmatch( + r"(?P<backend>[A-Z0-9_]{2,14})" + r"(?:-s(?P<size>[0-9]+))?" + r"(?:-m(?P<mtime>[0-9]+))?" + r"(?:-S(?P<chunk_size>[0-9]+)-C(?P<chunk_number>[0-9]+))?" 
+ r"--(?P<name>.+)", + s, + ) + if m: + backend = m["backend"] + size = int(m["size"]) if m["size"] is not None else None + mtime = int(m["mtime"]) if m["mtime"] is not None else None + chunk_size = int(m["chunk_size"]) if m["chunk_size"] is not None else None + chunk_number = ( + int(m["chunk_number"]) if m["chunk_number"] is not None else None + ) + name = m["name"] + if backend.endswith("E"): + name, sep, suffix = name.rpartition(".") + suffix = sep + suffix + else: + suffix = None + return cls( + backend=backend, + size=size, + mtime=mtime, + chunk_size=chunk_size, + chunk_number=chunk_number, + name=name, + suffix=suffix, + ) + else: + raise ValueError(f"invalid git-annex key: {s!r}") + + @classmethod + def parse_filename(cls, s: str) -> AnnexKey: + fields, sep, name = s.partition("--") + # See `keyFile` and `fileKey` in `Annex/Locations.hs` in the git-annex + # source + name = ( + name.replace("%", "/") + .replace("&c", ":") + .replace("&s", "%") + .replace("&a", "&") + ) + return cls.parse(fields + sep + name) + + +@dataclass +class AnnexDir: + topdir: str + + # might be called twice in rapid succession for an annex key path @lru_cache(maxsize=CACHE_SIZE) -def is_annex_dir_or_key(path: Union[str, Path]) -> Optional[Tuple[str, str]]: +def is_annex_dir_or_key(path: str | Path) -> AnnexDir | AnnexKey | None: parts = list(Path(path).parts) start = 0 while True: @@ -22,16 +106,15 @@ def is_annex_dir_or_key(path: Union[str, Path]) -> Optional[Tuple[str, str]]: topdir = str(Path(*parts[:i])) depth = len(parts) - i if depth <= 5: # have only two level of hash'ing directories - return (topdir, "dir") - # matches an annex key regex in the form of - # BACKEND[-sSIZE][-mMTIME][-Ssize-Cchunk]--HASH[EXTENSION] - if re.fullmatch( - r"[A-Z0-9_]{2,14}(?:-s[0-9]+)?(?:-m[0-9]+)?(?:-S[0-9]+-C[0-9]+)?--.*", - parts[i + 5], - ): + return AnnexDir(topdir) + try: + key = AnnexKey.parse_filename(parts[i + 5]) + except ValueError: + pass + else: # note: key and its directory must match in 
name if depth == 7 and parts[-1] == parts[-2]: - return (topdir, "key") + return key elif depth == 6: - return (topdir, "dir") + return AnnexDir(topdir) start = i + 1