diff --git a/datalad_fuse/fsspec.py b/datalad_fuse/fsspec.py
index 4a7f7ce..27581aa 100644
--- a/datalad_fuse/fsspec.py
+++ b/datalad_fuse/fsspec.py
@@ -15,7 +15,7 @@
import methodtools
from .consts import CACHE_SIZE
-from .utils import is_annex_dir_or_key
+from .utils import AnnexKey, is_annex_dir_or_key
lgr = logging.getLogger("datalad.fuse.fsspec")
@@ -51,19 +51,17 @@ def close(self) -> None:
self.annex._batched.clear()
@methodtools.lru_cache(maxsize=CACHE_SIZE)
- def get_file_state(self, relpath: str) -> Tuple[FileState, Optional[str]]:
+ def get_file_state(self, relpath: str) -> Tuple[FileState, Optional[AnnexKey]]:
p = self.path / relpath
lgr.debug("get_file_state: %s", relpath)
def handle_path_under_annex_objects(p: Path):
iadok = is_annex_dir_or_key(p)
- if iadok is not None and iadok[1] == "key":
- assert iadok[0] == str(self.path)
- key = filename2key(p.name)
+ if isinstance(iadok, AnnexKey):
if p.exists():
- return (FileState.HAS_CONTENT, key)
+ return (FileState.HAS_CONTENT, iadok)
else:
- return (FileState.NO_CONTENT, key)
+ return (FileState.NO_CONTENT, iadok)
else:
return (FileState.NOT_ANNEXED, None)
@@ -75,7 +73,7 @@ def handle_path_under_annex_objects(p: Path):
if not p.is_symlink():
if p.stat().st_size < 1024 and self.annex is not None:
if self.annex.is_under_annex(relpath, batch=True):
- key = self.annex.get_file_key(relpath, batch=True)
+ key = AnnexKey.parse(self.annex.get_file_key(relpath, batch=True))
if self.annex.file_has_content(relpath, batch=True):
return (FileState.HAS_CONTENT, key)
else:
@@ -163,7 +161,7 @@ def open(
)
if fstate is FileState.NO_CONTENT:
lgr.debug("%s: opening via fsspec", relpath)
- for url in self.get_urls(key):
+ for url in self.get_urls(str(key)):
try:
lgr.debug("%s: Attempting to open via URL %s", relpath, url)
return self.fs.open(url, mode, **kwargs)
@@ -242,6 +240,12 @@ def open(
)
return dsap.open(relpath, mode=mode, encoding=encoding, errors=errors)
+ def get_file_state(
+ self, filepath: Union[str, Path]
+ ) -> Tuple[FileState, Optional[AnnexKey]]:
+ dsap, relpath = self.resolve_dataset(filepath)
+ return dsap.get_file_state(relpath)
+
def is_under_annex(self, filepath: Union[str, Path]) -> bool:
dsap, relpath = self.resolve_dataset(filepath)
fstate, _ = dsap.get_file_state(relpath)
@@ -254,11 +258,3 @@ def get_commit_datetime(self, filepath: Union[str, Path]) -> datetime:
def is_http_url(s: str) -> bool:
return s.lower().startswith(("http://", "https://"))
-
-
-def filename2key(name: str) -> str:
- # See `keyFile` and `fileKey` in `Annex/Locations.hs` in the git-annex
- # source
- return (
- name.replace("%", "/").replace("&c", ":").replace("&s", "%").replace("&a", "&")
- )
diff --git a/datalad_fuse/fuse_.py b/datalad_fuse/fuse_.py
index 6ce4076..0a37bc2 100644
--- a/datalad_fuse/fuse_.py
+++ b/datalad_fuse/fuse_.py
@@ -22,7 +22,7 @@
# Make it relatively small since we are aiming for metadata records ATM
# Seems of no real good positive net ATM
# BLOCK_SIZE = 2**20 # 1M. block size to fetch at a time.
-from .utils import is_annex_dir_or_key
+from .utils import AnnexDir, AnnexKey, is_annex_dir_or_key
lgr = logging.getLogger("datalad.fuse")
@@ -126,34 +126,48 @@ def getattr(self, path, fh=None):
else:
iadok = is_annex_dir_or_key(path)
if iadok is not None:
- topdir, dir_or_key = iadok
- if dir_or_key == "key":
- # needs to be open but it is a key. We will let fsspec
- # to handle it
- pass
- elif dir_or_key == "dir":
+ if isinstance(iadok, AnnexKey):
+ if iadok.size is not None:
+ lgr.debug("Got size from key")
+ r = mkstat(
+ is_file=True,
+ size=iadok.size,
+ timestamp=self._adapter.get_commit_datetime(path),
+ )
+ else:
+ # needs to be open but it is a key. We will let
+ # fsspec handle it
+ pass
+ elif isinstance(iadok, AnnexDir):
# just return that one of the top directory
# TODO: cache this since would be a frequent operation
- r = self._filter_stat(os.stat(topdir))
+ r = self._filter_stat(os.stat(iadok.topdir))
else:
- raise AssertionError(f"Unexpected dir_or_key: {dir_or_key!r}")
+ raise AssertionError(f"Unexpected iadok: {iadok!r}")
elif self.is_under_git(path):
lgr.debug("Path under .git does not exist; raising ENOENT")
raise FuseOSError(ENOENT)
if r is None:
+ fsspec_file = None
if fh and fh >= self._counter_offset:
lgr.debug("File already open")
fsspec_file = self._fhdict[fh]
to_close = False
else:
- # TODO: it is expensive to open each file just for `getattr`!
- # We should just fabricate stats from the key here or not even
- # bother???!
- lgr.debug("File not already open")
- with self.rwlock:
- fsspec_file = self._adapter.open(path)
- to_close = True
- if fsspec_file:
+ _, key = self._adapter.get_file_state(path)
+            if key is not None and key.size is not None:
+ lgr.debug("Got size from key")
+ r = mkstat(
+ is_file=True,
+ size=key.size,
+ timestamp=self._adapter.get_commit_datetime(path),
+ )
+ else:
+ lgr.debug("File not already open")
+ with self.rwlock:
+ fsspec_file = self._adapter.open(path)
+ to_close = True
+ if fsspec_file is not None:
if isinstance(fsspec_file, io.BufferedIOBase):
# full file was already fetched locally
lgr.debug("File object is io.BufferedIOBase")
@@ -166,12 +180,6 @@ def getattr(self, path, fh=None):
if to_close:
with self.rwlock:
fsspec_file.close()
- else:
- # TODO: although seems to be logical -- seems to cause logging etc
- # lgr.error("ENOENTing %s %s", path, fh)
- # raise FuseOSError(ENOENT)
- lgr.debug("File failed to open???")
- r = {} # we have nothing to say. TODO: proper return/error?
lgr.debug("Returning %r for %s", r, path)
return r
@@ -377,16 +385,20 @@ def file_getattr(f, timestamp: datetime):
info = f.info()
except FileNotFoundError:
raise FuseOSError(ENOENT)
+ return mkstat(info["type"] == "file", info["size"], timestamp)
+
+
+def mkstat(is_file: bool, size: int, timestamp: datetime) -> dict:
# TODO Also I get UID.GID funny -- yarik, not yoh
# get of the original symlink, so float it up!
data = {"st_uid": os.getuid(), "st_gid": os.getgid()}
- if info["type"] != "file":
+ if not is_file:
data["st_mode"] = stat.S_IFDIR | 0o755
data["st_size"] = 0
data["st_blksize"] = 0
else:
data["st_mode"] = stat.S_IFREG | 0o644
- data["st_size"] = info["size"]
+ data["st_size"] = size
data["st_blksize"] = 5 * 2**20
data["st_nlink"] = 1
data["st_atime"] = timestamp.timestamp()
diff --git a/datalad_fuse/tests/test_fuse.py b/datalad_fuse/tests/test_fuse.py
index 3580e2e..d491d75 100644
--- a/datalad_fuse/tests/test_fuse.py
+++ b/datalad_fuse/tests/test_fuse.py
@@ -57,6 +57,7 @@ def test_fuse(tmp_path, transparent, url_dataset):
with fusing(ds.path, tmp_path, transparent=transparent) as mount:
assert sorted(q.name for q in mount.iterdir()) == dots + sorted(data_files)
for fname, blob in data_files.items():
+ assert os.path.getsize(mount / fname) == len(blob)
assert (mount / fname).read_bytes() == blob
diff --git a/datalad_fuse/tests/test_util.py b/datalad_fuse/tests/test_util.py
index 7f3e563..4117e8c 100644
--- a/datalad_fuse/tests/test_util.py
+++ b/datalad_fuse/tests/test_util.py
@@ -1,32 +1,40 @@
-from typing import Optional, Tuple
+from __future__ import annotations
import pytest
-from datalad_fuse.fsspec import filename2key
-from datalad_fuse.utils import is_annex_dir_or_key
+from datalad_fuse.utils import AnnexDir, AnnexKey, is_annex_dir_or_key
SAMPLE_KEY = "MD5E-s1064--8804d3d11f17e33bd912f1f0947afdb9.json"
URL_KEY = "URL--http&c%%127.0.0.1&c55485%binary.png"
+SAMPLE_ANNEX_KEY = AnnexKey(
+ backend="MD5E",
+ size=1064,
+ name="8804d3d11f17e33bd912f1f0947afdb9",
+ suffix=".json",
+)
+
+URL_ANNEX_KEY = AnnexKey(backend="URL", name="http://127.0.0.1:55485/binary.png")
+
@pytest.mark.parametrize(
"path,expected",
[
- (f".git/annex/objects/p0/4v/{SAMPLE_KEY}/{SAMPLE_KEY}", (".", "key")),
- (f".git/annex/objects/p2/pX/{URL_KEY}/{URL_KEY}", (".", "key")),
- (f".git/annex/objects/p0/4v/{SAMPLE_KEY}/", (".", "dir")),
- (f".git/annex/objects/p0/4v/{SAMPLE_KEY}", (".", "dir")),
- (".git/annex/objects/p0/4v", (".", "dir")),
+ (f".git/annex/objects/p0/4v/{SAMPLE_KEY}/{SAMPLE_KEY}", SAMPLE_ANNEX_KEY),
+ (f".git/annex/objects/p2/pX/{URL_KEY}/{URL_KEY}", URL_ANNEX_KEY),
+ (f".git/annex/objects/p0/4v/{SAMPLE_KEY}/", AnnexDir(".")),
+ (f".git/annex/objects/p0/4v/{SAMPLE_KEY}", AnnexDir(".")),
+ (".git/annex/objects/p0/4v", AnnexDir(".")),
(
f"some/project/.git/annex/objects/p0/4v/{SAMPLE_KEY}/{SAMPLE_KEY}",
- ("some/project", "key"),
+ SAMPLE_ANNEX_KEY,
),
- ("some/project/.git/annex/objects/p0/4v", ("some/project", "dir")),
+ ("some/project/.git/annex/objects/p0/4v", AnnexDir("some/project")),
(
f"/usr/src/project/.git/annex/objects/p0/4v/{SAMPLE_KEY}/{SAMPLE_KEY}",
- ("/usr/src/project", "key"),
+ SAMPLE_ANNEX_KEY,
),
- ("/usr/src/project/.git/annex/objects/p0/4v", ("/usr/src/project", "dir")),
+ ("/usr/src/project/.git/annex/objects/p0/4v", AnnexDir("/usr/src/project")),
("foo.txt", None),
("foo.git/annex/objects/p0/4v", None),
("some/project/.git/refs/heads", None),
@@ -44,28 +52,13 @@
(
"some/project/.git/embedded/sub/.git/annex/objects/p0/4v/"
f"{SAMPLE_KEY}/{SAMPLE_KEY}",
- ("some/project/.git/embedded/sub", "key"),
+ SAMPLE_ANNEX_KEY,
),
(
"some/project/.git/embedded/sub/.git/annex/objects/p0/4v",
- ("some/project/.git/embedded/sub", "dir"),
+ AnnexDir("some/project/.git/embedded/sub"),
),
],
)
-def test_is_annex_dir_or_key(path: str, expected: Optional[Tuple[str, str]]) -> None:
+def test_is_annex_dir_or_key(path: str, expected: AnnexDir | AnnexKey | None) -> None:
assert is_annex_dir_or_key(path) == expected
-
-
-@pytest.mark.parametrize(
- "filename,key",
- [
- (
- "URL--http&c%%127.0.0.1&c35401%text.txt",
- "URL--http://127.0.0.1:35401/text.txt",
- ),
- ("foo&ac", "foo&c"),
- ("foo&a&s", "foo&%"),
- ],
-)
-def test_filename2key(filename: str, key: str) -> None:
- assert filename2key(filename) == key
diff --git a/datalad_fuse/utils.py b/datalad_fuse/utils.py
index feeec64..b7748d2 100644
--- a/datalad_fuse/utils.py
+++ b/datalad_fuse/utils.py
@@ -1,14 +1,98 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
import re
-from typing import Optional, Tuple, Union
+from typing import Optional
from datalad_fuse.consts import CACHE_SIZE
+@dataclass
+class AnnexKey:
+    # Parsed git-annex key: BACKEND[-sSIZE][-mMTIME][-SCHUNKSIZE-CCHUNKNUM]--NAME[SUFFIX]
+ backend: str
+ name: str
+ size: Optional[int] = None
+ mtime: Optional[int] = None
+ chunk_size: Optional[int] = None
+ chunk_number: Optional[int] = None
+ suffix: Optional[str] = None
+
+ def __str__(self) -> str:
+ s = self.backend
+ if self.size is not None:
+ s += f"-s{self.size}"
+ if self.mtime is not None:
+ s += f"-m{self.mtime}"
+ if self.chunk_size is not None:
+ s += f"-S{self.chunk_size}"
+ if self.chunk_number is not None:
+ s += f"-C{self.chunk_number}"
+ s += f"--{self.name}"
+ if self.suffix is not None:
+ s += self.suffix
+ return s
+
+ @classmethod
+ def parse(cls, s: str) -> AnnexKey:
+        m = re.fullmatch(
+            r"(?P<backend>[A-Z0-9_]{2,14})"
+            r"(?:-s(?P<size>[0-9]+))?"
+            r"(?:-m(?P<mtime>[0-9]+))?"
+            r"(?:-S(?P<chunk_size>[0-9]+)-C(?P<chunk_number>[0-9]+))?"
+            r"--(?P<name>.+)",
+            s,
+        )
+ if m:
+ backend = m["backend"]
+ size = int(m["size"]) if m["size"] is not None else None
+ mtime = int(m["mtime"]) if m["mtime"] is not None else None
+ chunk_size = int(m["chunk_size"]) if m["chunk_size"] is not None else None
+ chunk_number = (
+ int(m["chunk_number"]) if m["chunk_number"] is not None else None
+ )
+ name = m["name"]
+ if backend.endswith("E"):
+ name, sep, suffix = name.rpartition(".")
+ suffix = sep + suffix
+ else:
+ suffix = None
+ return cls(
+ backend=backend,
+ size=size,
+ mtime=mtime,
+ chunk_size=chunk_size,
+ chunk_number=chunk_number,
+ name=name,
+ suffix=suffix,
+ )
+ else:
+ raise ValueError(f"invalid git-annex key: {s!r}")
+
+ @classmethod
+ def parse_filename(cls, s: str) -> AnnexKey:
+ fields, sep, name = s.partition("--")
+ # See `keyFile` and `fileKey` in `Annex/Locations.hs` in the git-annex
+ # source
+ name = (
+ name.replace("%", "/")
+ .replace("&c", ":")
+ .replace("&s", "%")
+ .replace("&a", "&")
+ )
+ return cls.parse(fields + sep + name)
+
+
+@dataclass
+class AnnexDir:
+ topdir: str
+
+
# might be called twice in rapid succession for an annex key path
@lru_cache(maxsize=CACHE_SIZE)
-def is_annex_dir_or_key(path: Union[str, Path]) -> Optional[Tuple[str, str]]:
+def is_annex_dir_or_key(path: str | Path) -> AnnexDir | AnnexKey | None:
parts = list(Path(path).parts)
start = 0
while True:
@@ -22,16 +106,15 @@ def is_annex_dir_or_key(path: Union[str, Path]) -> Optional[Tuple[str, str]]:
topdir = str(Path(*parts[:i]))
depth = len(parts) - i
if depth <= 5: # have only two level of hash'ing directories
- return (topdir, "dir")
- # matches an annex key regex in the form of
- # BACKEND[-sSIZE][-mMTIME][-Ssize-Cchunk]--HASH[EXTENSION]
- if re.fullmatch(
- r"[A-Z0-9_]{2,14}(?:-s[0-9]+)?(?:-m[0-9]+)?(?:-S[0-9]+-C[0-9]+)?--.*",
- parts[i + 5],
- ):
+ return AnnexDir(topdir)
+ try:
+ key = AnnexKey.parse_filename(parts[i + 5])
+ except ValueError:
+ pass
+ else:
# note: key and its directory must match in name
if depth == 7 and parts[-1] == parts[-2]:
- return (topdir, "key")
+ return key
elif depth == 6:
- return (topdir, "dir")
+ return AnnexDir(topdir)
start = i + 1