Reference (Gold): filesystem_spec
Pytest Summary for tests
| status    | count |
| --------- | ----- |
| passed    | 698   |
| xfailed   | 2     |
| skipped   | 92    |
| total     | 792   |
| collected | 792   |
Failing tests (both are expected failures, marked xfail):
test_api.py::test_multilevel_chained_fs
@pytest.mark.xfail(reason="see issue #334", strict=True)
def test_multilevel_chained_fs():
    """This test reproduces fsspec/filesystem_spec#334"""
    import zipfile

    d1 = tempfile.mkdtemp()
    f1 = os.path.join(d1, "f1.zip")
    with zipfile.ZipFile(f1, mode="w") as z:
        # filename, content
        z.writestr("foo.txt", "foo.txt")
        z.writestr("bar.txt", "bar.txt")

    # We expected this to be the correct syntax
>   with pytest.raises(IsADirectoryError):
E   Failed: DID NOT RAISE

fsspec/tests/test_api.py:252: Failed
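This xfail tracks multi-level filesystem chaining (fsspec issue #334). For context, fsspec layers filesystems by joining URL segments with `::`; below is a minimal single-level sketch, assuming a local archive at /tmp/f1.zip containing foo.txt as in the test above.

import fsspec

# Read one member of a local zip archive through a chained URL:
# "zip://<member>::<outer-url>" layers ZipFileSystem over LocalFileSystem.
with fsspec.open("zip://foo.txt::file:///tmp/f1.zip", "rb") as f:
    print(f.read())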
test_spec.py::test_find
@pytest.mark.xfail
def test_find():
    """Test .find() method on debian server (ftp, https) with constant folder"""
    filesystem, host, test_path = (
        FTPFileSystem,
        "ftp.fau.de",
        "ftp://ftp.fau.de/debian-cd/current/amd64/log/success",
    )
    test_fs = filesystem(host)
    filenames_ftp = test_fs.find(test_path)
>   assert filenames_ftp
E   assert []

fsspec/tests/test_spec.py:699: AssertionError
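The same lookup can be written against the protocol registry; a minimal sketch, assuming the Debian FTP mirror is reachable (the xfail exists precisely because this remote listing is not guaranteed to return results):

import fsspec

# Equivalent to FTPFileSystem("ftp.fau.de").find(...), via the registry.
fs = fsspec.filesystem("ftp", host="ftp.fau.de")
files = fs.find("ftp://ftp.fau.de/debian-cd/current/amd64/log/success")
print(files)  # may be empty if the mirror layout has changed, as in the xfail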
Patch diff
diff --git a/fsspec/archive.py b/fsspec/archive.py
index 1a4570f..f466780 100644
--- a/fsspec/archive.py
+++ b/fsspec/archive.py
@@ -13,9 +13,13 @@ class AbstractArchiveFileSystem(AbstractFileSystem):
"""
def __str__(self):
- return f'<Archive-like object {type(self).__name__} at {id(self)}>'
+ return f"<Archive-like object {type(self).__name__} at {id(self)}>"
+
__repr__ = __str__
+ def ukey(self, path):
+ return tokenize(path, self.fo, self.protocol)
+
def _all_dirnames(self, paths):
"""Returns *all* directory names for each path in paths, including intermediate
ones.
@@ -24,4 +28,46 @@ class AbstractArchiveFileSystem(AbstractFileSystem):
----------
paths: Iterable of path strings
"""
- pass
+ if len(paths) == 0:
+ return set()
+
+ dirnames = {self._parent(path) for path in paths} - {self.root_marker}
+ return dirnames | self._all_dirnames(dirnames)
+
+ def info(self, path, **kwargs):
+ self._get_dirs()
+ path = self._strip_protocol(path)
+ if path in {"", "/"} and self.dir_cache:
+ return {"name": "", "type": "directory", "size": 0}
+ if path in self.dir_cache:
+ return self.dir_cache[path]
+ elif path + "/" in self.dir_cache:
+ return self.dir_cache[path + "/"]
+ else:
+ raise FileNotFoundError(path)
+
+ def ls(self, path, detail=True, **kwargs):
+ self._get_dirs()
+ paths = {}
+ for p, f in self.dir_cache.items():
+ p = p.rstrip("/")
+ if "/" in p:
+ root = p.rsplit("/", 1)[0]
+ else:
+ root = ""
+ if root == path.rstrip("/"):
+ paths[p] = f
+ elif all(
+ (a == b)
+ for a, b in zip(path.split("/"), [""] + p.strip("/").split("/"))
+ ):
+ # root directory entry
+ ppath = p.rstrip("/").split("/", 1)[0]
+ if ppath not in paths:
+ out = {"name": ppath, "size": 0, "type": "directory"}
+ paths[ppath] = out
+ if detail:
+ out = sorted(paths.values(), key=lambda _: _["name"])
+ return out
+ else:
+ return sorted(paths)
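As a quick illustration of the `_all_dirnames` recursion added above (an aside, not part of the patch): for one nested member path it should report every intermediate directory. A standalone sketch of the same logic, using "" as the root marker:

def all_dirnames(paths):
    # mirrors AbstractArchiveFileSystem._all_dirnames, with "" as root marker
    def parent(p):
        return p.rsplit("/", 1)[0] if "/" in p else ""

    if not paths:
        return set()
    dirnames = {parent(p) for p in paths} - {""}
    return dirnames | all_dirnames(dirnames)

print(all_dirnames(["a/b/c.txt"]))  # {'a/b', 'a'} (set order may vary)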
diff --git a/fsspec/asyn.py b/fsspec/asyn.py
index 551290c..a040efc 100644
--- a/fsspec/asyn.py
+++ b/fsspec/asyn.py
@@ -10,15 +10,17 @@ import threading
from contextlib import contextmanager
from glob import has_magic
from typing import TYPE_CHECKING, Iterable
+
from .callbacks import DEFAULT_CALLBACK
from .exceptions import FSTimeoutError
from .implementations.local import LocalFileSystem, make_path_posix, trailing_sep
from .spec import AbstractBufferedFile, AbstractFileSystem
from .utils import glob_translate, is_exception, other_paths
-private = re.compile('_[^_]')
-iothread = [None]
-loop = [None]
-_lock = None
+
+private = re.compile("_[^_]")
+iothread = [None] # dedicated fsspec IO thread
+loop = [None] # global event loop for any non-async instance
+_lock = None # global lock placeholder
get_running_loop = asyncio.get_running_loop
@@ -27,7 +29,10 @@ def get_lock():
The lock is allocated on first use to allow setting one lock per forked process.
"""
- pass
+ global _lock
+ if not _lock:
+ _lock = threading.Lock()
+ return _lock
def reset_lock():
@@ -36,7 +41,23 @@ def reset_lock():
This should be called only on the init of a forked process to reset the lock to
None, enabling the new forked process to get a new lock.
"""
- pass
+ global _lock
+
+ iothread[0] = None
+ loop[0] = None
+ _lock = None
+
+
+async def _runner(event, coro, result, timeout=None):
+ timeout = timeout if timeout else None # convert 0 or 0.0 to None
+ if timeout is not None:
+ coro = asyncio.wait_for(coro, timeout=timeout)
+ try:
+ result[0] = await coro
+ except Exception as ex:
+ result[0] = ex
+ finally:
+ event.set()
def sync(loop, func, *args, timeout=None, **kwargs):
@@ -48,7 +69,40 @@ def sync(loop, func, *args, timeout=None, **kwargs):
>>> fsspec.asyn.sync(fsspec.asyn.get_loop(), func, *args,
timeout=timeout, **kwargs)
"""
- pass
+ timeout = timeout if timeout else None # convert 0 or 0.0 to None
+ # NB: if the loop is not running *yet*, it is OK to submit work
+ # and we will wait for it
+ if loop is None or loop.is_closed():
+ raise RuntimeError("Loop is not running")
+ try:
+ loop0 = asyncio.events.get_running_loop()
+ if loop0 is loop:
+ raise NotImplementedError("Calling sync() from within a running loop")
+ except NotImplementedError:
+ raise
+ except RuntimeError:
+ pass
+ coro = func(*args, **kwargs)
+ result = [None]
+ event = threading.Event()
+ asyncio.run_coroutine_threadsafe(_runner(event, coro, result, timeout), loop)
+ while True:
+ # this loops allows thread to get interrupted
+ if event.wait(1):
+ break
+ if timeout is not None:
+ timeout -= 1
+ if timeout < 0:
+ raise FSTimeoutError
+
+ return_result = result[0]
+ if isinstance(return_result, asyncio.TimeoutError):
+ # suppress asyncio.TimeoutError, raise FSTimeoutError
+ raise FSTimeoutError from return_result
+ elif isinstance(return_result, BaseException):
+ raise return_result
+ else:
+ return return_result
def sync_wrapper(func, obj=None):
@@ -57,7 +111,25 @@ def sync_wrapper(func, obj=None):
Leave obj=None if defining within a class. Pass the instance if attaching
as an attribute of the instance.
"""
- pass
+
+ @functools.wraps(func)
+ def wrapper(*args, **kwargs):
+ self = obj or args[0]
+ return sync(self.loop, func, *args, **kwargs)
+
+ return wrapper
+
+
+@contextmanager
+def _selector_policy():
+ original_policy = asyncio.get_event_loop_policy()
+ try:
+ if os.name == "nt" and hasattr(asyncio, "WindowsSelectorEventLoopPolicy"):
+ asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+
+ yield
+ finally:
+ asyncio.set_event_loop_policy(original_policy)
def get_loop():
@@ -65,11 +137,23 @@ def get_loop():
The loop will be running on a separate thread.
"""
- pass
+ if loop[0] is None:
+ with get_lock():
+ # repeat the check just in case the loop got filled between the
+ # previous two calls from another thread
+ if loop[0] is None:
+ with _selector_policy():
+ loop[0] = asyncio.new_event_loop()
+ th = threading.Thread(target=loop[0].run_forever, name="fsspecIO")
+ th.daemon = True
+ th.start()
+ iothread[0] = th
+ return loop[0]
if TYPE_CHECKING:
import resource
+
ResourceError = resource.error
else:
try:
@@ -78,18 +162,54 @@ else:
resource = None
ResourceError = OSError
else:
- ResourceError = getattr(resource, 'error', OSError)
+ ResourceError = getattr(resource, "error", OSError)
+
_DEFAULT_BATCH_SIZE = 128
_NOFILES_DEFAULT_BATCH_SIZE = 1280
-def running_async() ->bool:
+def _get_batch_size(nofiles=False):
+ from fsspec.config import conf
+
+ if nofiles:
+ if "nofiles_gather_batch_size" in conf:
+ return conf["nofiles_gather_batch_size"]
+ else:
+ if "gather_batch_size" in conf:
+ return conf["gather_batch_size"]
+ if nofiles:
+ return _NOFILES_DEFAULT_BATCH_SIZE
+ if resource is None:
+ return _DEFAULT_BATCH_SIZE
+
+ try:
+ soft_limit, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
+ except (ImportError, ValueError, ResourceError):
+ return _DEFAULT_BATCH_SIZE
+
+ if soft_limit == resource.RLIM_INFINITY:
+ return -1
+ else:
+ return soft_limit // 8
+
+
+def running_async() -> bool:
"""Being executed by an event loop?"""
- pass
+ try:
+ asyncio.get_running_loop()
+ return True
+ except RuntimeError:
+ return False
-async def _run_coros_in_chunks(coros, batch_size=None, callback=
- DEFAULT_CALLBACK, timeout=None, return_exceptions=False, nofiles=False):
+async def _run_coros_in_chunks(
+ coros,
+ batch_size=None,
+ callback=DEFAULT_CALLBACK,
+ timeout=None,
+ return_exceptions=False,
+ nofiles=False,
+):
"""Run the given coroutines in chunks.
Parameters
@@ -111,13 +231,68 @@ async def _run_coros_in_chunks(coros, batch_size=None, callback=
If inferring the batch_size, does this operation involve local files?
If yes, you normally expect smaller batches.
"""
- pass
+ if batch_size is None:
+ batch_size = _get_batch_size(nofiles=nofiles)
-async_methods = ['_ls', '_cat_file', '_get_file', '_put_file', '_rm_file',
- '_cp_file', '_pipe_file', '_expand_path', '_info', '_isfile', '_isdir',
- '_exists', '_walk', '_glob', '_find', '_du', '_size', '_mkdir', '_makedirs'
- ]
+ if batch_size == -1:
+ batch_size = len(coros)
+
+ assert batch_size > 0
+
+ async def _run_coro(coro, i):
+ try:
+ return await asyncio.wait_for(coro, timeout=timeout), i
+ except Exception as e:
+ if not return_exceptions:
+ raise
+ return e, i
+ finally:
+ callback.relative_update(1)
+
+ i = 0
+ n = len(coros)
+ results = [None] * n
+ pending = set()
+
+ while pending or i < n:
+ while len(pending) < batch_size and i < n:
+ pending.add(asyncio.ensure_future(_run_coro(coros[i], i)))
+ i += 1
+
+ if not pending:
+ break
+
+ done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
+ while done:
+ result, k = await done.pop()
+ results[k] = result
+
+ return results
+
+
+# these methods should be implemented as async by any async-able backend
+async_methods = [
+ "_ls",
+ "_cat_file",
+ "_get_file",
+ "_put_file",
+ "_rm_file",
+ "_cp_file",
+ "_pipe_file",
+ "_expand_path",
+ "_info",
+ "_isfile",
+ "_isdir",
+ "_exists",
+ "_walk",
+ "_glob",
+ "_find",
+ "_du",
+ "_size",
+ "_mkdir",
+ "_makedirs",
+]
class AsyncFileSystem(AbstractFileSystem):
@@ -129,12 +304,15 @@ class AsyncFileSystem(AbstractFileSystem):
should inherit from this class instead of AbstractFileSystem. Docstrings are
copied from the un-underscored method in AbstractFileSystem, if not given.
"""
+
+ # note that methods do not have docstring here; they will be copied
+ # for _* methods and inferred for overridden methods.
+
async_impl = True
mirror_sync_methods = True
disable_throttling = False
- def __init__(self, *args, asynchronous=False, loop=None, batch_size=
- None, **kwargs):
+ def __init__(self, *args, asynchronous=False, loop=None, batch_size=None, **kwargs):
self.asynchronous = asynchronous
self._pid = os.getpid()
if not asynchronous:
@@ -144,12 +322,166 @@ class AsyncFileSystem(AbstractFileSystem):
self.batch_size = batch_size
super().__init__(*args, **kwargs)
+ @property
+ def loop(self):
+ if self._pid != os.getpid():
+ raise RuntimeError("This class is not fork-safe")
+ return self._loop
+
+ async def _rm_file(self, path, **kwargs):
+ raise NotImplementedError
+
+ async def _rm(self, path, recursive=False, batch_size=None, **kwargs):
+ # TODO: implement on_error
+ batch_size = batch_size or self.batch_size
+ path = await self._expand_path(path, recursive=recursive)
+ return await _run_coros_in_chunks(
+ [self._rm_file(p, **kwargs) for p in reversed(path)],
+ batch_size=batch_size,
+ nofiles=True,
+ )
+
+ async def _cp_file(self, path1, path2, **kwargs):
+ raise NotImplementedError
+
+ async def _copy(
+ self,
+ path1,
+ path2,
+ recursive=False,
+ on_error=None,
+ maxdepth=None,
+ batch_size=None,
+ **kwargs,
+ ):
+ if on_error is None and recursive:
+ on_error = "ignore"
+ elif on_error is None:
+ on_error = "raise"
+
+ if isinstance(path1, list) and isinstance(path2, list):
+ # No need to expand paths when both source and destination
+ # are provided as lists
+ paths1 = path1
+ paths2 = path2
+ else:
+ source_is_str = isinstance(path1, str)
+ paths1 = await self._expand_path(
+ path1, maxdepth=maxdepth, recursive=recursive
+ )
+ if source_is_str and (not recursive or maxdepth is not None):
+ # Non-recursive glob does not copy directories
+ paths1 = [
+ p for p in paths1 if not (trailing_sep(p) or await self._isdir(p))
+ ]
+ if not paths1:
+ return
+
+ source_is_file = len(paths1) == 1
+ dest_is_dir = isinstance(path2, str) and (
+ trailing_sep(path2) or await self._isdir(path2)
+ )
+
+ exists = source_is_str and (
+ (has_magic(path1) and source_is_file)
+ or (not has_magic(path1) and dest_is_dir and not trailing_sep(path1))
+ )
+ paths2 = other_paths(
+ paths1,
+ path2,
+ exists=exists,
+ flatten=not source_is_str,
+ )
+
+ batch_size = batch_size or self.batch_size
+ coros = [self._cp_file(p1, p2, **kwargs) for p1, p2 in zip(paths1, paths2)]
+ result = await _run_coros_in_chunks(
+ coros, batch_size=batch_size, return_exceptions=True, nofiles=True
+ )
+
+ for ex in filter(is_exception, result):
+ if on_error == "ignore" and isinstance(ex, FileNotFoundError):
+ continue
+ raise ex
+
+ async def _pipe_file(self, path, value, **kwargs):
+ raise NotImplementedError
+
+ async def _pipe(self, path, value=None, batch_size=None, **kwargs):
+ if isinstance(path, str):
+ path = {path: value}
+ batch_size = batch_size or self.batch_size
+ return await _run_coros_in_chunks(
+ [self._pipe_file(k, v, **kwargs) for k, v in path.items()],
+ batch_size=batch_size,
+ nofiles=True,
+ )
+
async def _process_limits(self, url, start, end):
"""Helper for "Range"-based _cat_file"""
- pass
+ size = None
+ suff = False
+ if start is not None and start < 0:
+ # if start is negative and end None, end is the "suffix length"
+ if end is None:
+ end = -start
+ start = ""
+ suff = True
+ else:
+ size = size or (await self._info(url))["size"]
+ start = size + start
+ elif start is None:
+ start = 0
+ if not suff:
+ if end is not None and end < 0:
+ if start is not None:
+ size = size or (await self._info(url))["size"]
+ end = size + end
+ elif end is None:
+ end = ""
+ if isinstance(end, numbers.Integral):
+ end -= 1 # bytes range is inclusive
+ return f"bytes={start}-{end}"
+
+ async def _cat_file(self, path, start=None, end=None, **kwargs):
+ raise NotImplementedError
+
+ async def _cat(
+ self, path, recursive=False, on_error="raise", batch_size=None, **kwargs
+ ):
+ paths = await self._expand_path(path, recursive=recursive)
+ coros = [self._cat_file(path, **kwargs) for path in paths]
+ batch_size = batch_size or self.batch_size
+ out = await _run_coros_in_chunks(
+ coros, batch_size=batch_size, nofiles=True, return_exceptions=True
+ )
+ if on_error == "raise":
+ ex = next(filter(is_exception, out), False)
+ if ex:
+ raise ex
+ if (
+ len(paths) > 1
+ or isinstance(path, list)
+ or paths[0] != self._strip_protocol(path)
+ ):
+ return {
+ k: v
+ for k, v in zip(paths, out)
+ if on_error != "omit" or not is_exception(v)
+ }
+ else:
+ return out[0]
- async def _cat_ranges(self, paths, starts, ends, max_gap=None,
- batch_size=None, on_error='return', **kwargs):
+ async def _cat_ranges(
+ self,
+ paths,
+ starts,
+ ends,
+ max_gap=None,
+ batch_size=None,
+ on_error="return",
+ **kwargs,
+ ):
"""Get the contents of byte ranges from one or more files
Parameters
@@ -160,10 +492,40 @@ class AsyncFileSystem(AbstractFileSystem):
Bytes limits of the read. If using a single int, the same value will be
used to read all the specified files.
"""
- pass
+ # TODO: on_error
+ if max_gap is not None:
+ # use utils.merge_offset_ranges
+ raise NotImplementedError
+ if not isinstance(paths, list):
+ raise TypeError
+ if not isinstance(starts, Iterable):
+ starts = [starts] * len(paths)
+ if not isinstance(ends, Iterable):
+ ends = [ends] * len(paths)
+ if len(starts) != len(paths) or len(ends) != len(paths):
+ raise ValueError
+ coros = [
+ self._cat_file(p, start=s, end=e, **kwargs)
+ for p, s, e in zip(paths, starts, ends)
+ ]
+ batch_size = batch_size or self.batch_size
+ return await _run_coros_in_chunks(
+ coros, batch_size=batch_size, nofiles=True, return_exceptions=True
+ )
+
+ async def _put_file(self, lpath, rpath, **kwargs):
+ raise NotImplementedError
- async def _put(self, lpath, rpath, recursive=False, callback=
- DEFAULT_CALLBACK, batch_size=None, maxdepth=None, **kwargs):
+ async def _put(
+ self,
+ lpath,
+ rpath,
+ recursive=False,
+ callback=DEFAULT_CALLBACK,
+ batch_size=None,
+ maxdepth=None,
+ **kwargs,
+ ):
"""Copy file(s) from local.
Copies a specific file or tree of files (if recursive=True). If rpath
@@ -177,10 +539,69 @@ class AsyncFileSystem(AbstractFileSystem):
constructor, or for all instances by setting the "gather_batch_size" key
in ``fsspec.config.conf``, falling back to 1/8th of the system limit .
"""
- pass
+ if isinstance(lpath, list) and isinstance(rpath, list):
+ # No need to expand paths when both source and destination
+ # are provided as lists
+ rpaths = rpath
+ lpaths = lpath
+ else:
+ source_is_str = isinstance(lpath, str)
+ if source_is_str:
+ lpath = make_path_posix(lpath)
+ fs = LocalFileSystem()
+ lpaths = fs.expand_path(lpath, recursive=recursive, maxdepth=maxdepth)
+ if source_is_str and (not recursive or maxdepth is not None):
+ # Non-recursive glob does not copy directories
+ lpaths = [p for p in lpaths if not (trailing_sep(p) or fs.isdir(p))]
+ if not lpaths:
+ return
+
+ source_is_file = len(lpaths) == 1
+ dest_is_dir = isinstance(rpath, str) and (
+ trailing_sep(rpath) or await self._isdir(rpath)
+ )
+
+ rpath = self._strip_protocol(rpath)
+ exists = source_is_str and (
+ (has_magic(lpath) and source_is_file)
+ or (not has_magic(lpath) and dest_is_dir and not trailing_sep(lpath))
+ )
+ rpaths = other_paths(
+ lpaths,
+ rpath,
+ exists=exists,
+ flatten=not source_is_str,
+ )
+
+ is_dir = {l: os.path.isdir(l) for l in lpaths}
+ rdirs = [r for l, r in zip(lpaths, rpaths) if is_dir[l]]
+ file_pairs = [(l, r) for l, r in zip(lpaths, rpaths) if not is_dir[l]]
+
+ await asyncio.gather(*[self._makedirs(d, exist_ok=True) for d in rdirs])
+ batch_size = batch_size or self.batch_size
- async def _get(self, rpath, lpath, recursive=False, callback=
- DEFAULT_CALLBACK, maxdepth=None, **kwargs):
+ coros = []
+ callback.set_size(len(file_pairs))
+ for lfile, rfile in file_pairs:
+ put_file = callback.branch_coro(self._put_file)
+ coros.append(put_file(lfile, rfile, **kwargs))
+
+ return await _run_coros_in_chunks(
+ coros, batch_size=batch_size, callback=callback
+ )
+
+ async def _get_file(self, rpath, lpath, **kwargs):
+ raise NotImplementedError
+
+ async def _get(
+ self,
+ rpath,
+ lpath,
+ recursive=False,
+ callback=DEFAULT_CALLBACK,
+ maxdepth=None,
+ **kwargs,
+ ):
"""Copy file(s) to local.
Copies a specific file or tree of files (if recursive=True). If lpath
@@ -195,7 +616,298 @@ class AsyncFileSystem(AbstractFileSystem):
constructor, or for all instances by setting the "gather_batch_size" key
in ``fsspec.config.conf``, falling back to 1/8th of the system limit .
"""
- pass
+ if isinstance(lpath, list) and isinstance(rpath, list):
+ # No need to expand paths when both source and destination
+ # are provided as lists
+ rpaths = rpath
+ lpaths = lpath
+ else:
+ source_is_str = isinstance(rpath, str)
+ # First check for rpath trailing slash as _strip_protocol removes it.
+ source_not_trailing_sep = source_is_str and not trailing_sep(rpath)
+ rpath = self._strip_protocol(rpath)
+ rpaths = await self._expand_path(
+ rpath, recursive=recursive, maxdepth=maxdepth
+ )
+ if source_is_str and (not recursive or maxdepth is not None):
+ # Non-recursive glob does not copy directories
+ rpaths = [
+ p for p in rpaths if not (trailing_sep(p) or await self._isdir(p))
+ ]
+ if not rpaths:
+ return
+
+ lpath = make_path_posix(lpath)
+ source_is_file = len(rpaths) == 1
+ dest_is_dir = isinstance(lpath, str) and (
+ trailing_sep(lpath) or LocalFileSystem().isdir(lpath)
+ )
+
+ exists = source_is_str and (
+ (has_magic(rpath) and source_is_file)
+ or (not has_magic(rpath) and dest_is_dir and source_not_trailing_sep)
+ )
+ lpaths = other_paths(
+ rpaths,
+ lpath,
+ exists=exists,
+ flatten=not source_is_str,
+ )
+
+ [os.makedirs(os.path.dirname(lp), exist_ok=True) for lp in lpaths]
+ batch_size = kwargs.pop("batch_size", self.batch_size)
+
+ coros = []
+ callback.set_size(len(lpaths))
+ for lpath, rpath in zip(lpaths, rpaths):
+ get_file = callback.branch_coro(self._get_file)
+ coros.append(get_file(rpath, lpath, **kwargs))
+ return await _run_coros_in_chunks(
+ coros, batch_size=batch_size, callback=callback
+ )
+
+ async def _isfile(self, path):
+ try:
+ return (await self._info(path))["type"] == "file"
+ except: # noqa: E722
+ return False
+
+ async def _isdir(self, path):
+ try:
+ return (await self._info(path))["type"] == "directory"
+ except OSError:
+ return False
+
+ async def _size(self, path):
+ return (await self._info(path)).get("size", None)
+
+ async def _sizes(self, paths, batch_size=None):
+ batch_size = batch_size or self.batch_size
+ return await _run_coros_in_chunks(
+ [self._size(p) for p in paths], batch_size=batch_size
+ )
+
+ async def _exists(self, path, **kwargs):
+ try:
+ await self._info(path, **kwargs)
+ return True
+ except FileNotFoundError:
+ return False
+
+ async def _info(self, path, **kwargs):
+ raise NotImplementedError
+
+ async def _ls(self, path, detail=True, **kwargs):
+ raise NotImplementedError
+
+ async def _walk(self, path, maxdepth=None, on_error="omit", **kwargs):
+ if maxdepth is not None and maxdepth < 1:
+ raise ValueError("maxdepth must be at least 1")
+
+ path = self._strip_protocol(path)
+ full_dirs = {}
+ dirs = {}
+ files = {}
+
+ detail = kwargs.pop("detail", False)
+ try:
+ listing = await self._ls(path, detail=True, **kwargs)
+ except (FileNotFoundError, OSError) as e:
+ if on_error == "raise":
+ raise
+ elif callable(on_error):
+ on_error(e)
+ if detail:
+ yield path, {}, {}
+ else:
+ yield path, [], []
+ return
+
+ for info in listing:
+ # each info name must be at least [path]/part , but here
+ # we check also for names like [path]/part/
+ pathname = info["name"].rstrip("/")
+ name = pathname.rsplit("/", 1)[-1]
+ if info["type"] == "directory" and pathname != path:
+ # do not include "self" path
+ full_dirs[name] = pathname
+ dirs[name] = info
+ elif pathname == path:
+ # file-like with same name as give path
+ files[""] = info
+ else:
+ files[name] = info
+
+ if detail:
+ yield path, dirs, files
+ else:
+ yield path, list(dirs), list(files)
+
+ if maxdepth is not None:
+ maxdepth -= 1
+ if maxdepth < 1:
+ return
+
+ for d in dirs:
+ async for _ in self._walk(
+ full_dirs[d], maxdepth=maxdepth, detail=detail, **kwargs
+ ):
+ yield _
+
+ async def _glob(self, path, maxdepth=None, **kwargs):
+ if maxdepth is not None and maxdepth < 1:
+ raise ValueError("maxdepth must be at least 1")
+
+ import re
+
+ seps = (os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)
+ ends_with_sep = path.endswith(seps) # _strip_protocol strips trailing slash
+ path = self._strip_protocol(path)
+ append_slash_to_dirname = ends_with_sep or path.endswith(
+ tuple(sep + "**" for sep in seps)
+ )
+ idx_star = path.find("*") if path.find("*") >= 0 else len(path)
+ idx_qmark = path.find("?") if path.find("?") >= 0 else len(path)
+ idx_brace = path.find("[") if path.find("[") >= 0 else len(path)
+
+ min_idx = min(idx_star, idx_qmark, idx_brace)
+
+ detail = kwargs.pop("detail", False)
+
+ if not has_magic(path):
+ if await self._exists(path, **kwargs):
+ if not detail:
+ return [path]
+ else:
+ return {path: await self._info(path, **kwargs)}
+ else:
+ if not detail:
+ return [] # glob of non-existent returns empty
+ else:
+ return {}
+ elif "/" in path[:min_idx]:
+ min_idx = path[:min_idx].rindex("/")
+ root = path[: min_idx + 1]
+ depth = path[min_idx + 1 :].count("/") + 1
+ else:
+ root = ""
+ depth = path[min_idx + 1 :].count("/") + 1
+
+ if "**" in path:
+ if maxdepth is not None:
+ idx_double_stars = path.find("**")
+ depth_double_stars = path[idx_double_stars:].count("/") + 1
+ depth = depth - depth_double_stars + maxdepth
+ else:
+ depth = None
+
+ allpaths = await self._find(
+ root, maxdepth=depth, withdirs=True, detail=True, **kwargs
+ )
+
+ pattern = glob_translate(path + ("/" if ends_with_sep else ""))
+ pattern = re.compile(pattern)
+
+ out = {
+ p: info
+ for p, info in sorted(allpaths.items())
+ if pattern.match(
+ (
+ p + "/"
+ if append_slash_to_dirname and info["type"] == "directory"
+ else p
+ )
+ )
+ }
+
+ if detail:
+ return out
+ else:
+ return list(out)
+
+ async def _du(self, path, total=True, maxdepth=None, **kwargs):
+ sizes = {}
+ # async for?
+ for f in await self._find(path, maxdepth=maxdepth, **kwargs):
+ info = await self._info(f)
+ sizes[info["name"]] = info["size"]
+ if total:
+ return sum(sizes.values())
+ else:
+ return sizes
+
+ async def _find(self, path, maxdepth=None, withdirs=False, **kwargs):
+ path = self._strip_protocol(path)
+ out = {}
+ detail = kwargs.pop("detail", False)
+
+ # Add the root directory if withdirs is requested
+ # This is needed for posix glob compliance
+ if withdirs and path != "" and await self._isdir(path):
+ out[path] = await self._info(path)
+
+ # async for?
+ async for _, dirs, files in self._walk(path, maxdepth, detail=True, **kwargs):
+ if withdirs:
+ files.update(dirs)
+ out.update({info["name"]: info for name, info in files.items()})
+ if not out and (await self._isfile(path)):
+ # walk works on directories, but find should also return [path]
+ # when path happens to be a file
+ out[path] = {}
+ names = sorted(out)
+ if not detail:
+ return names
+ else:
+ return {name: out[name] for name in names}
+
+ async def _expand_path(self, path, recursive=False, maxdepth=None):
+ if maxdepth is not None and maxdepth < 1:
+ raise ValueError("maxdepth must be at least 1")
+
+ if isinstance(path, str):
+ out = await self._expand_path([path], recursive, maxdepth)
+ else:
+ out = set()
+ path = [self._strip_protocol(p) for p in path]
+ for p in path: # can gather here
+ if has_magic(p):
+ bit = set(await self._glob(p, maxdepth=maxdepth))
+ out |= bit
+ if recursive:
+ # glob call above expanded one depth so if maxdepth is defined
+ # then decrement it in expand_path call below. If it is zero
+ # after decrementing then avoid expand_path call.
+ if maxdepth is not None and maxdepth <= 1:
+ continue
+ out |= set(
+ await self._expand_path(
+ list(bit),
+ recursive=recursive,
+ maxdepth=maxdepth - 1 if maxdepth is not None else None,
+ )
+ )
+ continue
+ elif recursive:
+ rec = set(await self._find(p, maxdepth=maxdepth, withdirs=True))
+ out |= rec
+ if p not in out and (recursive is False or (await self._exists(p))):
+ # should only check once, for the root
+ out.add(p)
+ if not out:
+ raise FileNotFoundError(path)
+ return sorted(out)
+
+ async def _mkdir(self, path, create_parents=True, **kwargs):
+ pass # not necessary to implement, may not have directories
+
+ async def _makedirs(self, path, exist_ok=False):
+ pass # not necessary to implement, may not have directories
+
+ async def open_async(self, path, mode="rb", **kwargs):
+ if "b" not in mode or kwargs.get("compression"):
+ raise ValueError
+ raise NotImplementedError
def mirror_sync_methods(obj):
@@ -211,14 +923,65 @@ def mirror_sync_methods(obj):
AbstractFileSystem
- AsyncFileSystem: async-specific default coroutines
"""
- pass
+ from fsspec import AbstractFileSystem
+
+ for method in async_methods + dir(AsyncFileSystem):
+ if not method.startswith("_"):
+ continue
+ smethod = method[1:]
+ if private.match(method):
+ isco = inspect.iscoroutinefunction(getattr(obj, method, None))
+ unsync = getattr(getattr(obj, smethod, False), "__func__", None)
+ is_default = unsync is getattr(AbstractFileSystem, smethod, "")
+ if isco and is_default:
+ mth = sync_wrapper(getattr(obj, method), obj=obj)
+ setattr(obj, smethod, mth)
+ if not mth.__doc__:
+ mth.__doc__ = getattr(
+ getattr(AbstractFileSystem, smethod, None), "__doc__", ""
+ )
class FSSpecCoroutineCancel(Exception):
pass
+def _dump_running_tasks(
+ printout=True, cancel=True, exc=FSSpecCoroutineCancel, with_task=False
+):
+ import traceback
+
+ tasks = [t for t in asyncio.tasks.all_tasks(loop[0]) if not t.done()]
+ if printout:
+ [task.print_stack() for task in tasks]
+ out = [
+ {
+ "locals": task._coro.cr_frame.f_locals,
+ "file": task._coro.cr_frame.f_code.co_filename,
+ "firstline": task._coro.cr_frame.f_code.co_firstlineno,
+ "linelo": task._coro.cr_frame.f_lineno,
+ "stack": traceback.format_stack(task._coro.cr_frame),
+ "task": task if with_task else None,
+ }
+ for task in tasks
+ ]
+ if cancel:
+ for t in tasks:
+ cbs = t._callbacks
+ t.cancel()
+ asyncio.futures.Future.set_exception(t, exc)
+ asyncio.futures.Future.cancel(t)
+ [cb[0](t) for cb in cbs] # cancels any dependent concurrent.futures
+ try:
+ t._coro.throw(exc) # exits coro, unless explicitly handled
+ except exc:
+ pass
+ return out
+
+
class AbstractAsyncStreamedFile(AbstractBufferedFile):
+ # no read buffering, and always auto-commit
+ # TODO: readahead might still be useful here, but needs async version
async def read(self, length=-1):
"""
@@ -229,7 +992,19 @@ class AbstractAsyncStreamedFile(AbstractBufferedFile):
length: int (-1)
Number of bytes to read; if <0, all remaining bytes.
"""
- pass
+ length = -1 if length is None else int(length)
+ if self.mode != "rb":
+ raise ValueError("File not in read mode")
+ if length < 0:
+ length = self.size - self.loc
+ if self.closed:
+ raise ValueError("I/O operation on closed file.")
+ if length == 0:
+ # don't even bother calling fetch
+ return b""
+ out = await self._fetch_range(self.loc, self.loc + length)
+ self.loc += len(out)
+ return out
async def write(self, data):
"""
@@ -243,17 +1018,79 @@ class AbstractAsyncStreamedFile(AbstractBufferedFile):
data: bytes
Set of bytes to be written.
"""
- pass
+ if self.mode not in {"wb", "ab"}:
+ raise ValueError("File not in write mode")
+ if self.closed:
+ raise ValueError("I/O operation on closed file.")
+ if self.forced:
+ raise ValueError("This file has been force-flushed, can only close")
+ out = self.buffer.write(data)
+ self.loc += out
+ if self.buffer.tell() >= self.blocksize:
+ await self.flush()
+ return out
async def close(self):
"""Close file
Finalizes writes, discards cache
"""
- pass
+ if getattr(self, "_unclosable", False):
+ return
+ if self.closed:
+ return
+ if self.mode == "rb":
+ self.cache = None
+ else:
+ if not self.forced:
+ await self.flush(force=True)
+
+ if self.fs is not None:
+ self.fs.invalidate_cache(self.path)
+ self.fs.invalidate_cache(self.fs._parent(self.path))
+
+ self.closed = True
+
+ async def flush(self, force=False):
+ if self.closed:
+ raise ValueError("Flush on closed file")
+ if force and self.forced:
+ raise ValueError("Force flush cannot be called more than once")
+ if force:
+ self.forced = True
+
+ if self.mode not in {"wb", "ab"}:
+ # no-op to flush on read-mode
+ return
+
+ if not force and self.buffer.tell() < self.blocksize:
+ # Defer write on small block
+ return
+
+ if self.offset is None:
+ # Initialize a multipart upload
+ self.offset = 0
+ try:
+ await self._initiate_upload()
+ except: # noqa: E722
+ self.closed = True
+ raise
+
+ if await self._upload_chunk(final=force) is not False:
+ self.offset += self.buffer.seek(0, 2)
+ self.buffer = io.BytesIO()
async def __aenter__(self):
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
await self.close()
+
+ async def _fetch_range(self, start, end):
+ raise NotImplementedError
+
+ async def _initiate_upload(self):
+ pass
+
+ async def _upload_chunk(self, final=False):
+ raise NotImplementedError
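The restored `get_loop`/`sync` pair above is how fsspec drives coroutines from synchronous code, per the docstring example in `sync()`. A minimal sketch with an arbitrary coroutine standing in for a backend's `_`-prefixed method:

import asyncio

import fsspec.asyn


async def add(a, b):
    await asyncio.sleep(0)  # any awaitable work
    return a + b


# submit the coroutine to fsspec's dedicated IO loop thread and wait for it
result = fsspec.asyn.sync(fsspec.asyn.get_loop(), add, 1, 2, timeout=5)
print(result)  # 3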
diff --git a/fsspec/caching.py b/fsspec/caching.py
index c4fc674..a3f7a1c 100644
--- a/fsspec/caching.py
+++ b/fsspec/caching.py
@@ -1,4 +1,5 @@
from __future__ import annotations
+
import collections
import functools
import logging
@@ -7,16 +8,33 @@ import os
import threading
import warnings
from concurrent.futures import Future, ThreadPoolExecutor
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, Generic, NamedTuple, Optional, OrderedDict, TypeVar
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ Callable,
+ ClassVar,
+ Generic,
+ NamedTuple,
+ Optional,
+ OrderedDict,
+ TypeVar,
+)
+
if TYPE_CHECKING:
import mmap
+
from typing_extensions import ParamSpec
- P = ParamSpec('P')
+
+ P = ParamSpec("P")
else:
- P = TypeVar('P')
-T = TypeVar('T')
-logger = logging.getLogger('fsspec')
-Fetcher = Callable[[int, int], bytes]
+ P = TypeVar("P")
+
+T = TypeVar("T")
+
+
+logger = logging.getLogger("fsspec")
+
+Fetcher = Callable[[int, int], bytes] # Maps (start, end) to bytes
class BaseCache:
@@ -34,26 +52,48 @@ class BaseCache:
size: int
How big this file is
"""
- name: ClassVar[str] = 'none'
- def __init__(self, blocksize: int, fetcher: Fetcher, size: int) ->None:
+ name: ClassVar[str] = "none"
+
+ def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
self.blocksize = blocksize
self.nblocks = 0
self.fetcher = fetcher
self.size = size
self.hit_count = 0
self.miss_count = 0
+ # the bytes that we actually requested
self.total_requested_bytes = 0
- def _reset_stats(self) ->None:
+ def _fetch(self, start: int | None, stop: int | None) -> bytes:
+ if start is None:
+ start = 0
+ if stop is None:
+ stop = self.size
+ if start >= self.size or start >= stop:
+ return b""
+ return self.fetcher(start, stop)
+
+ def _reset_stats(self) -> None:
"""Reset hit and miss counts for a more ganular report e.g. by file."""
- pass
+ self.hit_count = 0
+ self.miss_count = 0
+ self.total_requested_bytes = 0
- def _log_stats(self) ->str:
+ def _log_stats(self) -> str:
"""Return a formatted string of the cache statistics."""
- pass
-
- def __repr__(self) ->str:
+ if self.hit_count == 0 and self.miss_count == 0:
+ # a cache that does nothing, this is for logs only
+ return ""
+ return " , %s: %d hits, %d misses, %d total requested bytes" % (
+ self.name,
+ self.hit_count,
+ self.miss_count,
+ self.total_requested_bytes,
+ )
+
+ def __repr__(self) -> str:
+ # TODO: use rich for better formatting
return f"""
<{self.__class__.__name__}:
block size : {self.blocksize}
@@ -73,21 +113,80 @@ class MMapCache(BaseCache):
This cache method might only work on posix
"""
- name = 'mmap'
- def __init__(self, blocksize: int, fetcher: Fetcher, size: int,
- location: (str | None)=None, blocks: (set[int] | None)=None) ->None:
+ name = "mmap"
+
+ def __init__(
+ self,
+ blocksize: int,
+ fetcher: Fetcher,
+ size: int,
+ location: str | None = None,
+ blocks: set[int] | None = None,
+ ) -> None:
super().__init__(blocksize, fetcher, size)
self.blocks = set() if blocks is None else blocks
self.location = location
self.cache = self._makefile()
- def __getstate__(self) ->dict[str, Any]:
+ def _makefile(self) -> mmap.mmap | bytearray:
+ import mmap
+ import tempfile
+
+ if self.size == 0:
+ return bytearray()
+
+ # posix version
+ if self.location is None or not os.path.exists(self.location):
+ if self.location is None:
+ fd = tempfile.TemporaryFile()
+ self.blocks = set()
+ else:
+ fd = open(self.location, "wb+")
+ fd.seek(self.size - 1)
+ fd.write(b"1")
+ fd.flush()
+ else:
+ fd = open(self.location, "r+b")
+
+ return mmap.mmap(fd.fileno(), self.size)
+
+ def _fetch(self, start: int | None, end: int | None) -> bytes:
+ logger.debug(f"MMap cache fetching {start}-{end}")
+ if start is None:
+ start = 0
+ if end is None:
+ end = self.size
+ if start >= self.size or start >= end:
+ return b""
+ start_block = start // self.blocksize
+ end_block = end // self.blocksize
+ need = [i for i in range(start_block, end_block + 1) if i not in self.blocks]
+ hits = [i for i in range(start_block, end_block + 1) if i in self.blocks]
+ self.miss_count += len(need)
+ self.hit_count += len(hits)
+ while need:
+ # TODO: not a for loop so we can consolidate blocks later to
+ # make fewer fetch calls; this could be parallel
+ i = need.pop(0)
+
+ sstart = i * self.blocksize
+ send = min(sstart + self.blocksize, self.size)
+ self.total_requested_bytes += send - sstart
+ logger.debug(f"MMap get block #{i} ({sstart}-{send})")
+ self.cache[sstart:send] = self.fetcher(sstart, send)
+ self.blocks.add(i)
+
+ return self.cache[start:end]
+
+ def __getstate__(self) -> dict[str, Any]:
state = self.__dict__.copy()
- del state['cache']
+ # Remove the unpicklable entries.
+ del state["cache"]
return state
- def __setstate__(self, state: dict[str, Any]) ->None:
+ def __setstate__(self, state: dict[str, Any]) -> None:
+ # Restore instance attributes
self.__dict__.update(state)
self.cache = self._makefile()
@@ -99,14 +198,44 @@ class ReadAheadCache(BaseCache):
fill holes in the cache or keep fragments alive. It is best suited to
many small reads in a sequential order (e.g., reading lines from a file).
"""
- name = 'readahead'
- def __init__(self, blocksize: int, fetcher: Fetcher, size: int) ->None:
+ name = "readahead"
+
+ def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
super().__init__(blocksize, fetcher, size)
- self.cache = b''
+ self.cache = b""
self.start = 0
self.end = 0
+ def _fetch(self, start: int | None, end: int | None) -> bytes:
+ if start is None:
+ start = 0
+ if end is None or end > self.size:
+ end = self.size
+ if start >= self.size or start >= end:
+ return b""
+ l = end - start
+ if start >= self.start and end <= self.end:
+ # cache hit
+ self.hit_count += 1
+ return self.cache[start - self.start : end - self.start]
+ elif self.start <= start < self.end:
+ # partial hit
+ self.miss_count += 1
+ part = self.cache[start - self.start :]
+ l -= len(part)
+ start = self.end
+ else:
+ # miss
+ self.miss_count += 1
+ part = b""
+ end = min(self.size, end + self.blocksize)
+ self.total_requested_bytes += end - start
+ self.cache = self.fetcher(start, end) # new block replaces old
+ self.start = start
+ self.end = self.start + len(self.cache)
+ return part + self.cache[:l]
+
class FirstChunkCache(BaseCache):
"""Caches the first block of a file only
@@ -114,14 +243,45 @@ class FirstChunkCache(BaseCache):
This may be useful for file types where the metadata is stored in the header,
but is randomly accessed.
"""
- name = 'first'
- def __init__(self, blocksize: int, fetcher: Fetcher, size: int) ->None:
+ name = "first"
+
+ def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
if blocksize > size:
+ # this will buffer the whole thing
blocksize = size
super().__init__(blocksize, fetcher, size)
self.cache: bytes | None = None
+ def _fetch(self, start: int | None, end: int | None) -> bytes:
+ start = start or 0
+ if start > self.size:
+ logger.debug("FirstChunkCache: requested start > file size")
+ return b""
+
+ end = min(end, self.size)
+
+ if start < self.blocksize:
+ if self.cache is None:
+ self.miss_count += 1
+ if end > self.blocksize:
+ self.total_requested_bytes += end
+ data = self.fetcher(0, end)
+ self.cache = data[: self.blocksize]
+ return data[start:]
+ self.cache = self.fetcher(0, self.blocksize)
+ self.total_requested_bytes += self.blocksize
+ part = self.cache[start:end]
+ if end > self.blocksize:
+ self.total_requested_bytes += end - self.blocksize
+ part += self.fetcher(self.blocksize, end)
+ self.hit_count += 1
+ return part
+ else:
+ self.miss_count += 1
+ self.total_requested_bytes += end - start
+ return self.fetcher(start, end)
+
class BlockCache(BaseCache):
"""
@@ -145,15 +305,16 @@ class BlockCache(BaseCache):
The maximum number of blocks to cache for. The maximum memory
use for this cache is then ``blocksize * maxblocks``.
"""
- name = 'blockcache'
- def __init__(self, blocksize: int, fetcher: Fetcher, size: int,
- maxblocks: int=32) ->None:
+ name = "blockcache"
+
+ def __init__(
+ self, blocksize: int, fetcher: Fetcher, size: int, maxblocks: int = 32
+ ) -> None:
super().__init__(blocksize, fetcher, size)
self.nblocks = math.ceil(size / blocksize)
self.maxblocks = maxblocks
- self._fetch_block_cached = functools.lru_cache(maxblocks)(self.
- _fetch_block)
+ self._fetch_block_cached = functools.lru_cache(maxblocks)(self._fetch_block)
def cache_info(self):
"""
@@ -164,26 +325,63 @@ class BlockCache(BaseCache):
NamedTuple
Returned directly from the LRU Cache used internally.
"""
- pass
+ return self._fetch_block_cached.cache_info()
- def __getstate__(self) ->dict[str, Any]:
+ def __getstate__(self) -> dict[str, Any]:
state = self.__dict__
- del state['_fetch_block_cached']
+ del state["_fetch_block_cached"]
return state
- def __setstate__(self, state: dict[str, Any]) ->None:
+ def __setstate__(self, state: dict[str, Any]) -> None:
self.__dict__.update(state)
- self._fetch_block_cached = functools.lru_cache(state['maxblocks'])(self
- ._fetch_block)
-
- def _fetch_block(self, block_number: int) ->bytes:
+ self._fetch_block_cached = functools.lru_cache(state["maxblocks"])(
+ self._fetch_block
+ )
+
+ def _fetch(self, start: int | None, end: int | None) -> bytes:
+ if start is None:
+ start = 0
+ if end is None:
+ end = self.size
+ if start >= self.size or start >= end:
+ return b""
+
+ # byte position -> block numbers
+ start_block_number = start // self.blocksize
+ end_block_number = end // self.blocksize
+
+ # these are cached, so safe to do multiple calls for the same start and end.
+ for block_number in range(start_block_number, end_block_number + 1):
+ self._fetch_block_cached(block_number)
+
+ return self._read_cache(
+ start,
+ end,
+ start_block_number=start_block_number,
+ end_block_number=end_block_number,
+ )
+
+ def _fetch_block(self, block_number: int) -> bytes:
"""
Fetch the block of data for `block_number`.
"""
- pass
-
- def _read_cache(self, start: int, end: int, start_block_number: int,
- end_block_number: int) ->bytes:
+ if block_number > self.nblocks:
+ raise ValueError(
+ f"'block_number={block_number}' is greater than "
+ f"the number of blocks ({self.nblocks})"
+ )
+
+ start = block_number * self.blocksize
+ end = start + self.blocksize
+ self.total_requested_bytes += end - start
+ self.miss_count += 1
+ logger.info("BlockCache fetching block %d", block_number)
+ block_contents = super()._fetch(start, end)
+ return block_contents
+
+ def _read_cache(
+ self, start: int, end: int, start_block_number: int, end_block_number: int
+ ) -> bytes:
"""
Read from our block cache.
@@ -194,7 +392,32 @@ class BlockCache(BaseCache):
start_block_number, end_block_number : int
The start and end block numbers.
"""
- pass
+ start_pos = start % self.blocksize
+ end_pos = end % self.blocksize
+
+ self.hit_count += 1
+ if start_block_number == end_block_number:
+ block: bytes = self._fetch_block_cached(start_block_number)
+ return block[start_pos:end_pos]
+
+ else:
+ # read from the initial
+ out = [self._fetch_block_cached(start_block_number)[start_pos:]]
+
+ # intermediate blocks
+ # Note: it'd be nice to combine these into one big request. However
+ # that doesn't play nicely with our LRU cache.
+ out.extend(
+ map(
+ self._fetch_block_cached,
+ range(start_block_number + 1, end_block_number),
+ )
+ )
+
+ # final block
+ out.append(self._fetch_block_cached(end_block_number)[:end_pos])
+
+ return b"".join(out)
class BytesCache(BaseCache):
@@ -209,33 +432,118 @@ class BytesCache(BaseCache):
As we read more data, whether to discard the start of the buffer when
we are more than a blocksize ahead of it.
"""
- name: ClassVar[str] = 'bytes'
- def __init__(self, blocksize: int, fetcher: Fetcher, size: int, trim:
- bool=True) ->None:
+ name: ClassVar[str] = "bytes"
+
+ def __init__(
+ self, blocksize: int, fetcher: Fetcher, size: int, trim: bool = True
+ ) -> None:
super().__init__(blocksize, fetcher, size)
- self.cache = b''
+ self.cache = b""
self.start: int | None = None
self.end: int | None = None
self.trim = trim
- def __len__(self) ->int:
+ def _fetch(self, start: int | None, end: int | None) -> bytes:
+ # TODO: only set start/end after fetch, in case it fails?
+ # is this where retry logic might go?
+ if start is None:
+ start = 0
+ if end is None:
+ end = self.size
+ if start >= self.size or start >= end:
+ return b""
+ if (
+ self.start is not None
+ and start >= self.start
+ and self.end is not None
+ and end < self.end
+ ):
+ # cache hit: we have all the required data
+ offset = start - self.start
+ self.hit_count += 1
+ return self.cache[offset : offset + end - start]
+
+ if self.blocksize:
+ bend = min(self.size, end + self.blocksize)
+ else:
+ bend = end
+
+ if bend == start or start > self.size:
+ return b""
+
+ if (self.start is None or start < self.start) and (
+ self.end is None or end > self.end
+ ):
+ # First read, or extending both before and after
+ self.total_requested_bytes += bend - start
+ self.miss_count += 1
+ self.cache = self.fetcher(start, bend)
+ self.start = start
+ else:
+ assert self.start is not None
+ assert self.end is not None
+ self.miss_count += 1
+
+ if start < self.start:
+ if self.end is None or self.end - end > self.blocksize:
+ self.total_requested_bytes += bend - start
+ self.cache = self.fetcher(start, bend)
+ self.start = start
+ else:
+ self.total_requested_bytes += self.start - start
+ new = self.fetcher(start, self.start)
+ self.start = start
+ self.cache = new + self.cache
+ elif self.end is not None and bend > self.end:
+ if self.end > self.size:
+ pass
+ elif end - self.end > self.blocksize:
+ self.total_requested_bytes += bend - start
+ self.cache = self.fetcher(start, bend)
+ self.start = start
+ else:
+ self.total_requested_bytes += bend - self.end
+ new = self.fetcher(self.end, bend)
+ self.cache = self.cache + new
+
+ self.end = self.start + len(self.cache)
+ offset = start - self.start
+ out = self.cache[offset : offset + end - start]
+ if self.trim:
+ num = (self.end - self.start) // (self.blocksize + 1)
+ if num > 1:
+ self.start += self.blocksize * num
+ self.cache = self.cache[self.blocksize * num :]
+ return out
+
+ def __len__(self) -> int:
return len(self.cache)
class AllBytes(BaseCache):
"""Cache entire contents of the file"""
- name: ClassVar[str] = 'all'
- def __init__(self, blocksize: (int | None)=None, fetcher: (Fetcher |
- None)=None, size: (int | None)=None, data: (bytes | None)=None) ->None:
- super().__init__(blocksize, fetcher, size)
+ name: ClassVar[str] = "all"
+
+ def __init__(
+ self,
+ blocksize: int | None = None,
+ fetcher: Fetcher | None = None,
+ size: int | None = None,
+ data: bytes | None = None,
+ ) -> None:
+ super().__init__(blocksize, fetcher, size) # type: ignore[arg-type]
if data is None:
self.miss_count += 1
self.total_requested_bytes += self.size
data = self.fetcher(0, self.size)
self.data = data
+ def _fetch(self, start: int | None, stop: int | None) -> bytes:
+ self.hit_count += 1
+ return self.data[start:stop]
+
class KnownPartsOfAFile(BaseCache):
"""
@@ -259,13 +567,22 @@ class KnownPartsOfAFile(BaseCache):
padded. Note that zero padding will not be used for reads that
begin outside a known byte-range.
"""
- name: ClassVar[str] = 'parts'
- def __init__(self, blocksize: int, fetcher: Fetcher, size: int, data:
- Optional[dict[tuple[int, int], bytes]]=None, strict: bool=True, **_:
- Any):
+ name: ClassVar[str] = "parts"
+
+ def __init__(
+ self,
+ blocksize: int,
+ fetcher: Fetcher,
+ size: int,
+ data: Optional[dict[tuple[int, int], bytes]] = None,
+ strict: bool = True,
+ **_: Any,
+ ):
super().__init__(blocksize, fetcher, size)
self.strict = strict
+
+ # simple consolidation of contiguous blocks
if data:
old_offsets = sorted(data.keys())
offsets = [old_offsets[0]]
@@ -273,15 +590,61 @@ class KnownPartsOfAFile(BaseCache):
for start, stop in old_offsets[1:]:
start0, stop0 = offsets[-1]
if start == stop0:
- offsets[-1] = start0, stop
+ offsets[-1] = (start0, stop)
blocks[-1] += data.pop((start, stop))
else:
offsets.append((start, stop))
blocks.append(data.pop((start, stop)))
+
self.data = dict(zip(offsets, blocks))
else:
self.data = {}
+ def _fetch(self, start: int | None, stop: int | None) -> bytes:
+ if start is None:
+ start = 0
+ if stop is None:
+ stop = self.size
+
+ out = b""
+ for (loc0, loc1), data in self.data.items():
+ # If self.strict=False, use zero-padded data
+ # for reads beyond the end of a "known" buffer
+ if loc0 <= start < loc1:
+ off = start - loc0
+ out = data[off : off + stop - start]
+ if not self.strict or loc0 <= stop <= loc1:
+ # The request is within a known range, or
+ # it begins within a known range, and we
+ # are allowed to pad reads beyond the
+ # buffer with zero
+ out += b"\x00" * (stop - start - len(out))
+ self.hit_count += 1
+ return out
+ else:
+ # The request ends outside a known range,
+ # and we are being "strict" about reads
+ # beyond the buffer
+ start = loc1
+ break
+
+ # We only get here if there is a request outside the
+ # known parts of the file. In an ideal world, this
+ # should never happen
+ if self.fetcher is None:
+ # We cannot fetch the data, so raise an error
+ raise ValueError(f"Read is outside the known file parts: {(start, stop)}. ")
+ # We can fetch the data, but should warn the user
+ # that this may be slow
+ warnings.warn(
+ f"Read is outside the known file parts: {(start, stop)}. "
+ f"IO/caching performance may be poor!"
+ )
+ logger.debug(f"KnownPartsOfAFile cache fetching {start}-{stop}")
+ self.total_requested_bytes += stop - start
+ self.miss_count += 1
+ return out + super()._fetch(start, stop)
+
class UpdatableLRU(Generic[P, T]):
"""
@@ -290,14 +653,13 @@ class UpdatableLRU(Generic[P, T]):
Used by BackgroudBlockCache
"""
-
class CacheInfo(NamedTuple):
hits: int
misses: int
maxsize: int
currsize: int
- def __init__(self, func: Callable[P, T], max_size: int=128) ->None:
+ def __init__(self, func: Callable[P, T], max_size: int = 128) -> None:
self._cache: OrderedDict[Any, T] = collections.OrderedDict()
self._func = func
self._max_size = max_size
@@ -305,22 +667,44 @@ class UpdatableLRU(Generic[P, T]):
self._misses = 0
self._lock = threading.Lock()
- def __call__(self, *args: P.args, **kwargs: P.kwargs) ->T:
+ def __call__(self, *args: P.args, **kwargs: P.kwargs) -> T:
if kwargs:
- raise TypeError(f'Got unexpected keyword argument {kwargs.keys()}')
+ raise TypeError(f"Got unexpected keyword argument {kwargs.keys()}")
with self._lock:
if args in self._cache:
self._cache.move_to_end(args)
self._hits += 1
return self._cache[args]
+
result = self._func(*args, **kwargs)
+
with self._lock:
self._cache[args] = result
self._misses += 1
if len(self._cache) > self._max_size:
self._cache.popitem(last=False)
+
return result
+ def is_key_cached(self, *args: Any) -> bool:
+ with self._lock:
+ return args in self._cache
+
+ def add_key(self, result: T, *args: Any) -> None:
+ with self._lock:
+ self._cache[args] = result
+ if len(self._cache) > self._max_size:
+ self._cache.popitem(last=False)
+
+ def cache_info(self) -> UpdatableLRU.CacheInfo:
+ with self._lock:
+ return self.CacheInfo(
+ maxsize=self._max_size,
+ currsize=len(self._cache),
+ hits=self._hits,
+ misses=self._misses,
+ )
+
class BackgroundBlockCache(BaseCache):
"""
@@ -347,20 +731,23 @@ class BackgroundBlockCache(BaseCache):
The maximum number of blocks to cache for. The maximum memory
use for this cache is then ``blocksize * maxblocks``.
"""
- name: ClassVar[str] = 'background'
- def __init__(self, blocksize: int, fetcher: Fetcher, size: int,
- maxblocks: int=32) ->None:
+ name: ClassVar[str] = "background"
+
+ def __init__(
+ self, blocksize: int, fetcher: Fetcher, size: int, maxblocks: int = 32
+ ) -> None:
super().__init__(blocksize, fetcher, size)
self.nblocks = math.ceil(size / blocksize)
self.maxblocks = maxblocks
self._fetch_block_cached = UpdatableLRU(self._fetch_block, maxblocks)
+
self._thread_executor = ThreadPoolExecutor(max_workers=1)
self._fetch_future_block_number: int | None = None
self._fetch_future: Future[bytes] | None = None
self._fetch_future_lock = threading.Lock()
- def cache_info(self) ->UpdatableLRU.CacheInfo:
+ def cache_info(self) -> UpdatableLRU.CacheInfo:
"""
The statistics on the block cache.
@@ -369,34 +756,122 @@ class BackgroundBlockCache(BaseCache):
NamedTuple
Returned directly from the LRU Cache used internally.
"""
- pass
+ return self._fetch_block_cached.cache_info()
- def __getstate__(self) ->dict[str, Any]:
+ def __getstate__(self) -> dict[str, Any]:
state = self.__dict__
- del state['_fetch_block_cached']
- del state['_thread_executor']
- del state['_fetch_future_block_number']
- del state['_fetch_future']
- del state['_fetch_future_lock']
+ del state["_fetch_block_cached"]
+ del state["_thread_executor"]
+ del state["_fetch_future_block_number"]
+ del state["_fetch_future"]
+ del state["_fetch_future_lock"]
return state
- def __setstate__(self, state) ->None:
+ def __setstate__(self, state) -> None:
self.__dict__.update(state)
- self._fetch_block_cached = UpdatableLRU(self._fetch_block, state[
- 'maxblocks'])
+ self._fetch_block_cached = UpdatableLRU(self._fetch_block, state["maxblocks"])
self._thread_executor = ThreadPoolExecutor(max_workers=1)
self._fetch_future_block_number = None
self._fetch_future = None
self._fetch_future_lock = threading.Lock()
- def _fetch_block(self, block_number: int, log_info: str='sync') ->bytes:
+ def _fetch(self, start: int | None, end: int | None) -> bytes:
+ if start is None:
+ start = 0
+ if end is None:
+ end = self.size
+ if start >= self.size or start >= end:
+ return b""
+
+ # byte position -> block numbers
+ start_block_number = start // self.blocksize
+ end_block_number = end // self.blocksize
+
+ fetch_future_block_number = None
+ fetch_future = None
+ with self._fetch_future_lock:
+ # Background thread is running. Check we we can or must join it.
+ if self._fetch_future is not None:
+ assert self._fetch_future_block_number is not None
+ if self._fetch_future.done():
+ logger.info("BlockCache joined background fetch without waiting.")
+ self._fetch_block_cached.add_key(
+ self._fetch_future.result(), self._fetch_future_block_number
+ )
+ # Cleanup the fetch variables. Done with fetching the block.
+ self._fetch_future_block_number = None
+ self._fetch_future = None
+ else:
+ # Must join if we need the block for the current fetch
+ must_join = bool(
+ start_block_number
+ <= self._fetch_future_block_number
+ <= end_block_number
+ )
+ if must_join:
+ # Copy to the local variables to release lock
+ # before waiting for result
+ fetch_future_block_number = self._fetch_future_block_number
+ fetch_future = self._fetch_future
+
+ # Cleanup the fetch variables. Have a local copy.
+ self._fetch_future_block_number = None
+ self._fetch_future = None
+
+ # Need to wait for the future for the current read
+ if fetch_future is not None:
+ logger.info("BlockCache waiting for background fetch.")
+ # Wait until result and put it in cache
+ self._fetch_block_cached.add_key(
+ fetch_future.result(), fetch_future_block_number
+ )
+
+ # these are cached, so safe to do multiple calls for the same start and end.
+ for block_number in range(start_block_number, end_block_number + 1):
+ self._fetch_block_cached(block_number)
+
+ # fetch next block in the background if nothing is running in the background,
+ # the block is within file and it is not already cached
+ end_block_plus_1 = end_block_number + 1
+ with self._fetch_future_lock:
+ if (
+ self._fetch_future is None
+ and end_block_plus_1 <= self.nblocks
+ and not self._fetch_block_cached.is_key_cached(end_block_plus_1)
+ ):
+ self._fetch_future_block_number = end_block_plus_1
+ self._fetch_future = self._thread_executor.submit(
+ self._fetch_block, end_block_plus_1, "async"
+ )
+
+ return self._read_cache(
+ start,
+ end,
+ start_block_number=start_block_number,
+ end_block_number=end_block_number,
+ )
+
+ def _fetch_block(self, block_number: int, log_info: str = "sync") -> bytes:
"""
Fetch the block of data for `block_number`.
"""
- pass
-
- def _read_cache(self, start: int, end: int, start_block_number: int,
- end_block_number: int) ->bytes:
+ if block_number > self.nblocks:
+ raise ValueError(
+ f"'block_number={block_number}' is greater than "
+ f"the number of blocks ({self.nblocks})"
+ )
+
+ start = block_number * self.blocksize
+ end = start + self.blocksize
+ logger.info("BlockCache fetching block (%s) %d", log_info, block_number)
+ self.total_requested_bytes += end - start
+ self.miss_count += 1
+ block_contents = super()._fetch(start, end)
+ return block_contents
+
+ def _read_cache(
+ self, start: int, end: int, start_block_number: int, end_block_number: int
+ ) -> bytes:
"""
Read from our block cache.
@@ -407,13 +882,43 @@ class BackgroundBlockCache(BaseCache):
start_block_number, end_block_number : int
The start and end block numbers.
"""
- pass
+ start_pos = start % self.blocksize
+ end_pos = end % self.blocksize
+ # kind of pointless to count this as a hit, but it is
+ self.hit_count += 1
-caches: dict[str | None, type[BaseCache]] = {None: BaseCache}
+ if start_block_number == end_block_number:
+ block = self._fetch_block_cached(start_block_number)
+ return block[start_pos:end_pos]
+
+ else:
+ # read from the initial
+ out = [self._fetch_block_cached(start_block_number)[start_pos:]]
+
+ # intermediate blocks
+ # Note: it'd be nice to combine these into one big request. However
+ # that doesn't play nicely with our LRU cache.
+ out.extend(
+ map(
+ self._fetch_block_cached,
+ range(start_block_number + 1, end_block_number),
+ )
+ )
+
+ # final block
+ out.append(self._fetch_block_cached(end_block_number)[:end_pos])
+
+ return b"".join(out)
-def register_cache(cls: type[BaseCache], clobber: bool=False) ->None:
+caches: dict[str | None, type[BaseCache]] = {
+ # one custom case
+ None: BaseCache,
+}
+
+
+def register_cache(cls: type[BaseCache], clobber: bool = False) -> None:
"""'Register' cache implementation.
Parameters
@@ -426,9 +931,21 @@ def register_cache(cls: type[BaseCache], clobber: bool=False) ->None:
------
ValueError
"""
- pass
-
-
-for c in (BaseCache, MMapCache, BytesCache, ReadAheadCache, BlockCache,
- FirstChunkCache, AllBytes, KnownPartsOfAFile, BackgroundBlockCache):
+ name = cls.name
+ if not clobber and name in caches:
+ raise ValueError(f"Cache with name {name!r} is already known: {caches[name]}")
+ caches[name] = cls
+
+
+for c in (
+ BaseCache,
+ MMapCache,
+ BytesCache,
+ ReadAheadCache,
+ BlockCache,
+ FirstChunkCache,
+ AllBytes,
+ KnownPartsOfAFile,
+ BackgroundBlockCache,
+):
register_cache(c)
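
The caching patch ends by filling the `caches` registry through `register_cache`. As a rough, self-contained sketch of how a third-party cache could plug into that registry (the `WholeFileCache` class, the `"wholefile-demo"` name, and the byte data are made up for illustration):

```python
from fsspec.caching import BaseCache, caches, register_cache

data = bytes(range(256)) * 16  # 4 KiB of pretend remote bytes

def fetcher(start, end):
    # stands in for a ranged network request
    return data[start:end]

class WholeFileCache(BaseCache):
    """Toy cache: fetch everything once, then serve slices from memory."""

    name = "wholefile-demo"  # illustrative name, not part of fsspec

    def _fetch(self, start, end):
        if not hasattr(self, "_blob"):
            self._blob = self.fetcher(0, self.size)
        return self._blob[start:end]

register_cache(WholeFileCache)

# instances are built as (blocksize, fetcher, size), like the built-in caches
cache = caches["wholefile-demo"](blocksize=512, fetcher=fetcher, size=len(data))
assert cache._fetch(10, 20) == data[10:20]
```
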
diff --git a/fsspec/callbacks.py b/fsspec/callbacks.py
index fd7312d..7ca99ca 100644
--- a/fsspec/callbacks.py
+++ b/fsspec/callbacks.py
@@ -36,7 +36,6 @@ class Callback:
def close(self):
"""Close callback."""
- pass
def branched(self, path_1, path_2, **kwargs):
"""
@@ -67,13 +66,21 @@ class Callback:
callback: Callback
A callback instance to be passed to the child method
"""
- pass
+ self.branch(path_1, path_2, kwargs)
+ # mutate kwargs so that we can force the caller to pass "callback=" explicitly
+ return kwargs.pop("callback", DEFAULT_CALLBACK)
def branch_coro(self, fn):
"""
Wraps a coroutine, and pass a new child callback to it.
"""
- pass
+
+ @wraps(fn)
+ async def func(path1, path2: str, **kwargs):
+ with self.branched(path1, path2, **kwargs) as child:
+ return await fn(path1, path2, callback=child, **kwargs)
+
+ return func
def set_size(self, size):
"""
@@ -86,7 +93,8 @@ class Callback:
----------
size: int
"""
- pass
+ self.size = size
+ self.call()
def absolute_update(self, value):
"""
@@ -98,7 +106,8 @@ class Callback:
----------
value: int
"""
- pass
+ self.value = value
+ self.call()
def relative_update(self, inc=1):
"""
@@ -110,7 +119,8 @@ class Callback:
----------
inc: int
"""
- pass
+ self.value += inc
+ self.call()
def call(self, hook_name=None, **kwargs):
"""
@@ -124,7 +134,16 @@ class Callback:
If given, execute on this hook
kwargs: passed on to (all) hook(s)
"""
- pass
+ if not self.hooks:
+ return
+ kw = self.kw.copy()
+ kw.update(kwargs)
+ if hook_name:
+ if hook_name not in self.hooks:
+ return
+ return self.hooks[hook_name](self.size, self.value, **kw)
+ for hook in self.hooks.values() or []:
+ hook(self.size, self.value, **kw)
def wrap(self, iterable):
"""
@@ -135,7 +154,9 @@ class Callback:
iterable: Iterable
The iterable that is being wrapped
"""
- pass
+ for item in iterable:
+ self.relative_update()
+ yield item
def branch(self, path_1, path_2, kwargs):
"""
@@ -159,6 +180,9 @@ class Callback:
-------
"""
+ return None
+
+ def no_op(self, *_, **__):
pass
def __getattr__(self, item):
@@ -175,7 +199,9 @@ class Callback:
``NoOpCallback``. This is an alternative to including
``callback=DEFAULT_CALLBACK`` directly in a method signature.
"""
- pass
+ if maybe_callback is None:
+ return DEFAULT_CALLBACK
+ return maybe_callback
class NoOpCallback(Callback):
@@ -183,6 +209,9 @@ class NoOpCallback(Callback):
This implementation of Callback does exactly nothing
"""
+ def call(self, *args, **kwargs):
+ return None
+
class DotPrinterCallback(Callback):
"""
@@ -192,17 +221,17 @@ class DotPrinterCallback(Callback):
demonstrate how the outer layer may print "#" and the inner layer "."
"""
- def __init__(self, chr_to_print='#', **kwargs):
+ def __init__(self, chr_to_print="#", **kwargs):
self.chr = chr_to_print
super().__init__(**kwargs)
def branch(self, path_1, path_2, kwargs):
"""Mutate kwargs to add new instance with different print char"""
- pass
+ kwargs["callback"] = DotPrinterCallback(".")
def call(self, **kwargs):
"""Just outputs a character"""
- pass
+ print(self.chr, end="")
class TqdmCallback(Callback):
@@ -266,14 +295,28 @@ class TqdmCallback(Callback):
def __init__(self, tqdm_kwargs=None, *args, **kwargs):
try:
from tqdm import tqdm
+
except ImportError as exce:
raise ImportError(
- 'Using TqdmCallback requires tqdm to be installed') from exce
- self._tqdm_cls = kwargs.pop('tqdm_cls', tqdm)
+ "Using TqdmCallback requires tqdm to be installed"
+ ) from exce
+
+ self._tqdm_cls = kwargs.pop("tqdm_cls", tqdm)
self._tqdm_kwargs = tqdm_kwargs or {}
self.tqdm = None
super().__init__(*args, **kwargs)
+ def call(self, *args, **kwargs):
+ if self.tqdm is None:
+ self.tqdm = self._tqdm_cls(total=self.size, **self._tqdm_kwargs)
+ self.tqdm.total = self.size
+ self.tqdm.update(self.value - self.tqdm.n)
+
+ def close(self):
+ if self.tqdm is not None:
+ self.tqdm.close()
+ self.tqdm = None
+
def __del__(self):
return self.close()
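
The callback methods restored above (`set_size`, `relative_update`, `call`) are what filesystem transfer methods drive. A minimal sketch of a custom progress reporter, assuming the in-memory filesystem and a made-up `PercentCallback` class; the local output filename is arbitrary:

```python
import fsspec
from fsspec.callbacks import Callback

class PercentCallback(Callback):
    """Illustrative callback printing transfer progress as a percentage."""

    def call(self, **kwargs):
        if self.size:
            print(f"\r{100 * self.value / self.size:5.1f}%", end="")

fs = fsspec.filesystem("memory")
fs.pipe_file("/src.bin", b"x" * (4 * 2**20))

# AbstractFileSystem.get_file reports progress through the callback hooks
fs.get_file("/src.bin", "copy-of-src.bin", callback=PercentCallback())
print()
```
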
diff --git a/fsspec/compression.py b/fsspec/compression.py
index 9562369..fc519c2 100644
--- a/fsspec/compression.py
+++ b/fsspec/compression.py
@@ -1,7 +1,17 @@
"""Helper functions for a standard streaming compression API"""
+
from zipfile import ZipFile
+
import fsspec.utils
from fsspec.spec import AbstractBufferedFile
+
+
+def noop_file(file, mode, **kwargs):
+ return file
+
+
+# TODO: files should also be available as contexts
+# should be functions of the form func(infile, mode=, **kwargs) -> file-like
compr = {None: noop_file}
@@ -25,72 +35,141 @@ def register_compression(name, callback, extensions, force=False):
ValueError: If name or extensions already registered, and not force.
"""
- pass
+ if isinstance(extensions, str):
+ extensions = [extensions]
+
+ # Validate registration
+ if name in compr and not force:
+ raise ValueError(f"Duplicate compression registration: {name}")
+
+ for ext in extensions:
+ if ext in fsspec.utils.compressions and not force:
+ raise ValueError(f"Duplicate compression file extension: {ext} ({name})")
+
+ compr[name] = callback
+ for ext in extensions:
+ fsspec.utils.compressions[ext] = name
+
+
+def unzip(infile, mode="rb", filename=None, **kwargs):
+ if "r" not in mode:
+ filename = filename or "file"
+ z = ZipFile(infile, mode="w", **kwargs)
+ fo = z.open(filename, mode="w")
+ fo.close = lambda closer=fo.close: closer() or z.close()
+ return fo
+ z = ZipFile(infile)
+ if filename is None:
+ filename = z.namelist()[0]
+ return z.open(filename, mode="r", **kwargs)
+
+
+register_compression("zip", unzip, "zip")
-register_compression('zip', unzip, 'zip')
try:
from bz2 import BZ2File
except ImportError:
pass
else:
- register_compression('bz2', BZ2File, 'bz2')
-try:
+ register_compression("bz2", BZ2File, "bz2")
+
+try: # pragma: no cover
from isal import igzip
- register_compression('gzip', isal, 'gz')
+
+ def isal(infile, mode="rb", **kwargs):
+ return igzip.IGzipFile(fileobj=infile, mode=mode, **kwargs)
+
+ register_compression("gzip", isal, "gz")
except ImportError:
from gzip import GzipFile
- register_compression('gzip', lambda f, **kwargs: GzipFile(fileobj=f, **
- kwargs), 'gz')
+
+ register_compression(
+ "gzip", lambda f, **kwargs: GzipFile(fileobj=f, **kwargs), "gz"
+ )
+
try:
from lzma import LZMAFile
- register_compression('lzma', LZMAFile, 'lzma')
- register_compression('xz', LZMAFile, 'xz')
+
+ register_compression("lzma", LZMAFile, "lzma")
+ register_compression("xz", LZMAFile, "xz")
except ImportError:
pass
+
try:
import lzmaffi
- register_compression('lzma', lzmaffi.LZMAFile, 'lzma', force=True)
- register_compression('xz', lzmaffi.LZMAFile, 'xz', force=True)
+
+ register_compression("lzma", lzmaffi.LZMAFile, "lzma", force=True)
+ register_compression("xz", lzmaffi.LZMAFile, "xz", force=True)
except ImportError:
pass
class SnappyFile(AbstractBufferedFile):
-
def __init__(self, infile, mode, **kwargs):
import snappy
- super().__init__(fs=None, path='snappy', mode=mode.strip('b') + 'b',
- size=999999999, **kwargs)
+
+ super().__init__(
+ fs=None, path="snappy", mode=mode.strip("b") + "b", size=999999999, **kwargs
+ )
self.infile = infile
- if 'r' in mode:
+ if "r" in mode:
self.codec = snappy.StreamDecompressor()
else:
self.codec = snappy.StreamCompressor()
+ def _upload_chunk(self, final=False):
+ self.buffer.seek(0)
+ out = self.codec.add_chunk(self.buffer.read())
+ self.infile.write(out)
+ return True
+
+ def seek(self, loc, whence=0):
+ raise NotImplementedError("SnappyFile is not seekable")
+
+ def seekable(self):
+ return False
+
def _fetch_range(self, start, end):
"""Get the specified set of bytes from remote"""
- pass
+ data = self.infile.read(end - start)
+ return self.codec.decompress(data)
try:
import snappy
- snappy.compress(b'')
- register_compression('snappy', SnappyFile, [])
+
+ snappy.compress(b"")
+ # Snappy may use the .sz file extension, but this is not part of the
+ # standard implementation.
+ register_compression("snappy", SnappyFile, [])
+
except (ImportError, NameError, AttributeError):
pass
+
try:
import lz4.frame
- register_compression('lz4', lz4.frame.open, 'lz4')
+
+ register_compression("lz4", lz4.frame.open, "lz4")
except ImportError:
pass
+
try:
import zstandard as zstd
- register_compression('zstd', zstandard_file, 'zst')
+
+ def zstandard_file(infile, mode="rb"):
+ if "r" in mode:
+ cctx = zstd.ZstdDecompressor()
+ return cctx.stream_reader(infile)
+ else:
+ cctx = zstd.ZstdCompressor(level=10)
+ return cctx.stream_writer(infile)
+
+ register_compression("zstd", zstandard_file, "zst")
except ImportError:
pass
def available_compressions():
"""Return a list of the implemented compressions."""
- pass
+ return list(compr)
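
Since `register_compression` maps both codec names and file extensions, a compression of `"infer"` can be resolved from the URL suffix when opening. A small round trip through the in-memory filesystem (the path is arbitrary):

```python
import fsspec

# round-trip text through gzip; "infer" picks the codec from the ".gz" suffix
with fsspec.open("memory://logs/app.log.gz", "wt", compression="infer") as f:
    f.write("hello compressed world\n")

with fsspec.open("memory://logs/app.log.gz", "rt", compression="infer") as f:
    print(f.read())
```
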
diff --git a/fsspec/config.py b/fsspec/config.py
index 00a7d90..76d9af1 100644
--- a/fsspec/config.py
+++ b/fsspec/config.py
@@ -1,12 +1,14 @@
from __future__ import annotations
+
import configparser
import json
import os
import warnings
from typing import Any
+
conf: dict[str, dict[str, Any]] = {}
-default_conf_dir = os.path.join(os.path.expanduser('~'), '.config/fsspec')
-conf_dir = os.environ.get('FSSPEC_CONFIG_DIR', default_conf_dir)
+default_conf_dir = os.path.join(os.path.expanduser("~"), ".config/fsspec")
+conf_dir = os.environ.get("FSSPEC_CONFIG_DIR", default_conf_dir)
def set_conf_env(conf_dict, envdict=os.environ):
@@ -28,7 +30,35 @@ def set_conf_env(conf_dict, envdict=os.environ):
envdict : dict-like(str, str)
Source for the values - usually the real environment
"""
- pass
+ kwarg_keys = []
+ for key in envdict:
+ if key.startswith("FSSPEC_") and len(key) > 7 and key[7] != "_":
+ if key.count("_") > 1:
+ kwarg_keys.append(key)
+ continue
+ try:
+ value = json.loads(envdict[key])
+ except json.decoder.JSONDecodeError as ex:
+ warnings.warn(
+ f"Ignoring environment variable {key} due to a parse failure: {ex}"
+ )
+ else:
+ if isinstance(value, dict):
+ _, proto = key.split("_", 1)
+ conf_dict.setdefault(proto.lower(), {}).update(value)
+ else:
+ warnings.warn(
+ f"Ignoring environment variable {key} due to not being a dict:"
+ f" {type(value)}"
+ )
+ elif key.startswith("FSSPEC"):
+ warnings.warn(
+ f"Ignoring environment variable {key} due to having an unexpected name"
+ )
+
+ for key in kwarg_keys:
+ _, proto, kwarg = key.split("_", 2)
+ conf_dict.setdefault(proto.lower(), {})[kwarg.lower()] = envdict[key]
def set_conf_files(cdir, conf_dict):
@@ -48,7 +78,22 @@ def set_conf_files(cdir, conf_dict):
conf_dict : dict(str, dict)
This dict will be mutated
"""
- pass
+ if not os.path.isdir(cdir):
+ return
+ allfiles = sorted(os.listdir(cdir))
+ for fn in allfiles:
+ if fn.endswith(".ini"):
+ ini = configparser.ConfigParser()
+ ini.read(os.path.join(cdir, fn))
+ for key in ini:
+ if key == "DEFAULT":
+ continue
+ conf_dict.setdefault(key, {}).update(dict(ini[key]))
+ if fn.endswith(".json"):
+ with open(os.path.join(cdir, fn)) as f:
+ js = json.load(f)
+ for key in js:
+ conf_dict.setdefault(key, {}).update(dict(js[key]))
def apply_config(cls, kwargs, conf_dict=None):
@@ -68,7 +113,18 @@ def apply_config(cls, kwargs, conf_dict=None):
-------
dict : the modified set of kwargs
"""
- pass
+ if conf_dict is None:
+ conf_dict = conf
+ protos = cls.protocol if isinstance(cls.protocol, (tuple, list)) else [cls.protocol]
+ kw = {}
+ for proto in protos:
+ # default kwargs from the current state of the config
+ if proto in conf_dict:
+ kw.update(conf_dict[proto])
+ # explicit kwargs always win
+ kw.update(**kwargs)
+ kwargs = kw
+ return kwargs
set_conf_files(conf_dir, conf)
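
To make the environment-variable handling in `set_conf_env` concrete, here is a sketch using a fake environment dict rather than `os.environ` (the `FSSPEC_MEMORY*` keys and values are invented for the example):

```python
import json
from fsspec.config import apply_config, set_conf_env
from fsspec.implementations.memory import MemoryFileSystem

conf = {}
env = {
    "FSSPEC_MEMORY": json.dumps({"skip_instance_cache": True}),  # JSON dict form
    "FSSPEC_MEMORY_BLOCKSIZE": "1048576",                        # single-kwarg form
}
set_conf_env(conf, envdict=env)
print(conf)  # {'memory': {'skip_instance_cache': True, 'blocksize': '1048576'}}

# config values become default kwargs for the protocol; explicit kwargs still win
print(apply_config(MemoryFileSystem, {"blocksize": 2048}, conf_dict=conf))
```
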
diff --git a/fsspec/core.py b/fsspec/core.py
index 0ba5d69..bd4f98d 100644
--- a/fsspec/core.py
+++ b/fsspec/core.py
@@ -1,16 +1,32 @@
from __future__ import annotations
+
import io
import logging
import os
import re
from glob import has_magic
from pathlib import Path
-from fsspec.caching import BaseCache, BlockCache, BytesCache, MMapCache, ReadAheadCache, caches
+
+# for backwards compat, we export cache things from here too
+from fsspec.caching import ( # noqa: F401
+ BaseCache,
+ BlockCache,
+ BytesCache,
+ MMapCache,
+ ReadAheadCache,
+ caches,
+)
from fsspec.compression import compr
from fsspec.config import conf
from fsspec.registry import filesystem, get_filesystem_class
-from fsspec.utils import _unstrip_protocol, build_name_function, infer_compression, stringify_path
-logger = logging.getLogger('fsspec')
+from fsspec.utils import (
+ _unstrip_protocol,
+ build_name_function,
+ infer_compression,
+ stringify_path,
+)
+
+logger = logging.getLogger("fsspec")
class OpenFile:
@@ -46,8 +62,16 @@ class OpenFile:
If given and autoopen is True, seek to this location immediately
"""
- def __init__(self, fs, path, mode='rb', compression=None, encoding=None,
- errors=None, newline=None):
+ def __init__(
+ self,
+ fs,
+ path,
+ mode="rb",
+ compression=None,
+ encoding=None,
+ errors=None,
+ newline=None,
+ ):
self.fs = fs
self.path = path
self.mode = mode
@@ -58,39 +82,61 @@ class OpenFile:
self.fobjects = []
def __reduce__(self):
- return OpenFile, (self.fs, self.path, self.mode, self.compression,
- self.encoding, self.errors, self.newline)
+ return (
+ OpenFile,
+ (
+ self.fs,
+ self.path,
+ self.mode,
+ self.compression,
+ self.encoding,
+ self.errors,
+ self.newline,
+ ),
+ )
def __repr__(self):
return f"<OpenFile '{self.path}'>"
def __enter__(self):
- mode = self.mode.replace('t', '').replace('b', '') + 'b'
+ mode = self.mode.replace("t", "").replace("b", "") + "b"
+
try:
f = self.fs.open(self.path, mode=mode)
except FileNotFoundError as e:
if has_magic(self.path):
raise FileNotFoundError(
- """%s not found. The URL contains glob characters: you maybe needed
-to pass expand=True in fsspec.open() or the storage_options of
-your library. You can also set the config value 'open_expand'
-before import, or fsspec.core.DEFAULT_EXPAND at runtime, to True."""
- , self.path) from e
+ "%s not found. The URL contains glob characters: you maybe needed\n"
+ "to pass expand=True in fsspec.open() or the storage_options of \n"
+ "your library. You can also set the config value 'open_expand'\n"
+ "before import, or fsspec.core.DEFAULT_EXPAND at runtime, to True.",
+ self.path,
+ ) from e
raise
+
self.fobjects = [f]
+
if self.compression is not None:
compress = compr[self.compression]
f = compress(f, mode=mode[0])
self.fobjects.append(f)
- if 'b' not in self.mode:
- f = PickleableTextIOWrapper(f, encoding=self.encoding, errors=
- self.errors, newline=self.newline)
+
+ if "b" not in self.mode:
+ # assume, for example, that 'r' is equivalent to 'rt' as in builtin
+ f = PickleableTextIOWrapper(
+ f, encoding=self.encoding, errors=self.errors, newline=self.newline
+ )
self.fobjects.append(f)
+
return self.fobjects[-1]
def __exit__(self, *args):
self.close()
+ @property
+ def full_name(self):
+ return _unstrip_protocol(self.path, self.fs)
+
def open(self):
"""Materialise this as a real open file without context
@@ -98,11 +144,15 @@ before import, or fsspec.core.DEFAULT_EXPAND at runtime, to True."""
instances persisting. You must, therefore, keep a reference to the OpenFile
during the life of the file-like it generates.
"""
- pass
+ return self.__enter__()
def close(self):
"""Close all encapsulated file objects"""
- pass
+ for f in reversed(self.fobjects):
+ if "r" not in self.mode and not f.closed:
+ f.flush()
+ f.close()
+ self.fobjects.clear()
class OpenFiles(list):
@@ -117,7 +167,7 @@ class OpenFiles(list):
this may happen concurrently, if the target filesystem supports it.
"""
- def __init__(self, *args, mode='rb', fs=None):
+ def __init__(self, *args, mode="rb", fs=None):
self.mode = mode
self.fs = fs
self.files = []
@@ -125,13 +175,15 @@ class OpenFiles(list):
def __enter__(self):
if self.fs is None:
- raise ValueError('Context has already been used')
+ raise ValueError("Context has already been used")
+
fs = self.fs
while True:
- if hasattr(fs, 'open_many'):
+ if hasattr(fs, "open_many"):
+ # check for concurrent cache download; or set up for upload
self.files = fs.open_many(self)
return self.files
- if hasattr(fs, 'fs') and fs.fs is not None:
+ if hasattr(fs, "fs") and fs.fs is not None:
fs = fs.fs
else:
break
@@ -140,12 +192,13 @@ class OpenFiles(list):
def __exit__(self, *args):
fs = self.fs
[s.__exit__(*args) for s in self]
- if 'r' not in self.mode:
+ if "r" not in self.mode:
while True:
- if hasattr(fs, 'open_many'):
+ if hasattr(fs, "open_many"):
+ # check for concurrent cache upload
fs.commit_many(self.files)
return
- if hasattr(fs, 'fs') and fs.fs is not None:
+ if hasattr(fs, "fs") and fs.fs is not None:
fs = fs.fs
else:
break
@@ -157,12 +210,23 @@ class OpenFiles(list):
return out
def __repr__(self):
- return f'<List of {len(self)} OpenFile instances>'
-
-
-def open_files(urlpath, mode='rb', compression=None, encoding='utf8',
- errors=None, name_function=None, num=1, protocol=None, newline=None,
- auto_mkdir=True, expand=True, **kwargs):
+ return f"<List of {len(self)} OpenFile instances>"
+
+
+def open_files(
+ urlpath,
+ mode="rb",
+ compression=None,
+ encoding="utf8",
+ errors=None,
+ name_function=None,
+ num=1,
+ protocol=None,
+ newline=None,
+ auto_mkdir=True,
+ expand=True,
+ **kwargs,
+):
"""Given a path or paths, return a list of ``OpenFile`` objects.
For writing, a str path must contain the "*" character, which will be filled
@@ -228,7 +292,71 @@ def open_files(urlpath, mode='rb', compression=None, encoding='utf8',
- For implementations in separate packages see
https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
"""
- pass
+ fs, fs_token, paths = get_fs_token_paths(
+ urlpath,
+ mode,
+ num=num,
+ name_function=name_function,
+ storage_options=kwargs,
+ protocol=protocol,
+ expand=expand,
+ )
+ if fs.protocol == "file":
+ fs.auto_mkdir = auto_mkdir
+ elif "r" not in mode and auto_mkdir:
+ parents = {fs._parent(path) for path in paths}
+ for parent in parents:
+ try:
+ fs.makedirs(parent, exist_ok=True)
+ except PermissionError:
+ pass
+ return OpenFiles(
+ [
+ OpenFile(
+ fs,
+ path,
+ mode=mode,
+ compression=compression,
+ encoding=encoding,
+ errors=errors,
+ newline=newline,
+ )
+ for path in paths
+ ],
+ mode=mode,
+ fs=fs,
+ )
+
+
+def _un_chain(path, kwargs):
+ x = re.compile(".*[^a-z]+.*") # test for non protocol-like single word
+ bits = (
+ [p if "://" in p or x.match(p) else p + "://" for p in path.split("::")]
+ if "::" in path
+ else [path]
+ )
+ # [[url, protocol, kwargs], ...]
+ out = []
+ previous_bit = None
+ kwargs = kwargs.copy()
+ for bit in reversed(bits):
+ protocol = kwargs.pop("protocol", None) or split_protocol(bit)[0] or "file"
+ cls = get_filesystem_class(protocol)
+ extra_kwargs = cls._get_kwargs_from_urls(bit)
+ kws = kwargs.pop(protocol, {})
+ if bit is bits[0]:
+ kws.update(kwargs)
+ kw = dict(**extra_kwargs, **kws)
+ bit = cls._strip_protocol(bit)
+ if (
+ protocol in {"blockcache", "filecache", "simplecache"}
+ and "target_protocol" not in kw
+ ):
+ bit = previous_bit
+ out.append((bit, protocol, kw))
+ previous_bit = bit
+ out.reverse()
+ return out
def url_to_fs(url, **kwargs):
@@ -251,14 +379,50 @@ def url_to_fs(url, **kwargs):
urlpath : str
The file-systems-specific URL for ``url``.
"""
- pass
-
-
-DEFAULT_EXPAND = conf.get('open_expand', False)
-
-
-def open(urlpath, mode='rb', compression=None, encoding='utf8', errors=None,
- protocol=None, newline=None, expand=None, **kwargs):
+ url = stringify_path(url)
+ # non-FS arguments that appear in fsspec.open()
+ # inspect could keep this in sync with open()'s signature
+ known_kwargs = {
+ "compression",
+ "encoding",
+ "errors",
+ "expand",
+ "mode",
+ "name_function",
+ "newline",
+ "num",
+ }
+ kwargs = {k: v for k, v in kwargs.items() if k not in known_kwargs}
+ chain = _un_chain(url, kwargs)
+ inkwargs = {}
+ # Reverse iterate the chain, creating a nested target_* structure
+ for i, ch in enumerate(reversed(chain)):
+ urls, protocol, kw = ch
+ if i == len(chain) - 1:
+ inkwargs = dict(**kw, **inkwargs)
+ continue
+ inkwargs["target_options"] = dict(**kw, **inkwargs)
+ inkwargs["target_protocol"] = protocol
+ inkwargs["fo"] = urls
+ urlpath, protocol, _ = chain[0]
+ fs = filesystem(protocol, **inkwargs)
+ return fs, urlpath
+
+
+DEFAULT_EXPAND = conf.get("open_expand", False)
+
+
+def open(
+ urlpath,
+ mode="rb",
+ compression=None,
+ encoding="utf8",
+ errors=None,
+ protocol=None,
+ newline=None,
+ expand=None,
+ **kwargs,
+):
"""Given a path or paths, return one ``OpenFile`` object.
Parameters
@@ -316,11 +480,28 @@ def open(urlpath, mode='rb', compression=None, encoding='utf8', errors=None,
- For implementations in separate packages see
https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
"""
- pass
-
-
-def open_local(url: (str | list[str] | Path | list[Path]), mode: str='rb',
- **storage_options: dict) ->(str | list[str]):
+ expand = DEFAULT_EXPAND if expand is None else expand
+ out = open_files(
+ urlpath=[urlpath],
+ mode=mode,
+ compression=compression,
+ encoding=encoding,
+ errors=errors,
+ protocol=protocol,
+ newline=newline,
+ expand=expand,
+ **kwargs,
+ )
+ if not out:
+ raise FileNotFoundError(urlpath)
+ return out[0]
+
+
+def open_local(
+ url: str | list[str] | Path | list[Path],
+ mode: str = "rb",
+ **storage_options: dict,
+) -> str | list[str]:
"""Open file(s) which can be resolved to local
For files which either are local, or get downloaded upon open
@@ -334,17 +515,47 @@ def open_local(url: (str | list[str] | Path | list[Path]), mode: str='rb',
storage_options:
passed on to FS for or used by open_files (e.g., compression)
"""
- pass
+ if "r" not in mode:
+ raise ValueError("Can only ensure local files when reading")
+ of = open_files(url, mode=mode, **storage_options)
+ if not getattr(of[0].fs, "local_file", False):
+ raise ValueError(
+ "open_local can only be used on a filesystem which"
+ " has attribute local_file=True"
+ )
+ with of as files:
+ paths = [f.name for f in files]
+ if (isinstance(url, str) and not has_magic(url)) or isinstance(url, Path):
+ return paths[0]
+ return paths
+
+
+def get_compression(urlpath, compression):
+ if compression == "infer":
+ compression = infer_compression(urlpath)
+ if compression is not None and compression not in compr:
+ raise ValueError(f"Compression type {compression} not supported")
+ return compression
def split_protocol(urlpath):
"""Return protocol, path pair"""
- pass
+ urlpath = stringify_path(urlpath)
+ if "://" in urlpath:
+ protocol, path = urlpath.split("://", 1)
+ if len(protocol) > 1:
+ # excludes Windows paths
+ return protocol, path
+ if urlpath.startswith("data:"):
+ return urlpath.split(":", 1)
+ return None, urlpath
def strip_protocol(urlpath):
"""Return only path part of full URL, according to appropriate backend"""
- pass
+ protocol, _ = split_protocol(urlpath)
+ cls = get_filesystem_class(protocol)
+ return cls._strip_protocol(urlpath)
def expand_paths_if_needed(paths, mode, num, fs, name_function):
@@ -363,11 +574,46 @@ def expand_paths_if_needed(paths, mode, num, fs, name_function):
``urlpath.replace('*', name_function(partition_index))``.
:return: list of paths
"""
- pass
+ expanded_paths = []
+ paths = list(paths)
+
+ if "w" in mode: # read mode
+ if sum([1 for p in paths if "*" in p]) > 1:
+ raise ValueError(
+ "When writing data, only one filename mask can be specified."
+ )
+ num = max(num, len(paths))
+
+ for curr_path in paths:
+ if "*" in curr_path:
+ # expand using name_function
+ expanded_paths.extend(_expand_paths(curr_path, name_function, num))
+ else:
+ expanded_paths.append(curr_path)
+            # if we generated more paths than asked for, trim the list
+ if len(expanded_paths) > num:
+ expanded_paths = expanded_paths[:num]
+
+ else: # read mode
+ for curr_path in paths:
+ if has_magic(curr_path):
+ # expand using glob
+ expanded_paths.extend(fs.glob(curr_path))
+ else:
+ expanded_paths.append(curr_path)
+
+ return expanded_paths
-def get_fs_token_paths(urlpath, mode='rb', num=1, name_function=None,
- storage_options=None, protocol=None, expand=True):
+def get_fs_token_paths(
+ urlpath,
+ mode="rb",
+ num=1,
+ name_function=None,
+ storage_options=None,
+ protocol=None,
+ expand=True,
+):
"""Filesystem, deterministic token, and paths from a urlpath and options.
Parameters
@@ -390,7 +636,83 @@ def get_fs_token_paths(urlpath, mode='rb', num=1, name_function=None,
expand: bool
Expand string paths for writing, assuming the path is a directory
"""
- pass
+ if isinstance(urlpath, (list, tuple, set)):
+ if not urlpath:
+ raise ValueError("empty urlpath sequence")
+ urlpath0 = stringify_path(list(urlpath)[0])
+ else:
+ urlpath0 = stringify_path(urlpath)
+ storage_options = storage_options or {}
+ if protocol:
+ storage_options["protocol"] = protocol
+ chain = _un_chain(urlpath0, storage_options or {})
+ inkwargs = {}
+ # Reverse iterate the chain, creating a nested target_* structure
+ for i, ch in enumerate(reversed(chain)):
+ urls, nested_protocol, kw = ch
+ if i == len(chain) - 1:
+ inkwargs = dict(**kw, **inkwargs)
+ continue
+ inkwargs["target_options"] = dict(**kw, **inkwargs)
+ inkwargs["target_protocol"] = nested_protocol
+ inkwargs["fo"] = urls
+ paths, protocol, _ = chain[0]
+ fs = filesystem(protocol, **inkwargs)
+ if isinstance(urlpath, (list, tuple, set)):
+ pchains = [
+ _un_chain(stringify_path(u), storage_options or {})[0] for u in urlpath
+ ]
+ if len({pc[1] for pc in pchains}) > 1:
+ raise ValueError("Protocol mismatch getting fs from %s", urlpath)
+ paths = [pc[0] for pc in pchains]
+ else:
+ paths = fs._strip_protocol(paths)
+ if isinstance(paths, (list, tuple, set)):
+ if expand:
+ paths = expand_paths_if_needed(paths, mode, num, fs, name_function)
+ elif not isinstance(paths, list):
+ paths = list(paths)
+ else:
+ if "w" in mode and expand:
+ paths = _expand_paths(paths, name_function, num)
+ elif "x" in mode and expand:
+ paths = _expand_paths(paths, name_function, num)
+ elif "*" in paths:
+ paths = [f for f in sorted(fs.glob(paths)) if not fs.isdir(f)]
+ else:
+ paths = [paths]
+
+ return fs, fs._fs_token, paths
+
+
+def _expand_paths(path, name_function, num):
+ if isinstance(path, str):
+ if path.count("*") > 1:
+ raise ValueError("Output path spec must contain exactly one '*'.")
+ elif "*" not in path:
+ path = os.path.join(path, "*.part")
+
+ if name_function is None:
+ name_function = build_name_function(num - 1)
+
+ paths = [path.replace("*", name_function(i)) for i in range(num)]
+ if paths != sorted(paths):
+ logger.warning(
+ "In order to preserve order between partitions"
+ " paths created with ``name_function`` should "
+ "sort to partition order"
+ )
+ elif isinstance(path, (tuple, list)):
+ assert len(path) == num
+ paths = list(path)
+ else:
+ raise ValueError(
+ "Path should be either\n"
+ "1. A list of paths: ['foo.json', 'bar.json', ...]\n"
+ "2. A directory: 'foo/\n"
+ "3. A path with a '*' in it: 'foo.*.json'"
+ )
+ return paths
class PickleableTextIOWrapper(io.TextIOWrapper):
@@ -400,10 +722,16 @@ class PickleableTextIOWrapper(io.TextIOWrapper):
AbstractBufferedFile are.
"""
- def __init__(self, buffer, encoding=None, errors=None, newline=None,
- line_buffering=False, write_through=False):
- self.args = (buffer, encoding, errors, newline, line_buffering,
- write_through)
+ def __init__(
+ self,
+ buffer,
+ encoding=None,
+ errors=None,
+ newline=None,
+ line_buffering=False,
+ write_through=False,
+ ):
+ self.args = buffer, encoding, errors, newline, line_buffering, write_through
super().__init__(*self.args)
def __reduce__(self):
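
The `core.py` changes above cover glob expansion in `open_files` and URL chaining via `_un_chain`/`url_to_fs`. A brief usage sketch against the in-memory filesystem (paths and the `simplecache::` chain are just for illustration):

```python
import fsspec
from fsspec.core import url_to_fs

fs = fsspec.filesystem("memory")
fs.pipe({"/data/part-0.csv": b"a,b\n1,2\n", "/data/part-1.csv": b"a,b\n3,4\n"})

# glob expansion: one OpenFile per matching path
with fsspec.open_files("memory://data/*.csv", "rt") as files:
    for f in files:
        print(f.readline().strip())

# chained URL: resolved to the outermost filesystem plus nested target_* options
cached_fs, path = url_to_fs("simplecache::memory://data/part-0.csv")
print(type(cached_fs).__name__, path)
```
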
diff --git a/fsspec/dircache.py b/fsspec/dircache.py
index b6c92be..eca1956 100644
--- a/fsspec/dircache.py
+++ b/fsspec/dircache.py
@@ -24,8 +24,13 @@ class DirCache(MutableMapping):
caching off
"""
- def __init__(self, use_listings_cache=True, listings_expiry_time=None,
- max_paths=None, **kwargs):
+ def __init__(
+ self,
+ use_listings_cache=True,
+ listings_expiry_time=None,
+ max_paths=None,
+ **kwargs,
+ ):
"""
Parameters
@@ -43,20 +48,21 @@ class DirCache(MutableMapping):
self._cache = {}
self._times = {}
if max_paths:
- self._q = lru_cache(max_paths + 1)(lambda key: self._cache.pop(
- key, None))
+ self._q = lru_cache(max_paths + 1)(lambda key: self._cache.pop(key, None))
self.use_listings_cache = use_listings_cache
self.listings_expiry_time = listings_expiry_time
self.max_paths = max_paths
def __getitem__(self, item):
if self.listings_expiry_time is not None:
- if self._times.get(item, 0) - time.time(
- ) < -self.listings_expiry_time:
+ if self._times.get(item, 0) - time.time() < -self.listings_expiry_time:
del self._cache[item]
if self.max_paths:
self._q(item)
- return self._cache[item]
+ return self._cache[item] # maybe raises KeyError
+
+ def clear(self):
+ self._cache.clear()
def __len__(self):
return len(self._cache)
@@ -82,8 +88,11 @@ class DirCache(MutableMapping):
def __iter__(self):
entries = list(self._cache)
+
return (k for k in entries if k in self)
def __reduce__(self):
- return DirCache, (self.use_listings_cache, self.
- listings_expiry_time, self.max_paths)
+ return (
+ DirCache,
+ (self.use_listings_cache, self.listings_expiry_time, self.max_paths),
+ )
diff --git a/fsspec/exceptions.py b/fsspec/exceptions.py
index 0593f0e..ae89054 100644
--- a/fsspec/exceptions.py
+++ b/fsspec/exceptions.py
@@ -1,6 +1,7 @@
"""
fsspec user-defined exception classes
"""
+
import asyncio
diff --git a/fsspec/fuse.py b/fsspec/fuse.py
index de1075f..6ca8c97 100644
--- a/fsspec/fuse.py
+++ b/fsspec/fuse.py
@@ -5,25 +5,149 @@ import stat
import threading
import time
from errno import EIO, ENOENT
+
from fuse import FUSE, FuseOSError, LoggingMixIn, Operations
+
from fsspec import __version__
from fsspec.core import url_to_fs
-logger = logging.getLogger('fsspec.fuse')
+logger = logging.getLogger("fsspec.fuse")
-class FUSEr(Operations):
+class FUSEr(Operations):
def __init__(self, fs, path, ready_file=False):
self.fs = fs
self.cache = {}
- self.root = path.rstrip('/') + '/'
+ self.root = path.rstrip("/") + "/"
self.counter = 0
- logger.info('Starting FUSE at %s', path)
+ logger.info("Starting FUSE at %s", path)
self._ready_file = ready_file
+ def getattr(self, path, fh=None):
+ logger.debug("getattr %s", path)
+ if self._ready_file and path in ["/.fuse_ready", ".fuse_ready"]:
+ return {"type": "file", "st_size": 5}
+
+ path = "".join([self.root, path.lstrip("/")]).rstrip("/")
+ try:
+ info = self.fs.info(path)
+ except FileNotFoundError:
+ raise FuseOSError(ENOENT)
+
+ data = {"st_uid": info.get("uid", 1000), "st_gid": info.get("gid", 1000)}
+ perm = info.get("mode", 0o777)
+
+ if info["type"] != "file":
+ data["st_mode"] = stat.S_IFDIR | perm
+ data["st_size"] = 0
+ data["st_blksize"] = 0
+ else:
+ data["st_mode"] = stat.S_IFREG | perm
+ data["st_size"] = info["size"]
+ data["st_blksize"] = 5 * 2**20
+ data["st_nlink"] = 1
+ data["st_atime"] = info["atime"] if "atime" in info else time.time()
+ data["st_ctime"] = info["ctime"] if "ctime" in info else time.time()
+ data["st_mtime"] = info["mtime"] if "mtime" in info else time.time()
+ return data
+
+ def readdir(self, path, fh):
+ logger.debug("readdir %s", path)
+ path = "".join([self.root, path.lstrip("/")])
+ files = self.fs.ls(path, False)
+ files = [os.path.basename(f.rstrip("/")) for f in files]
+ return [".", ".."] + files
+
+ def mkdir(self, path, mode):
+ path = "".join([self.root, path.lstrip("/")])
+ self.fs.mkdir(path)
+ return 0
+
+ def rmdir(self, path):
+ path = "".join([self.root, path.lstrip("/")])
+ self.fs.rmdir(path)
+ return 0
+
+ def read(self, path, size, offset, fh):
+ logger.debug("read %s", (path, size, offset))
+ if self._ready_file and path in ["/.fuse_ready", ".fuse_ready"]:
+ # status indicator
+ return b"ready"
+
+ f = self.cache[fh]
+ f.seek(offset)
+ out = f.read(size)
+ return out
+
+ def write(self, path, data, offset, fh):
+ logger.debug("write %s", (path, offset))
+ f = self.cache[fh]
+ f.seek(offset)
+ f.write(data)
+ return len(data)
+
+ def create(self, path, flags, fi=None):
+ logger.debug("create %s", (path, flags))
+ fn = "".join([self.root, path.lstrip("/")])
+ self.fs.touch(fn) # OS will want to get attributes immediately
+ f = self.fs.open(fn, "wb")
+ self.cache[self.counter] = f
+ self.counter += 1
+ return self.counter - 1
+
+ def open(self, path, flags):
+ logger.debug("open %s", (path, flags))
+ fn = "".join([self.root, path.lstrip("/")])
+ if flags % 2 == 0:
+ # read
+ mode = "rb"
+ else:
+ # write/create
+ mode = "wb"
+ self.cache[self.counter] = self.fs.open(fn, mode)
+ self.counter += 1
+ return self.counter - 1
-def run(fs, path, mount_point, foreground=True, threads=False, ready_file=
- False, ops_class=FUSEr):
+ def truncate(self, path, length, fh=None):
+ fn = "".join([self.root, path.lstrip("/")])
+ if length != 0:
+ raise NotImplementedError
+ # maybe should be no-op since open with write sets size to zero anyway
+ self.fs.touch(fn)
+
+ def unlink(self, path):
+ fn = "".join([self.root, path.lstrip("/")])
+ try:
+ self.fs.rm(fn, False)
+ except (OSError, FileNotFoundError):
+ raise FuseOSError(EIO)
+
+ def release(self, path, fh):
+ try:
+ if fh in self.cache:
+ f = self.cache[fh]
+ f.close()
+ self.cache.pop(fh)
+ except Exception as e:
+ print(e)
+ return 0
+
+ def chmod(self, path, mode):
+ if hasattr(self.fs, "chmod"):
+ path = "".join([self.root, path.lstrip("/")])
+ return self.fs.chmod(path, mode)
+ raise NotImplementedError
+
+
+def run(
+ fs,
+ path,
+ mount_point,
+ foreground=True,
+ threads=False,
+ ready_file=False,
+ ops_class=FUSEr,
+):
"""Mount stuff in a local directory
This uses fusepy to make it appear as if a given path on an fsspec
@@ -59,7 +183,22 @@ def run(fs, path, mount_point, foreground=True, threads=False, ready_file=
to file.
"""
- pass
+ func = lambda: FUSE(
+ ops_class(fs, path, ready_file=ready_file),
+ mount_point,
+ nothreads=not threads,
+ foreground=foreground,
+ )
+ if not foreground:
+ th = threading.Thread(target=func)
+ th.daemon = True
+ th.start()
+ return th
+ else: # pragma: no cover
+ try:
+ func()
+ except KeyboardInterrupt:
+ pass
def main(args):
@@ -89,9 +228,97 @@ def main(args):
-o 'ftp-username=anonymous' \\
-o 'ftp-password=xieyanbo'
"""
- pass
+ class RawDescriptionArgumentParser(argparse.ArgumentParser):
+ def format_help(self):
+ usage = super().format_help()
+ parts = usage.split("\n\n")
+ parts[1] = self.description.rstrip()
+ return "\n\n".join(parts)
+
+ parser = RawDescriptionArgumentParser(prog="fsspec.fuse", description=main.__doc__)
+ parser.add_argument("--version", action="version", version=__version__)
+ parser.add_argument("url", type=str, help="fs url")
+ parser.add_argument("source_path", type=str, help="source directory in fs")
+ parser.add_argument("mount_point", type=str, help="local directory")
+ parser.add_argument(
+ "-o",
+ "--option",
+ action="append",
+ help="Any options of protocol included in the chained URL",
+ )
+ parser.add_argument(
+ "-l", "--log-file", type=str, help="Logging FUSE debug info (Default: '')"
+ )
+ parser.add_argument(
+ "-f",
+ "--foreground",
+ action="store_false",
+ help="Running in foreground or not (Default: False)",
+ )
+ parser.add_argument(
+ "-t",
+ "--threads",
+ action="store_false",
+ help="Running with threads support (Default: False)",
+ )
+ parser.add_argument(
+ "-r",
+ "--ready-file",
+ action="store_false",
+ help="The `.fuse_ready` file will exist after FUSE is ready. "
+ "(Debugging purpose, Default: False)",
+ )
+ args = parser.parse_args(args)
+
+ kwargs = {}
+ for item in args.option or []:
+ key, sep, value = item.partition("=")
+ if not sep:
+ parser.error(message=f"Wrong option: {item!r}")
+ val = value.lower()
+ if val.endswith("[int]"):
+ value = int(value[: -len("[int]")])
+ elif val.endswith("[bool]"):
+ value = val[: -len("[bool]")] in ["1", "yes", "true"]
-if __name__ == '__main__':
+ if "-" in key:
+ fs_name, setting_name = key.split("-", 1)
+ if fs_name in kwargs:
+ kwargs[fs_name][setting_name] = value
+ else:
+ kwargs[fs_name] = {setting_name: value}
+ else:
+ kwargs[key] = value
+
+ if args.log_file:
+ logging.basicConfig(
+ level=logging.DEBUG,
+ filename=args.log_file,
+ format="%(asctime)s %(message)s",
+ )
+
+ class LoggingFUSEr(FUSEr, LoggingMixIn):
+ pass
+
+ fuser = LoggingFUSEr
+ else:
+ fuser = FUSEr
+
+ fs, url_path = url_to_fs(args.url, **kwargs)
+ logger.debug("Mounting %s to %s", url_path, str(args.mount_point))
+ run(
+ fs,
+ args.source_path,
+ args.mount_point,
+ foreground=args.foreground,
+ threads=args.threads,
+ ready_file=args.ready_file,
+ ops_class=fuser,
+ )
+
+
+if __name__ == "__main__":
import sys
+
main(sys.argv[1:])
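
For orientation, `run()` above can also be driven from Python rather than the CLI. This is a sketch only: it assumes the optional `fusepy` dependency, a FUSE-capable OS, and an existing mount-point directory (the paths are placeholders):

```python
import fsspec
from fsspec.fuse import run

fs = fsspec.filesystem("memory")
fs.pipe_file("/export/hello.txt", b"hi from fsspec\n")

# serve the in-memory "/export" tree at a local mount point in a daemon thread;
# "/tmp/fsspec-mount" must already exist
thread = run(fs, "/export", "/tmp/fsspec-mount", foreground=False)
```
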
diff --git a/fsspec/generic.py b/fsspec/generic.py
index 48ba37c..9bad0f0 100644
--- a/fsspec/generic.py
+++ b/fsspec/generic.py
@@ -1,26 +1,56 @@
from __future__ import annotations
+
import inspect
import logging
import os
import shutil
import uuid
from typing import Optional
+
from .asyn import AsyncFileSystem, _run_coros_in_chunks, sync_wrapper
from .callbacks import DEFAULT_CALLBACK
from .core import filesystem, get_filesystem_class, split_protocol, url_to_fs
+
_generic_fs = {}
-logger = logging.getLogger('fsspec.generic')
-default_method = 'default'
+logger = logging.getLogger("fsspec.generic")
+
+
+def set_generic_fs(protocol, **storage_options):
+ _generic_fs[protocol] = filesystem(protocol, **storage_options)
+
+
+default_method = "default"
def _resolve_fs(url, method=None, protocol=None, storage_options=None):
"""Pick instance of backend FS"""
- pass
+ method = method or default_method
+ protocol = protocol or split_protocol(url)[0]
+ storage_options = storage_options or {}
+ if method == "default":
+ return filesystem(protocol)
+ if method == "generic":
+ return _generic_fs[protocol]
+ if method == "current":
+ cls = get_filesystem_class(protocol)
+ return cls.current()
+ if method == "options":
+ fs, _ = url_to_fs(url, **storage_options.get(protocol, {}))
+ return fs
+ raise ValueError(f"Unknown FS resolution method: {method}")
-def rsync(source, destination, delete_missing=False, source_field='size',
- dest_field='size', update_cond='different', inst_kwargs=None, fs=None,
- **kwargs):
+def rsync(
+ source,
+ destination,
+ delete_missing=False,
+ source_field="size",
+ dest_field="size",
+ update_cond="different",
+ inst_kwargs=None,
+ fs=None,
+ **kwargs,
+):
"""Sync files between two directory trees
(experimental)
@@ -62,7 +92,56 @@ def rsync(source, destination, delete_missing=False, source_field='size',
-------
dict of the copy operations that were performed, {source: destination}
"""
- pass
+ fs = fs or GenericFileSystem(**(inst_kwargs or {}))
+ source = fs._strip_protocol(source)
+ destination = fs._strip_protocol(destination)
+ allfiles = fs.find(source, withdirs=True, detail=True)
+ if not fs.isdir(source):
+ raise ValueError("Can only rsync on a directory")
+ otherfiles = fs.find(destination, withdirs=True, detail=True)
+ dirs = [
+ a
+ for a, v in allfiles.items()
+ if v["type"] == "directory" and a.replace(source, destination) not in otherfiles
+ ]
+ logger.debug(f"{len(dirs)} directories to create")
+ if dirs:
+ fs.make_many_dirs(
+ [dirn.replace(source, destination) for dirn in dirs], exist_ok=True
+ )
+ allfiles = {a: v for a, v in allfiles.items() if v["type"] == "file"}
+ logger.debug(f"{len(allfiles)} files to consider for copy")
+ to_delete = [
+ o
+ for o, v in otherfiles.items()
+ if o.replace(destination, source) not in allfiles and v["type"] == "file"
+ ]
+ for k, v in allfiles.copy().items():
+ otherfile = k.replace(source, destination)
+ if otherfile in otherfiles:
+ if update_cond == "always":
+ allfiles[k] = otherfile
+ elif update_cond == "different":
+ inf1 = source_field(v) if callable(source_field) else v[source_field]
+ v2 = otherfiles[otherfile]
+ inf2 = dest_field(v2) if callable(dest_field) else v2[dest_field]
+ if inf1 != inf2:
+ # details mismatch, make copy
+ allfiles[k] = otherfile
+ else:
+ # details match, don't copy
+ allfiles.pop(k)
+ else:
+ # file not in target yet
+ allfiles[k] = otherfile
+ logger.debug(f"{len(allfiles)} files to copy")
+ if allfiles:
+ source_files, target_files = zip(*allfiles.items())
+ fs.cp(source_files, target_files, **kwargs)
+ logger.debug(f"{len(to_delete)} files to delete")
+ if delete_missing and to_delete:
+ fs.rm(to_delete)
+ return allfiles
class GenericFileSystem(AsyncFileSystem):
@@ -77,9 +156,10 @@ class GenericFileSystem(AsyncFileSystem):
Note: instances of this FS are always async, even if you never use it with any async
backend.
"""
- protocol = 'generic'
- def __init__(self, default_method='default', **kwargs):
+ protocol = "generic" # there is no real reason to ever use a protocol with this FS
+
+ def __init__(self, default_method="default", **kwargs):
"""
Parameters
@@ -96,10 +176,236 @@ class GenericFileSystem(AsyncFileSystem):
self.method = default_method
super().__init__(**kwargs)
+ def _parent(self, path):
+ fs = _resolve_fs(path, self.method)
+ return fs.unstrip_protocol(fs._parent(path))
+
+ def _strip_protocol(self, path):
+ # normalization only
+ fs = _resolve_fs(path, self.method)
+ return fs.unstrip_protocol(fs._strip_protocol(path))
+
+ async def _find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
+ fs = _resolve_fs(path, self.method)
+ if fs.async_impl:
+ out = await fs._find(
+ path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
+ )
+ else:
+ out = fs.find(
+ path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
+ )
+ result = {}
+ for k, v in out.items():
+ v = v.copy() # don't corrupt target FS dircache
+ name = fs.unstrip_protocol(k)
+ v["name"] = name
+ result[name] = v
+ if detail:
+ return result
+ return list(result)
+
+ async def _info(self, url, **kwargs):
+ fs = _resolve_fs(url, self.method)
+ if fs.async_impl:
+ out = await fs._info(url, **kwargs)
+ else:
+ out = fs.info(url, **kwargs)
+ out = out.copy() # don't edit originals
+ out["name"] = fs.unstrip_protocol(out["name"])
+ return out
+
+ async def _ls(
+ self,
+ url,
+ detail=True,
+ **kwargs,
+ ):
+ fs = _resolve_fs(url, self.method)
+ if fs.async_impl:
+ out = await fs._ls(url, detail=True, **kwargs)
+ else:
+ out = fs.ls(url, detail=True, **kwargs)
+ out = [o.copy() for o in out] # don't edit originals
+ for o in out:
+ o["name"] = fs.unstrip_protocol(o["name"])
+ if detail:
+ return out
+ else:
+ return [o["name"] for o in out]
+
+ async def _cat_file(
+ self,
+ url,
+ **kwargs,
+ ):
+ fs = _resolve_fs(url, self.method)
+ if fs.async_impl:
+ return await fs._cat_file(url, **kwargs)
+ else:
+ return fs.cat_file(url, **kwargs)
+
+ async def _pipe_file(
+ self,
+ path,
+ value,
+ **kwargs,
+ ):
+ fs = _resolve_fs(path, self.method)
+ if fs.async_impl:
+ return await fs._pipe_file(path, value, **kwargs)
+ else:
+ return fs.pipe_file(path, value, **kwargs)
+
+ async def _rm(self, url, **kwargs):
+ urls = url
+ if isinstance(urls, str):
+ urls = [urls]
+ fs = _resolve_fs(urls[0], self.method)
+ if fs.async_impl:
+ await fs._rm(urls, **kwargs)
+ else:
+ fs.rm(url, **kwargs)
+
+ async def _makedirs(self, path, exist_ok=False):
+ logger.debug("Make dir %s", path)
+ fs = _resolve_fs(path, self.method)
+ if fs.async_impl:
+ await fs._makedirs(path, exist_ok=exist_ok)
+ else:
+ fs.makedirs(path, exist_ok=exist_ok)
+
def rsync(self, source, destination, **kwargs):
"""Sync files between two directory trees
See `func:rsync` for more details.
"""
- pass
+ rsync(source, destination, fs=self, **kwargs)
+
+ async def _cp_file(
+ self,
+ url,
+ url2,
+ blocksize=2**20,
+ callback=DEFAULT_CALLBACK,
+ **kwargs,
+ ):
+ fs = _resolve_fs(url, self.method)
+ fs2 = _resolve_fs(url2, self.method)
+ if fs is fs2:
+ # pure remote
+ if fs.async_impl:
+ return await fs._cp_file(url, url2, **kwargs)
+ else:
+ return fs.cp_file(url, url2, **kwargs)
+ kw = {"blocksize": 0, "cache_type": "none"}
+ try:
+ f1 = (
+ await fs.open_async(url, "rb")
+ if hasattr(fs, "open_async")
+ else fs.open(url, "rb", **kw)
+ )
+ callback.set_size(await maybe_await(f1.size))
+ f2 = (
+ await fs2.open_async(url2, "wb")
+ if hasattr(fs2, "open_async")
+ else fs2.open(url2, "wb", **kw)
+ )
+ while f1.size is None or f2.tell() < f1.size:
+ data = await maybe_await(f1.read(blocksize))
+ if f1.size is None and not data:
+ break
+ await maybe_await(f2.write(data))
+ callback.absolute_update(f2.tell())
+ finally:
+ try:
+ await maybe_await(f2.close())
+ await maybe_await(f1.close())
+ except NameError:
+ # fail while opening f1 or f2
+ pass
+
+ async def _make_many_dirs(self, urls, exist_ok=True):
+ fs = _resolve_fs(urls[0], self.method)
+ if fs.async_impl:
+ coros = [fs._makedirs(u, exist_ok=exist_ok) for u in urls]
+ await _run_coros_in_chunks(coros)
+ else:
+ for u in urls:
+ fs.makedirs(u, exist_ok=exist_ok)
+
make_many_dirs = sync_wrapper(_make_many_dirs)
+
+ async def _copy(
+ self,
+ path1: list[str],
+ path2: list[str],
+ recursive: bool = False,
+ on_error: str = "ignore",
+ maxdepth: Optional[int] = None,
+ batch_size: Optional[int] = None,
+ tempdir: Optional[str] = None,
+ **kwargs,
+ ):
+ if recursive:
+ raise NotImplementedError
+ fs = _resolve_fs(path1[0], self.method)
+ fs2 = _resolve_fs(path2[0], self.method)
+ # not expanding paths atm., assume call is from rsync()
+ if fs is fs2:
+ # pure remote
+ if fs.async_impl:
+ return await fs._copy(path1, path2, **kwargs)
+ else:
+ return fs.copy(path1, path2, **kwargs)
+ await copy_file_op(
+ fs, path1, fs2, path2, tempdir, batch_size, on_error=on_error
+ )
+
+
+async def copy_file_op(
+ fs1, url1, fs2, url2, tempdir=None, batch_size=20, on_error="ignore"
+):
+ import tempfile
+
+ tempdir = tempdir or tempfile.mkdtemp()
+ try:
+ coros = [
+ _copy_file_op(
+ fs1,
+ u1,
+ fs2,
+ u2,
+ os.path.join(tempdir, uuid.uuid4().hex),
+ on_error=on_error,
+ )
+ for u1, u2 in zip(url1, url2)
+ ]
+ await _run_coros_in_chunks(coros, batch_size=batch_size)
+ finally:
+ shutil.rmtree(tempdir)
+
+
+async def _copy_file_op(fs1, url1, fs2, url2, local, on_error="ignore"):
+ ex = () if on_error == "raise" else Exception
+ logger.debug("Copy %s -> %s", url1, url2)
+ try:
+ if fs1.async_impl:
+ await fs1._get_file(url1, local)
+ else:
+ fs1.get_file(url1, local)
+ if fs2.async_impl:
+ await fs2._put_file(local, url2)
+ else:
+ fs2.put_file(local, url2)
+ os.unlink(local)
+ logger.debug("Copy %s -> %s; done", url1, url2)
+ except ex as e:
+ logger.debug("ignoring cp exception for %s: %s", url1, e)
+
+
+async def maybe_await(cor):
+ if inspect.iscoroutine(cor):
+ return await cor
+ else:
+ return cor
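
To illustrate how `rsync` and `GenericFileSystem` fit together, a hedged sketch syncing between two in-memory trees (the paths and file contents are made up; a real use would typically mix protocols, e.g. local to s3):

```python
import fsspec
from fsspec.generic import GenericFileSystem, rsync

# stage a small source tree in memory
mem = fsspec.filesystem("memory")
mem.pipe({"/src/a.txt": b"alpha", "/src/sub/b.txt": b"beta"})

# one-way sync; both URLs resolve through the generic (meta) filesystem
copied = rsync("memory://src", "memory://dst", fs=GenericFileSystem())
print(copied)            # mapping of copied {source: destination}
print(mem.find("/dst"))  # the mirrored files
```
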
diff --git a/fsspec/gui.py b/fsspec/gui.py
index ad74c4c..113317e 100644
--- a/fsspec/gui.py
+++ b/fsspec/gui.py
@@ -4,11 +4,14 @@ import logging
import os
import re
from typing import ClassVar, Sequence
+
import panel as pn
+
from .core import OpenFile, get_filesystem_class, split_protocol
from .registry import known_implementations
+
pn.extension()
-logger = logging.getLogger('fsspec.gui')
+logger = logging.getLogger("fsspec.gui")
class SigSlot:
@@ -22,9 +25,15 @@ class SigSlot:
By default, all signals emit a DEBUG logging statement.
"""
+
+    # names of signals that this class may emit, each of which must be
+    # set by _register for any new instance
signals: ClassVar[Sequence[str]] = []
+ # names of actions that this class may respond to
slots: ClassVar[Sequence[str]] = []
+ # each of which must be a method name
+
def __init__(self):
self._ignoring_events = False
self._sigs = {}
@@ -33,10 +42,12 @@ class SigSlot:
def _setup(self):
"""Create GUI elements and register signals"""
- pass
+ self.panel = pn.pane.PaneBase()
+ # no signals to set up in the base class
- def _register(self, widget, name, thing='value', log_level=logging.
- DEBUG, auto=False):
+ def _register(
+ self, widget, name, thing="value", log_level=logging.DEBUG, auto=False
+ ):
"""Watch the given attribute of a widget and assign it a named event
This is normally called at the time a widget is instantiated, in the
@@ -58,11 +69,32 @@ class SigSlot:
If True, automatically connects with a method in this class of the
same name.
"""
- pass
+ if name not in self.signals:
+ raise ValueError(f"Attempt to assign an undeclared signal: {name}")
+ self._sigs[name] = {
+ "widget": widget,
+ "callbacks": [],
+ "thing": thing,
+ "log": log_level,
+ }
+ wn = "-".join(
+ [
+ getattr(widget, "name", str(widget)) if widget is not None else "none",
+ thing,
+ ]
+ )
+ self._map[wn] = name
+ if widget is not None:
+ widget.param.watch(self._signal, thing, onlychanged=True)
+ if auto and hasattr(self, name):
+ self.connect(name, getattr(self, name))
def _repr_mimebundle_(self, *args, **kwargs):
"""Display in a notebook or a server"""
- pass
+ try:
+ return self.panel._repr_mimebundle_(*args, **kwargs)
+ except (ValueError, AttributeError):
+ raise NotImplementedError("Panel does not seem to be set up properly")
def connect(self, signal, slot):
"""Associate call back with given event
@@ -74,7 +106,7 @@ class SigSlot:
Alternatively, the callback can be a string, in which case it means
emitting the correspondingly-named event (i.e., connect to self)
"""
- pass
+ self._sigs[signal]["callbacks"].append(slot)
def _signal(self, event):
"""This is called by a an action on a widget
@@ -84,7 +116,10 @@ class SigSlot:
Tests can execute this method by directly changing the values of
widget components.
"""
- pass
+ if not self._ignoring_events:
+ wn = "-".join([event.obj.name, event.name])
+ if wn in self._map and self._map[wn] in self._sigs:
+ self._emit(self._map[wn], event.new)
@contextlib.contextmanager
def ignore_events(self):
@@ -92,7 +127,11 @@ class SigSlot:
(does not propagate to children)
"""
- pass
+ self._ignoring_events = True
+ try:
+ yield
+ finally:
+ self._ignoring_events = False
def _emit(self, sig, value=None):
"""An event happened, call its callbacks
@@ -102,22 +141,67 @@ class SigSlot:
Calling of callbacks will halt whenever one returns False.
"""
- pass
+ logger.log(self._sigs[sig]["log"], f"{sig}: {value}")
+ for callback in self._sigs[sig]["callbacks"]:
+ if isinstance(callback, str):
+ self._emit(callback)
+ else:
+ try:
+ # running callbacks should not break the interface
+ ret = callback(value)
+ if ret is False:
+ break
+ except Exception as e:
+ logger.exception(
+ "Exception (%s) while executing callback for signal: %s",
+ e,
+ sig,
+ )
def show(self, threads=False):
"""Open a new browser tab and display this instance's interface"""
- pass
+ self.panel.show(threads=threads, verbose=False)
+ return self
class SingleSelect(SigSlot):
"""A multiselect which only allows you to select one item for an event"""
- signals = ['_selected', 'selected']
- slots = ['set_options', 'set_selection', 'add', 'clear', 'select']
+
+ signals = ["_selected", "selected"] # the first is internal
+ slots = ["set_options", "set_selection", "add", "clear", "select"]
def __init__(self, **kwargs):
self.kwargs = kwargs
super().__init__()
+ def _setup(self):
+ self.panel = pn.widgets.MultiSelect(**self.kwargs)
+ self._register(self.panel, "_selected", "value")
+ self._register(None, "selected")
+ self.connect("_selected", self.select_one)
+
+ def _signal(self, *args, **kwargs):
+ super()._signal(*args, **kwargs)
+
+ def select_one(self, *_):
+ with self.ignore_events():
+ val = [self.panel.value[-1]] if self.panel.value else []
+ self.panel.value = val
+ self._emit("selected", self.panel.value)
+
+ def set_options(self, options):
+ self.panel.options = options
+
+ def clear(self):
+ self.panel.options = []
+
+ @property
+ def value(self):
+ return self.panel.value
+
+ def set_selection(self, selection):
+ self.panel.value = [selection]
+
class FileSelector(SigSlot):
"""Panel-based graphical file selector widget
@@ -125,9 +209,17 @@ class FileSelector(SigSlot):
Instances of this widget are interactive and can be displayed in jupyter by having
them as the output of a cell, or in a separate browser tab using ``.show()``.
"""
- signals = ['protocol_changed', 'selection_changed', 'directory_entered',
- 'home_clicked', 'up_clicked', 'go_clicked', 'filters_changed']
- slots = ['set_filters', 'go_home']
+
+ signals = [
+ "protocol_changed",
+ "selection_changed",
+ "directory_entered",
+ "home_clicked",
+ "up_clicked",
+ "go_clicked",
+ "filters_changed",
+ ]
+ slots = ["set_filters", "go_home"]
def __init__(self, url=None, filters=None, ignore=None, kwargs=None):
"""
@@ -149,31 +241,91 @@ class FileSelector(SigSlot):
if url:
self.init_protocol, url = split_protocol(url)
else:
- self.init_protocol, url = 'file', os.getcwd()
+ self.init_protocol, url = "file", os.getcwd()
self.init_url = url
- self.init_kwargs = (kwargs if isinstance(kwargs, str) else str(kwargs)
- ) or '{}'
+ self.init_kwargs = (kwargs if isinstance(kwargs, str) else str(kwargs)) or "{}"
self.filters = filters
self.ignore = [re.compile(i) for i in ignore or []]
self._fs = None
super().__init__()
+ def _setup(self):
+ self.url = pn.widgets.TextInput(
+ name="url",
+ value=self.init_url,
+ align="end",
+ sizing_mode="stretch_width",
+ width_policy="max",
+ )
+ self.protocol = pn.widgets.Select(
+ options=sorted(known_implementations),
+ value=self.init_protocol,
+ name="protocol",
+ align="center",
+ )
+ self.kwargs = pn.widgets.TextInput(
+ name="kwargs", value=self.init_kwargs, align="center"
+ )
+ self.go = pn.widgets.Button(name="⇨", align="end", width=45)
+ self.main = SingleSelect(size=10)
+ self.home = pn.widgets.Button(name="🏠", width=40, height=30, align="end")
+ self.up = pn.widgets.Button(name="‹", width=30, height=30, align="end")
+
+ self._register(self.protocol, "protocol_changed", auto=True)
+ self._register(self.go, "go_clicked", "clicks", auto=True)
+ self._register(self.up, "up_clicked", "clicks", auto=True)
+ self._register(self.home, "home_clicked", "clicks", auto=True)
+ self._register(None, "selection_changed")
+ self.main.connect("selected", self.selection_changed)
+ self._register(None, "directory_entered")
+ self.prev_protocol = self.protocol.value
+ self.prev_kwargs = self.storage_options
+
+ self.filter_sel = pn.widgets.CheckBoxGroup(
+ value=[], options=[], inline=False, align="end", width_policy="min"
+ )
+ self._register(self.filter_sel, "filters_changed", auto=True)
+
+ self.panel = pn.Column(
+ pn.Row(self.protocol, self.kwargs),
+ pn.Row(self.home, self.up, self.url, self.go, self.filter_sel),
+ self.main.panel,
+ )
+ self.set_filters(self.filters)
+ self.go_clicked()
+
+ def set_filters(self, filters=None):
+ self.filters = filters
+ if filters:
+ self.filter_sel.options = filters
+ self.filter_sel.value = filters
+ else:
+ self.filter_sel.options = []
+ self.filter_sel.value = []
+
@property
def storage_options(self):
"""Value of the kwargs box as a dictionary"""
- pass
+ return ast.literal_eval(self.kwargs.value) or {}
@property
def fs(self):
"""Current filesystem instance"""
- pass
+ if self._fs is None:
+ cls = get_filesystem_class(self.protocol.value)
+ self._fs = cls(**self.storage_options)
+ return self._fs
@property
def urlpath(self):
"""URL of currently selected item"""
- pass
+ return (
+ (f"{self.protocol.value}://{self.main.value[0]}")
+ if self.main.value
+ else None
+ )
- def open_file(self, mode='rb', compression=None, encoding=None):
+ def open_file(self, mode="rb", compression=None, encoding=None):
"""Create OpenFile instance for the currently selected item
For example, in a notebook you might do something like
@@ -197,4 +349,66 @@ class FileSelector(SigSlot):
encoding: str (optional)
If using text mode, use this encoding; defaults to UTF8.
"""
- pass
+ if self.urlpath is None:
+ raise ValueError("No file selected")
+ return OpenFile(self.fs, self.urlpath, mode, compression, encoding)
+
+ def filters_changed(self, values):
+ self.filters = values
+ self.go_clicked()
+
+ def selection_changed(self, *_):
+ if self.urlpath is None:
+ return
+ if self.fs.isdir(self.urlpath):
+ self.url.value = self.fs._strip_protocol(self.urlpath)
+ self.go_clicked()
+
+ def go_clicked(self, *_):
+ if (
+ self.prev_protocol != self.protocol.value
+ or self.prev_kwargs != self.storage_options
+ ):
+ self._fs = None # causes fs to be recreated
+ self.prev_protocol = self.protocol.value
+ self.prev_kwargs = self.storage_options
+ listing = sorted(
+ self.fs.ls(self.url.value, detail=True), key=lambda x: x["name"]
+ )
+ listing = [
+ l
+ for l in listing
+ if not any(i.match(l["name"].rsplit("/", 1)[-1]) for i in self.ignore)
+ ]
+ folders = {
+ "📁 " + o["name"].rsplit("/", 1)[-1]: o["name"]
+ for o in listing
+ if o["type"] == "directory"
+ }
+ files = {
+ "📄 " + o["name"].rsplit("/", 1)[-1]: o["name"]
+ for o in listing
+ if o["type"] == "file"
+ }
+ if self.filters:
+ files = {
+ k: v
+ for k, v in files.items()
+ if any(v.endswith(ext) for ext in self.filters)
+ }
+ self.main.set_options(dict(**folders, **files))
+
+ def protocol_changed(self, *_):
+ self._fs = None
+ self.main.options = []
+ self.url.value = ""
+
+ def home_clicked(self, *_):
+ self.protocol.value = self.init_protocol
+ self.kwargs.value = self.init_kwargs
+ self.url.value = self.init_url
+ self.go_clicked()
+
+ def up_clicked(self, *_):
+ self.url.value = self.fs._parent(self.url.value)
+ self.go_clicked()
diff --git a/fsspec/implementations/arrow.py b/fsspec/implementations/arrow.py
index e065995..f9fea70 100644
--- a/fsspec/implementations/arrow.py
+++ b/fsspec/implementations/arrow.py
@@ -6,8 +6,34 @@ import shutil
from contextlib import suppress
from functools import cached_property, wraps
from urllib.parse import parse_qs
+
from fsspec.spec import AbstractFileSystem
-from fsspec.utils import get_package_version_without_import, infer_storage_options, mirror_from, tokenize
+from fsspec.utils import (
+ get_package_version_without_import,
+ infer_storage_options,
+ mirror_from,
+ tokenize,
+)
+
+
+def wrap_exceptions(func):
+ @wraps(func)
+ def wrapper(*args, **kwargs):
+ try:
+ return func(*args, **kwargs)
+ except OSError as exception:
+ if not exception.args:
+ raise
+
+ message, *args = exception.args
+ if isinstance(message, str) and "does not exist" in message:
+ raise FileNotFoundError(errno.ENOENT, message) from exception
+ else:
+ raise
+
+ return wrapper
+
+
PYARROW_VERSION = None
@@ -19,24 +45,193 @@ class ArrowFSWrapper(AbstractFileSystem):
fs : pyarrow.fs.FileSystem
"""
- root_marker = '/'
+
+ root_marker = "/"
def __init__(self, fs, **kwargs):
global PYARROW_VERSION
- PYARROW_VERSION = get_package_version_without_import('pyarrow')
+ PYARROW_VERSION = get_package_version_without_import("pyarrow")
self.fs = fs
super().__init__(**kwargs)
+ @property
+ def protocol(self):
+ return self.fs.type_name
-@mirror_from('stream', ['read', 'seek', 'tell', 'write', 'readable',
- 'writable', 'close', 'size', 'seekable'])
-class ArrowFile(io.IOBase):
+ @cached_property
+ def fsid(self):
+ return "hdfs_" + tokenize(self.fs.host, self.fs.port)
+
+ @classmethod
+ def _strip_protocol(cls, path):
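+        # e.g. "hdfs://namenode:8020/tmp/data" -> "/tmp/data"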
+ ops = infer_storage_options(path)
+ path = ops["path"]
+ if path.startswith("//"):
+ # special case for "hdfs://path" (without the triple slash)
+ path = path[1:]
+ return path
+
+ def ls(self, path, detail=False, **kwargs):
+ path = self._strip_protocol(path)
+ from pyarrow.fs import FileSelector
+
+ entries = [
+ self._make_entry(entry)
+ for entry in self.fs.get_file_info(FileSelector(path))
+ ]
+ if detail:
+ return entries
+ else:
+ return [entry["name"] for entry in entries]
+
+ def info(self, path, **kwargs):
+ path = self._strip_protocol(path)
+ [info] = self.fs.get_file_info([path])
+ return self._make_entry(info)
+
+ def exists(self, path):
+ path = self._strip_protocol(path)
+ try:
+ self.info(path)
+ except FileNotFoundError:
+ return False
+ else:
+ return True
+
+ def _make_entry(self, info):
+ from pyarrow.fs import FileType
+
+ if info.type is FileType.Directory:
+ kind = "directory"
+ elif info.type is FileType.File:
+ kind = "file"
+ elif info.type is FileType.NotFound:
+ raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), info.path)
+ else:
+ kind = "other"
+
+ return {
+ "name": info.path,
+ "size": info.size,
+ "type": kind,
+ "mtime": info.mtime,
+ }
+
+ @wrap_exceptions
+ def cp_file(self, path1, path2, **kwargs):
+ path1 = self._strip_protocol(path1).rstrip("/")
+ path2 = self._strip_protocol(path2).rstrip("/")
+
+ with self._open(path1, "rb") as lstream:
+ tmp_fname = f"{path2}.tmp.{secrets.token_hex(6)}"
+ try:
+ with self.open(tmp_fname, "wb") as rstream:
+ shutil.copyfileobj(lstream, rstream)
+ self.fs.move(tmp_fname, path2)
+ except BaseException: # noqa
+ with suppress(FileNotFoundError):
+ self.fs.delete_file(tmp_fname)
+ raise
+ @wrap_exceptions
+ def mv(self, path1, path2, **kwargs):
+ path1 = self._strip_protocol(path1).rstrip("/")
+ path2 = self._strip_protocol(path2).rstrip("/")
+ self.fs.move(path1, path2)
+
+ @wrap_exceptions
+ def rm_file(self, path):
+ path = self._strip_protocol(path)
+ self.fs.delete_file(path)
+
+ @wrap_exceptions
+ def rm(self, path, recursive=False, maxdepth=None):
+ path = self._strip_protocol(path).rstrip("/")
+ if self.isdir(path):
+ if recursive:
+ self.fs.delete_dir(path)
+ else:
+                raise ValueError("Can't delete directories without recursive=True")
+ else:
+ self.fs.delete_file(path)
+
+ @wrap_exceptions
+ def _open(self, path, mode="rb", block_size=None, seekable=True, **kwargs):
+ if mode == "rb":
+ if seekable:
+ method = self.fs.open_input_file
+ else:
+ method = self.fs.open_input_stream
+ elif mode == "wb":
+ method = self.fs.open_output_stream
+ elif mode == "ab":
+ method = self.fs.open_append_stream
+ else:
+ raise ValueError(f"unsupported mode for Arrow filesystem: {mode!r}")
+
+ _kwargs = {}
+ if mode != "rb" or not seekable:
+ if int(PYARROW_VERSION.split(".")[0]) >= 4:
+ # disable compression auto-detection
+ _kwargs["compression"] = None
+ stream = method(path, **_kwargs)
+
+ return ArrowFile(self, stream, path, mode, block_size, **kwargs)
+
+ @wrap_exceptions
+ def mkdir(self, path, create_parents=True, **kwargs):
+ path = self._strip_protocol(path)
+ if create_parents:
+ self.makedirs(path, exist_ok=True)
+ else:
+ self.fs.create_dir(path, recursive=False)
+
+ @wrap_exceptions
+ def makedirs(self, path, exist_ok=False):
+ path = self._strip_protocol(path)
+ self.fs.create_dir(path, recursive=True)
+
+ @wrap_exceptions
+ def rmdir(self, path):
+ path = self._strip_protocol(path)
+ self.fs.delete_dir(path)
+
+ @wrap_exceptions
+ def modified(self, path):
+ path = self._strip_protocol(path)
+ return self.fs.get_file_info(path).mtime
+
+ def cat_file(self, path, start=None, end=None, **kwargs):
+ kwargs["seekable"] = start not in [None, 0]
+ return super().cat_file(path, start=None, end=None, **kwargs)
+
+ def get_file(self, rpath, lpath, **kwargs):
+ kwargs["seekable"] = False
+ super().get_file(rpath, lpath, **kwargs)
+
+
+@mirror_from(
+ "stream",
+ [
+ "read",
+ "seek",
+ "tell",
+ "write",
+ "readable",
+ "writable",
+ "close",
+ "size",
+ "seekable",
+ ],
+)
+class ArrowFile(io.IOBase):
def __init__(self, fs, stream, path, mode, block_size=None, **kwargs):
self.path = path
self.mode = mode
+
self.fs = fs
self.stream = stream
+
self.blocksize = self.block_size = block_size
self.kwargs = kwargs
@@ -50,10 +245,19 @@ class ArrowFile(io.IOBase):
class HadoopFileSystem(ArrowFSWrapper):
"""A wrapper on top of the pyarrow.fs.HadoopFileSystem
     to connect its interface with fsspec"""
- protocol = 'hdfs'
- def __init__(self, host='default', port=0, user=None, kerb_ticket=None,
- replication=3, extra_conf=None, **kwargs):
+ protocol = "hdfs"
+
+ def __init__(
+ self,
+ host="default",
+ port=0,
+ user=None,
+ kerb_ticket=None,
+ replication=3,
+ extra_conf=None,
+ **kwargs,
+ ):
"""
Parameters
@@ -72,6 +276,29 @@ class HadoopFileSystem(ArrowFSWrapper):
Passed on to HadoopFileSystem
"""
from pyarrow.fs import HadoopFileSystem
- fs = HadoopFileSystem(host=host, port=port, user=user, kerb_ticket=
- kerb_ticket, replication=replication, extra_conf=extra_conf)
+
+ fs = HadoopFileSystem(
+ host=host,
+ port=port,
+ user=user,
+ kerb_ticket=kerb_ticket,
+ replication=replication,
+ extra_conf=extra_conf,
+ )
super().__init__(fs=fs, **kwargs)
+
+ @staticmethod
+ def _get_kwargs_from_urls(path):
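+        # e.g. "hdfs://alice@namenode:8020/data?replication=2"
+        #      -> {"host": "namenode", "user": "alice", "port": 8020, "replication": 2}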
+ ops = infer_storage_options(path)
+ out = {}
+ if ops.get("host", None):
+ out["host"] = ops["host"]
+ if ops.get("username", None):
+ out["user"] = ops["username"]
+ if ops.get("port", None):
+ out["port"] = ops["port"]
+ if ops.get("url_query", None):
+ queries = parse_qs(ops["url_query"])
+ if queries.get("replication", None):
+ out["replication"] = int(queries["replication"][0])
+ return out
diff --git a/fsspec/implementations/cache_mapper.py b/fsspec/implementations/cache_mapper.py
index 3294867..6e7c7d8 100644
--- a/fsspec/implementations/cache_mapper.py
+++ b/fsspec/implementations/cache_mapper.py
@@ -1,6 +1,8 @@
from __future__ import annotations
+
import abc
import hashlib
+
from fsspec.implementations.local import make_path_posix
@@ -10,13 +12,16 @@ class AbstractCacheMapper(abc.ABC):
"""
@abc.abstractmethod
- def __call__(self, path: str) ->str:
- ...
+ def __call__(self, path: str) -> str: ...
- def __eq__(self, other: object) ->bool:
+ def __eq__(self, other: object) -> bool:
+ # Identity only depends on class. When derived classes have attributes
+ # they will need to be included.
return isinstance(other, type(self))
- def __hash__(self) ->int:
+ def __hash__(self) -> int:
+ # Identity only depends on class. When derived classes have attributes
+ # they will need to be included.
return hash(type(self))
@@ -28,39 +33,43 @@ class BasenameCacheMapper(AbstractCacheMapper):
basename will have the same cached basename.
"""
- def __init__(self, directory_levels: int=0):
+ def __init__(self, directory_levels: int = 0):
if directory_levels < 0:
raise ValueError(
- 'BasenameCacheMapper requires zero or positive directory_levels'
- )
+ "BasenameCacheMapper requires zero or positive directory_levels"
+ )
self.directory_levels = directory_levels
- self._separator = '_@_'
- def __call__(self, path: str) ->str:
+ # Separator for directories when encoded as strings.
+ self._separator = "_@_"
+
+ def __call__(self, path: str) -> str:
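+        # e.g. with directory_levels=1, "/data/sub/file.csv" maps to "sub_@_file.csv"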
path = make_path_posix(path)
- prefix, *bits = path.rsplit('/', self.directory_levels + 1)
+ prefix, *bits = path.rsplit("/", self.directory_levels + 1)
if bits:
return self._separator.join(bits)
else:
- return prefix
+ return prefix # No separator found, simple filename
- def __eq__(self, other: object) ->bool:
- return super().__eq__(other
- ) and self.directory_levels == other.directory_levels
+ def __eq__(self, other: object) -> bool:
+ return super().__eq__(other) and self.directory_levels == other.directory_levels
- def __hash__(self) ->int:
+ def __hash__(self) -> int:
return super().__hash__() ^ hash(self.directory_levels)
class HashCacheMapper(AbstractCacheMapper):
"""Cache mapper that uses a hash of the remote URL."""
- def __call__(self, path: str) ->str:
+ def __call__(self, path: str) -> str:
return hashlib.sha256(path.encode()).hexdigest()
-def create_cache_mapper(same_names: bool) ->AbstractCacheMapper:
+def create_cache_mapper(same_names: bool) -> AbstractCacheMapper:
"""Factory method to create cache mapper for backward compatibility with
``CachingFileSystem`` constructor using ``same_names`` kwarg.
"""
- pass
+ if same_names:
+ return BasenameCacheMapper()
+ else:
+ return HashCacheMapper()
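+    # e.g. create_cache_mapper(True)("/data/a/b.csv") -> "b.csv"
+    #      create_cache_mapper(False)("/data/a/b.csv") -> sha256 hex digest of the path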
diff --git a/fsspec/implementations/cache_metadata.py b/fsspec/implementations/cache_metadata.py
index 9a2c33e..bd9b5cd 100644
--- a/fsspec/implementations/cache_metadata.py
+++ b/fsspec/implementations/cache_metadata.py
@@ -1,18 +1,25 @@
from __future__ import annotations
+
import os
import pickle
import time
from typing import TYPE_CHECKING
+
from fsspec.utils import atomic_write
+
try:
import ujson as json
except ImportError:
if not TYPE_CHECKING:
import json
+
if TYPE_CHECKING:
from typing import Any, Dict, Iterator, Literal
+
from typing_extensions import TypeAlias
+
from .cached import CachingFileSystem
+
Detail: TypeAlias = Dict[str, Any]
@@ -37,22 +44,40 @@ class CacheMetadata:
is stored in the last of these directories by convention.
"""
if not storage:
- raise ValueError(
- 'CacheMetadata expects at least one storage location')
+ raise ValueError("CacheMetadata expects at least one storage location")
+
self._storage = storage
self.cached_files: list[Detail] = [{}]
+
+ # Private attribute to force saving of metadata in pickle format rather than
+        # JSON, for use in tests to confirm that both pickle and JSON formats can be read.
self._force_save_pickle = False
- def _load(self, fn: str) ->Detail:
+ def _load(self, fn: str) -> Detail:
"""Low-level function to load metadata from specific file"""
- pass
-
- def _save(self, metadata_to_save: Detail, fn: str) ->None:
+ try:
+ with open(fn, "r") as f:
+ loaded = json.load(f)
+ except ValueError:
+ with open(fn, "rb") as f:
+ loaded = pickle.load(f)
+ for c in loaded.values():
+ if isinstance(c.get("blocks"), list):
+ c["blocks"] = set(c["blocks"])
+ return loaded
+
+ def _save(self, metadata_to_save: Detail, fn: str) -> None:
"""Low-level function to save metadata to specific file"""
- pass
-
- def _scan_locations(self, writable_only: bool=False) ->Iterator[tuple[
- str, str, bool]]:
+ if self._force_save_pickle:
+ with atomic_write(fn) as f:
+ pickle.dump(metadata_to_save, f)
+ else:
+ with atomic_write(fn, mode="w") as f:
+ json.dump(metadata_to_save, f)
+
+ def _scan_locations(
+ self, writable_only: bool = False
+ ) -> Iterator[tuple[str, str, bool]]:
"""Yield locations (filenames) where metadata is stored, and whether
writable or not.
@@ -65,51 +90,143 @@ class CacheMetadata:
-------
Yields (str, str, bool)
"""
- pass
-
- def check_file(self, path: str, cfs: (CachingFileSystem | None)) ->(Literal
- [False] | tuple[Detail, str]):
+ n = len(self._storage)
+ for i, storage in enumerate(self._storage):
+ writable = i == n - 1
+ if writable_only and not writable:
+ continue
+ yield os.path.join(storage, "cache"), storage, writable
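+        # e.g. storage=["/ro", "/rw"] yields ("/ro/cache", "/ro", False) then ("/rw/cache", "/rw", True)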
+
+ def check_file(
+ self, path: str, cfs: CachingFileSystem | None
+ ) -> Literal[False] | tuple[Detail, str]:
"""If path is in cache return its details, otherwise return ``False``.
If the optional CachingFileSystem is specified then it is used to
perform extra checks to reject possible matches, such as if they are
too old.
"""
- pass
-
- def clear_expired(self, expiry_time: int) ->tuple[list[str], bool]:
+ for (fn, base, _), cache in zip(self._scan_locations(), self.cached_files):
+ if path not in cache:
+ continue
+ detail = cache[path].copy()
+
+ if cfs is not None:
+ if cfs.check_files and detail["uid"] != cfs.fs.ukey(path):
+ # Wrong file as determined by hash of file properties
+ continue
+ if cfs.expiry and time.time() - detail["time"] > cfs.expiry:
+ # Cached file has expired
+ continue
+
+ fn = os.path.join(base, detail["fn"])
+ if os.path.exists(fn):
+ return detail, fn
+ return False
+
+ def clear_expired(self, expiry_time: int) -> tuple[list[str], bool]:
"""Remove expired metadata from the cache.
Returns names of files corresponding to expired metadata and a boolean
flag indicating whether the writable cache is empty. Caller is
responsible for deleting the expired files.
"""
- pass
-
- def load(self) ->None:
+ expired_files = []
+ for path, detail in self.cached_files[-1].copy().items():
+ if time.time() - detail["time"] > expiry_time:
+ fn = detail.get("fn", "")
+ if not fn:
+ raise RuntimeError(
+ f"Cache metadata does not contain 'fn' for {path}"
+ )
+ fn = os.path.join(self._storage[-1], fn)
+ expired_files.append(fn)
+ self.cached_files[-1].pop(path)
+
+ if self.cached_files[-1]:
+ cache_path = os.path.join(self._storage[-1], "cache")
+ self._save(self.cached_files[-1], cache_path)
+
+ writable_cache_empty = not self.cached_files[-1]
+ return expired_files, writable_cache_empty
+
+ def load(self) -> None:
"""Load all metadata from disk and store in ``self.cached_files``"""
- pass
-
- def on_close_cached_file(self, f: Any, path: str) ->None:
+ cached_files = []
+ for fn, _, _ in self._scan_locations():
+ if os.path.exists(fn):
+ # TODO: consolidate blocks here
+ cached_files.append(self._load(fn))
+ else:
+ cached_files.append({})
+ self.cached_files = cached_files or [{}]
+
+ def on_close_cached_file(self, f: Any, path: str) -> None:
"""Perform side-effect actions on closing a cached file.
The actual closing of the file is the responsibility of the caller.
"""
- pass
+        # File must be writable, so in self.cached_files[-1]
+ c = self.cached_files[-1][path]
+ if c["blocks"] is not True and len(c["blocks"]) * f.blocksize >= f.size:
+ c["blocks"] = True
- def pop_file(self, path: str) ->(str | None):
+ def pop_file(self, path: str) -> str | None:
"""Remove metadata of cached file.
If path is in the cache, return the filename of the cached file,
otherwise return ``None``. Caller is responsible for deleting the
cached file.
"""
- pass
-
- def save(self) ->None:
+ details = self.check_file(path, None)
+ if not details:
+ return None
+ _, fn = details
+ if fn.startswith(self._storage[-1]):
+ self.cached_files[-1].pop(path)
+ self.save()
+ else:
+ raise PermissionError(
+ "Can only delete cached file in last, writable cache location"
+ )
+ return fn
+
+ def save(self) -> None:
"""Save metadata to disk"""
- pass
-
- def update_file(self, path: str, detail: Detail) ->None:
+ for (fn, _, writable), cache in zip(self._scan_locations(), self.cached_files):
+ if not writable:
+ continue
+
+ if os.path.exists(fn):
+ cached_files = self._load(fn)
+ for k, c in cached_files.items():
+ if k in cache:
+ if c["blocks"] is True or cache[k]["blocks"] is True:
+ c["blocks"] = True
+ else:
+ # self.cached_files[*][*]["blocks"] must continue to
+ # point to the same set object so that updates
+ # performed by MMapCache are propagated back to
+ # self.cached_files.
+ blocks = cache[k]["blocks"]
+ blocks.update(c["blocks"])
+ c["blocks"] = blocks
+ c["time"] = max(c["time"], cache[k]["time"])
+ c["uid"] = cache[k]["uid"]
+
+ # Files can be added to cache after it was written once
+ for k, c in cache.items():
+ if k not in cached_files:
+ cached_files[k] = c
+ else:
+ cached_files = cache
+ cache = {k: v.copy() for k, v in cached_files.items()}
+ for c in cache.values():
+ if isinstance(c["blocks"], set):
+ c["blocks"] = list(c["blocks"])
+ self._save(cache, fn)
+ self.cached_files[-1] = cached_files
+
+ def update_file(self, path: str, detail: Detail) -> None:
"""Update metadata for specific file in memory, do not save"""
- pass
+ self.cached_files[-1][path] = detail
diff --git a/fsspec/implementations/cached.py b/fsspec/implementations/cached.py
index bd56e3c..447e4f2 100644
--- a/fsspec/implementations/cached.py
+++ b/fsspec/implementations/cached.py
@@ -1,4 +1,5 @@
from __future__ import annotations
+
import inspect
import logging
import os
@@ -7,6 +8,7 @@ import time
import weakref
from shutil import rmtree
from typing import TYPE_CHECKING, Any, Callable, ClassVar
+
from fsspec import AbstractFileSystem, filesystem
from fsspec.callbacks import DEFAULT_CALLBACK
from fsspec.compression import compr
@@ -17,13 +19,23 @@ from fsspec.implementations.cache_metadata import CacheMetadata
from fsspec.spec import AbstractBufferedFile
from fsspec.transaction import Transaction
from fsspec.utils import infer_compression
+
if TYPE_CHECKING:
from fsspec.implementations.cache_mapper import AbstractCacheMapper
-logger = logging.getLogger('fsspec.cached')
+
+logger = logging.getLogger("fsspec.cached")
class WriteCachedTransaction(Transaction):
- pass
+ def complete(self, commit=True):
+ rpaths = [f.path for f in self.files]
+ lpaths = [f.fn for f in self.files]
+ if commit:
+ self.fs.put(lpaths, rpaths)
+ self.files.clear()
+ self.fs._intrans = False
+ self.fs._transaction = None
+ self.fs = None # break cycle
class CachingFileSystem(AbstractFileSystem):
@@ -45,13 +57,23 @@ class CachingFileSystem(AbstractFileSystem):
derived from fsspec.spec.AbstractBufferedFile ; LocalFileSystem is also
allowed, for testing
"""
- protocol: ClassVar[str | tuple[str, ...]] = ('blockcache', 'cached')
- def __init__(self, target_protocol=None, cache_storage='TMP',
- cache_check=10, check_files=False, expiry_time=604800,
- target_options=None, fs=None, same_names: (bool | None)=None,
- compression=None, cache_mapper: (AbstractCacheMapper | None)=None,
- **kwargs):
+ protocol: ClassVar[str | tuple[str, ...]] = ("blockcache", "cached")
+
+ def __init__(
+ self,
+ target_protocol=None,
+ cache_storage="TMP",
+ cache_check=10,
+ check_files=False,
+ expiry_time=604800,
+ target_options=None,
+ fs=None,
+ same_names: bool | None = None,
+ compression=None,
+ cache_mapper: AbstractCacheMapper | None = None,
+ **kwargs,
+ ):
"""
Parameters
@@ -96,19 +118,21 @@ class CachingFileSystem(AbstractFileSystem):
super().__init__(**kwargs)
if fs is None and target_protocol is None:
raise ValueError(
- 'Please provide filesystem instance(fs) or target_protocol')
+ "Please provide filesystem instance(fs) or target_protocol"
+ )
if not (fs is None) ^ (target_protocol is None):
raise ValueError(
- 'Both filesystems (fs) and target_protocol may not be both given.'
- )
- if cache_storage == 'TMP':
+ "Both filesystems (fs) and target_protocol may not be both given."
+ )
+ if cache_storage == "TMP":
tempdir = tempfile.mkdtemp()
storage = [tempdir]
weakref.finalize(self, self._remove_tempdir, tempdir)
- elif isinstance(cache_storage, str):
- storage = [cache_storage]
else:
- storage = cache_storage
+ if isinstance(cache_storage, str):
+ storage = [cache_storage]
+ else:
+ storage = cache_storage
os.makedirs(storage[-1], exist_ok=True)
self.storage = storage
self.kwargs = target_options or {}
@@ -116,51 +140,89 @@ class CachingFileSystem(AbstractFileSystem):
self.check_files = check_files
self.expiry = expiry_time
self.compression = compression
+
+ # Size of cache in bytes. If None then the size is unknown and will be
+ # recalculated the next time cache_size() is called. On writes to the
+ # cache this is reset to None.
self._cache_size = None
+
if same_names is not None and cache_mapper is not None:
raise ValueError(
- 'Cannot specify both same_names and cache_mapper in CachingFileSystem.__init__'
- )
+ "Cannot specify both same_names and cache_mapper in "
+ "CachingFileSystem.__init__"
+ )
if cache_mapper is not None:
self._mapper = cache_mapper
else:
- self._mapper = create_cache_mapper(same_names if same_names is not
- None else False)
- self.target_protocol = target_protocol if isinstance(target_protocol,
- str) else fs.protocol if isinstance(fs.protocol, str
- ) else fs.protocol[0]
+ self._mapper = create_cache_mapper(
+ same_names if same_names is not None else False
+ )
+
+ self.target_protocol = (
+ target_protocol
+ if isinstance(target_protocol, str)
+ else (fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0])
+ )
self._metadata = CacheMetadata(self.storage)
self.load_cache()
- self.fs = fs if fs is not None else filesystem(target_protocol, **
- self.kwargs)
+ self.fs = fs if fs is not None else filesystem(target_protocol, **self.kwargs)
def _strip_protocol(path):
+            # acts as a method, since each instance has a different target
return self.fs._strip_protocol(type(self)._strip_protocol(path))
+
self._strip_protocol: Callable = _strip_protocol
+ @staticmethod
+ def _remove_tempdir(tempdir):
+ try:
+ rmtree(tempdir)
+ except Exception:
+ pass
+
+ def _mkcache(self):
+ os.makedirs(self.storage[-1], exist_ok=True)
+
def cache_size(self):
"""Return size of cache in bytes.
If more than one cache directory is in use, only the size of the last
one (the writable cache directory) is returned.
"""
- pass
+ if self._cache_size is None:
+ cache_dir = self.storage[-1]
+ self._cache_size = filesystem("file").du(cache_dir, withdirs=True)
+ return self._cache_size
def load_cache(self):
"""Read set of stored blocks from file"""
- pass
+ self._metadata.load()
+ self._mkcache()
+ self.last_cache = time.time()
def save_cache(self):
"""Save set of stored blocks from file"""
- pass
+ self._mkcache()
+ self._metadata.save()
+ self.last_cache = time.time()
+ self._cache_size = None
def _check_cache(self):
"""Reload caches if time elapsed or any disappeared"""
- pass
+ self._mkcache()
+ if not self.cache_check:
+ # explicitly told not to bother checking
+ return
+ timecond = time.time() - self.last_cache > self.cache_check
+ existcond = all(os.path.exists(storage) for storage in self.storage)
+ if timecond or not existcond:
+ self.load_cache()
def _check_file(self, path):
"""Is path in cache and still valid"""
- pass
+ path = self._strip_protocol(path)
+ self._check_cache()
+ return self._metadata.check_file(path, self)
def clear_cache(self):
"""Remove all files and metadata from the cache
@@ -168,7 +230,9 @@ class CachingFileSystem(AbstractFileSystem):
In the case of multiple cache locations, this clears only the last one,
which is assumed to be the read/write one.
"""
- pass
+ rmtree(self.storage[-1])
+ self.load_cache()
+ self._cache_size = None
def clear_expired_cache(self, expiry_time=None):
"""Remove all expired files and metadata from the cache
@@ -183,7 +247,22 @@ class CachingFileSystem(AbstractFileSystem):
If not defined the default is equivalent to the attribute from the
file caching instantiation.
"""
- pass
+
+ if not expiry_time:
+ expiry_time = self.expiry
+
+ self._check_cache()
+
+ expired_files, writable_cache_empty = self._metadata.clear_expired(expiry_time)
+ for fn in expired_files:
+ if os.path.exists(fn):
+ os.remove(fn)
+
+ if writable_cache_empty:
+ rmtree(self.storage[-1])
+ self.load_cache()
+
+ self._cache_size = None
def pop_from_cache(self, path):
"""Remove cached version of given file
@@ -192,10 +271,21 @@ class CachingFileSystem(AbstractFileSystem):
location which is not the last, it is assumed to be read-only, and
raises PermissionError
"""
- pass
+ path = self._strip_protocol(path)
+ fn = self._metadata.pop_file(path)
+ if fn is not None:
+ os.remove(fn)
+ self._cache_size = None
- def _open(self, path, mode='rb', block_size=None, autocommit=True,
- cache_options=None, **kwargs):
+ def _open(
+ self,
+ path,
+ mode="rb",
+ block_size=None,
+ autocommit=True,
+ cache_options=None,
+ **kwargs,
+ ):
"""Wrap the target _open
If the whole file exists in the cache, just open it locally and
@@ -208,47 +298,183 @@ class CachingFileSystem(AbstractFileSystem):
We monkey-patch this file, so that when it closes, we call
``close_and_update`` to save the state of the blocks.
"""
- pass
+ path = self._strip_protocol(path)
+
+ path = self.fs._strip_protocol(path)
+ if "r" not in mode:
+ return self.fs._open(
+ path,
+ mode=mode,
+ block_size=block_size,
+ autocommit=autocommit,
+ cache_options=cache_options,
+ **kwargs,
+ )
+ detail = self._check_file(path)
+ if detail:
+ # file is in cache
+ detail, fn = detail
+ hash, blocks = detail["fn"], detail["blocks"]
+ if blocks is True:
+ # stored file is complete
+ logger.debug("Opening local copy of %s", path)
+ return open(fn, mode)
+ # TODO: action where partial file exists in read-only cache
+ logger.debug("Opening partially cached copy of %s", path)
+ else:
+ hash = self._mapper(path)
+ fn = os.path.join(self.storage[-1], hash)
+ blocks = set()
+ detail = {
+ "original": path,
+ "fn": hash,
+ "blocks": blocks,
+ "time": time.time(),
+ "uid": self.fs.ukey(path),
+ }
+ self._metadata.update_file(path, detail)
+ logger.debug("Creating local sparse file for %s", path)
+
+ # call target filesystems open
+ self._mkcache()
+ f = self.fs._open(
+ path,
+ mode=mode,
+ block_size=block_size,
+ autocommit=autocommit,
+ cache_options=cache_options,
+ cache_type="none",
+ **kwargs,
+ )
+ if self.compression:
+ comp = (
+ infer_compression(path)
+ if self.compression == "infer"
+ else self.compression
+ )
+ f = compr[comp](f, mode="rb")
+ if "blocksize" in detail:
+ if detail["blocksize"] != f.blocksize:
+ raise BlocksizeMismatchError(
+ f"Cached file must be reopened with same block"
+ f" size as original (old: {detail['blocksize']},"
+ f" new {f.blocksize})"
+ )
+ else:
+ detail["blocksize"] = f.blocksize
+ f.cache = MMapCache(f.blocksize, f._fetch_range, f.size, fn, blocks)
+ close = f.close
+ f.close = lambda: self.close_and_update(f, close)
+ self.save_cache()
+ return f
+
+ def _parent(self, path):
+ return self.fs._parent(path)
+
+ def hash_name(self, path: str, *args: Any) -> str:
+ # Kept for backward compatibility with downstream libraries.
+ # Ignores extra arguments, previously same_name boolean.
+ return self._mapper(path)
def close_and_update(self, f, close):
"""Called when a file is closing, so store the set of blocks"""
- pass
+ if f.closed:
+ return
+ path = self._strip_protocol(f.path)
+ self._metadata.on_close_cached_file(f, path)
+ try:
+ logger.debug("going to save")
+ self.save_cache()
+ logger.debug("saved")
+ except OSError:
+ logger.debug("Cache saving failed while closing file")
+ except NameError:
+ logger.debug("Cache save failed due to interpreter shutdown")
+ close()
+ f.closed = True
+
+ def ls(self, path, detail=True):
+ return self.fs.ls(path, detail)
def __getattribute__(self, item):
- if item in {'load_cache', '_open', 'save_cache', 'close_and_update',
- '__init__', '__getattribute__', '__reduce__',
- '_make_local_details', 'open', 'cat', 'cat_file', 'cat_ranges',
- 'get', 'read_block', 'tail', 'head', 'info', 'ls', 'exists',
- 'isfile', 'isdir', '_check_file', '_check_cache', '_mkcache',
- 'clear_cache', 'clear_expired_cache', 'pop_from_cache',
- 'local_file', '_paths_from_path', 'get_mapper', 'open_many',
- 'commit_many', 'hash_name', '__hash__', '__eq__', 'to_json',
- 'to_dict', 'cache_size', 'pipe_file', 'pipe',
- 'start_transaction', 'end_transaction'}:
+ if item in {
+ "load_cache",
+ "_open",
+ "save_cache",
+ "close_and_update",
+ "__init__",
+ "__getattribute__",
+ "__reduce__",
+ "_make_local_details",
+ "open",
+ "cat",
+ "cat_file",
+ "cat_ranges",
+ "get",
+ "read_block",
+ "tail",
+ "head",
+ "info",
+ "ls",
+ "exists",
+ "isfile",
+ "isdir",
+ "_check_file",
+ "_check_cache",
+ "_mkcache",
+ "clear_cache",
+ "clear_expired_cache",
+ "pop_from_cache",
+ "local_file",
+ "_paths_from_path",
+ "get_mapper",
+ "open_many",
+ "commit_many",
+ "hash_name",
+ "__hash__",
+ "__eq__",
+ "to_json",
+ "to_dict",
+ "cache_size",
+ "pipe_file",
+ "pipe",
+ "start_transaction",
+ "end_transaction",
+ }:
+ # all the methods defined in this class. Note `open` here, since
+ # it calls `_open`, but is actually in superclass
return lambda *args, **kw: getattr(type(self), item).__get__(self)(
- *args, **kw)
- if item in ['__reduce_ex__']:
+ *args, **kw
+ )
+ if item in ["__reduce_ex__"]:
raise AttributeError
- if item in ['transaction']:
+ if item in ["transaction"]:
+ # property
return type(self).transaction.__get__(self)
- if item in ['_cache', 'transaction_type']:
+ if item in ["_cache", "transaction_type"]:
+ # class attributes
return getattr(type(self), item)
- if item == '__class__':
+ if item == "__class__":
return type(self)
- d = object.__getattribute__(self, '__dict__')
- fs = d.get('fs', None)
+ d = object.__getattribute__(self, "__dict__")
+ fs = d.get("fs", None) # fs is not immediately defined
if item in d:
return d[item]
elif fs is not None:
if item in fs.__dict__:
+ # attribute of instance
return fs.__dict__[item]
+            # attribute belonging to the target filesystem
cls = type(fs)
m = getattr(cls, item)
if (inspect.isfunction(m) or inspect.isdatadescriptor(m)) and (
- not hasattr(m, '__self__') or m.__self__ is None):
+ not hasattr(m, "__self__") or m.__self__ is None
+ ):
+ # instance method
return m.__get__(fs, cls)
- return m
+ return m # class method or attribute
else:
+ # attributes of the superclass, while target is being set up
return super().__getattribute__(item)
def __eq__(self, other):
@@ -257,18 +483,29 @@ class CachingFileSystem(AbstractFileSystem):
return True
if not isinstance(other, type(self)):
return False
- return (self.storage == other.storage and self.kwargs == other.
- kwargs and self.cache_check == other.cache_check and self.
- check_files == other.check_files and self.expiry == other.
- expiry and self.compression == other.compression and self.
- _mapper == other._mapper and self.target_protocol == other.
- target_protocol)
+ return (
+ self.storage == other.storage
+ and self.kwargs == other.kwargs
+ and self.cache_check == other.cache_check
+ and self.check_files == other.check_files
+ and self.expiry == other.expiry
+ and self.compression == other.compression
+ and self._mapper == other._mapper
+ and self.target_protocol == other.target_protocol
+ )
def __hash__(self):
"""Calculate hash."""
- return hash(tuple(self.storage)) ^ hash(str(self.kwargs)) ^ hash(self
- .cache_check) ^ hash(self.check_files) ^ hash(self.expiry) ^ hash(
- self.compression) ^ hash(self._mapper) ^ hash(self.target_protocol)
+ return (
+ hash(tuple(self.storage))
+ ^ hash(str(self.kwargs))
+ ^ hash(self.cache_check)
+ ^ hash(self.check_files)
+ ^ hash(self.expiry)
+ ^ hash(self.compression)
+ ^ hash(self._mapper)
+ ^ hash(self.target_protocol)
+ )
class WholeFileCacheFileSystem(CachingFileSystem):
@@ -284,9 +521,192 @@ class WholeFileCacheFileSystem(CachingFileSystem):
The class still needs access to the remote store for listing files,
and may refresh cached files.
"""
- protocol = 'filecache'
+
+ protocol = "filecache"
local_file = True
+ def open_many(self, open_files, **kwargs):
+ paths = [of.path for of in open_files]
+ if "r" in open_files.mode:
+ self._mkcache()
+ else:
+ return [
+ LocalTempFile(
+ self.fs,
+ path,
+ mode=open_files.mode,
+ fn=os.path.join(self.storage[-1], self._mapper(path)),
+ **kwargs,
+ )
+ for path in paths
+ ]
+
+ if self.compression:
+ raise NotImplementedError
+ details = [self._check_file(sp) for sp in paths]
+ downpath = [p for p, d in zip(paths, details) if not d]
+ downfn0 = [
+ os.path.join(self.storage[-1], self._mapper(p))
+ for p, d in zip(paths, details)
+ ] # keep these path names for opening later
+ downfn = [fn for fn, d in zip(downfn0, details) if not d]
+ if downpath:
+ # skip if all files are already cached and up to date
+ self.fs.get(downpath, downfn)
+
+ # update metadata - only happens when downloads are successful
+ newdetail = [
+ {
+ "original": path,
+ "fn": self._mapper(path),
+ "blocks": True,
+ "time": time.time(),
+ "uid": self.fs.ukey(path),
+ }
+ for path in downpath
+ ]
+ for path, detail in zip(downpath, newdetail):
+ self._metadata.update_file(path, detail)
+ self.save_cache()
+
+ def firstpart(fn):
+ # helper to adapt both whole-file and simple-cache
+ return fn[1] if isinstance(fn, tuple) else fn
+
+ return [
+ open(firstpart(fn0) if fn0 else fn1, mode=open_files.mode)
+ for fn0, fn1 in zip(details, downfn0)
+ ]
+
+ def commit_many(self, open_files):
+ self.fs.put([f.fn for f in open_files], [f.path for f in open_files])
+ [f.close() for f in open_files]
+ for f in open_files:
+ # in case autocommit is off, and so close did not already delete
+ try:
+ os.remove(f.name)
+ except FileNotFoundError:
+ pass
+ self._cache_size = None
+
+ def _make_local_details(self, path):
+ hash = self._mapper(path)
+ fn = os.path.join(self.storage[-1], hash)
+ detail = {
+ "original": path,
+ "fn": hash,
+ "blocks": True,
+ "time": time.time(),
+ "uid": self.fs.ukey(path),
+ }
+ self._metadata.update_file(path, detail)
+ logger.debug("Copying %s to local cache", path)
+ return fn
+
+ def cat(
+ self,
+ path,
+ recursive=False,
+ on_error="raise",
+ callback=DEFAULT_CALLBACK,
+ **kwargs,
+ ):
+ paths = self.expand_path(
+ path, recursive=recursive, maxdepth=kwargs.get("maxdepth", None)
+ )
+ getpaths = []
+ storepaths = []
+ fns = []
+ out = {}
+ for p in paths.copy():
+ try:
+ detail = self._check_file(p)
+ if not detail:
+ fn = self._make_local_details(p)
+ getpaths.append(p)
+ storepaths.append(fn)
+ else:
+ detail, fn = detail if isinstance(detail, tuple) else (None, detail)
+ fns.append(fn)
+ except Exception as e:
+ if on_error == "raise":
+ raise
+ if on_error == "return":
+ out[p] = e
+ paths.remove(p)
+
+ if getpaths:
+ self.fs.get(getpaths, storepaths)
+ self.save_cache()
+
+ callback.set_size(len(paths))
+ for p, fn in zip(paths, fns):
+ with open(fn, "rb") as f:
+ out[p] = f.read()
+ callback.relative_update(1)
+ if isinstance(path, str) and len(paths) == 1 and recursive is False:
+ out = out[paths[0]]
+ return out
+
+ def _open(self, path, mode="rb", **kwargs):
+ path = self._strip_protocol(path)
+ if "r" not in mode:
+ hash = self._mapper(path)
+ fn = os.path.join(self.storage[-1], hash)
+ user_specified_kwargs = {
+ k: v
+ for k, v in kwargs.items()
+ # those kwargs were added by open(), we don't want them
+ if k not in ["autocommit", "block_size", "cache_options"]
+ }
+ return LocalTempFile(self, path, mode=mode, fn=fn, **user_specified_kwargs)
+ detail = self._check_file(path)
+ if detail:
+ detail, fn = detail
+ _, blocks = detail["fn"], detail["blocks"]
+ if blocks is True:
+ logger.debug("Opening local copy of %s", path)
+
+ # In order to support downstream filesystems to be able to
+ # infer the compression from the original filename, like
+ # the `TarFileSystem`, let's extend the `io.BufferedReader`
+ # fileobject protocol by adding a dedicated attribute
+ # `original`.
+ f = open(fn, mode)
+ f.original = detail.get("original")
+ return f
+ else:
+ raise ValueError(
+ f"Attempt to open partially cached file {path}"
+ f" as a wholly cached file"
+ )
+ else:
+ fn = self._make_local_details(path)
+ kwargs["mode"] = mode
+
+ # call target filesystems open
+ self._mkcache()
+ if self.compression:
+ with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2:
+ if isinstance(f, AbstractBufferedFile):
+ # want no type of caching if just downloading whole thing
+ f.cache = BaseCache(0, f.cache.fetcher, f.size)
+ comp = (
+ infer_compression(path)
+ if self.compression == "infer"
+ else self.compression
+ )
+ f = compr[comp](f, mode="rb")
+ data = True
+ while data:
+ block = getattr(f, "blocksize", 5 * 2**20)
+ data = f.read(block)
+ f2.write(data)
+ else:
+ self.fs.get_file(path, fn)
+ self.save_cache()
+ return self._open(path, mode)
+
class SimpleCacheFileSystem(WholeFileCacheFileSystem):
"""Caches whole remote files on first access
@@ -303,25 +723,159 @@ class SimpleCacheFileSystem(WholeFileCacheFileSystem):
not checked until that time.
"""
- protocol = 'simplecache'
+
+ protocol = "simplecache"
local_file = True
transaction_type = WriteCachedTransaction
def __init__(self, **kwargs):
kw = kwargs.copy()
- for key in ['cache_check', 'expiry_time', 'check_files']:
+ for key in ["cache_check", "expiry_time", "check_files"]:
kw[key] = False
super().__init__(**kw)
for storage in self.storage:
if not os.path.exists(storage):
os.makedirs(storage, exist_ok=True)
+ def _check_file(self, path):
+ self._check_cache()
+ sha = self._mapper(path)
+ for storage in self.storage:
+ fn = os.path.join(storage, sha)
+ if os.path.exists(fn):
+ return fn
+
+ def save_cache(self):
+ pass
+
+ def load_cache(self):
+ pass
+
+ def pipe_file(self, path, value=None, **kwargs):
+ if self._intrans:
+ with self.open(path, "wb") as f:
+ f.write(value)
+ else:
+ super().pipe_file(path, value)
+
+ def ls(self, path, detail=True, **kwargs):
+ path = self._strip_protocol(path)
+ details = []
+ try:
+ details = self.fs.ls(
+ path, detail=True, **kwargs
+ ).copy() # don't edit original!
+ except FileNotFoundError as e:
+ ex = e
+ else:
+ ex = None
+ if self._intrans:
+ path1 = path.rstrip("/") + "/"
+ for f in self.transaction.files:
+ if f.path == path:
+ details.append(
+ {"name": path, "size": f.size or f.tell(), "type": "file"}
+ )
+ elif f.path.startswith(path1):
+ if f.path.count("/") == path1.count("/"):
+ details.append(
+ {"name": f.path, "size": f.size or f.tell(), "type": "file"}
+ )
+ else:
+ dname = "/".join(f.path.split("/")[: path1.count("/") + 1])
+ details.append({"name": dname, "size": 0, "type": "directory"})
+ if ex is not None and not details:
+ raise ex
+ if detail:
+ return details
+ return sorted(_["name"] for _ in details)
+
+ def info(self, path, **kwargs):
+ path = self._strip_protocol(path)
+ if self._intrans:
+ f = [_ for _ in self.transaction.files if _.path == path]
+ if f:
+ size = os.path.getsize(f[0].fn) if f[0].closed else f[0].tell()
+ return {"name": path, "size": size, "type": "file"}
+ f = any(_.path.startswith(path + "/") for _ in self.transaction.files)
+ if f:
+ return {"name": path, "size": 0, "type": "directory"}
+ return self.fs.info(path, **kwargs)
+
+ def pipe(self, path, value=None, **kwargs):
+ if isinstance(path, str):
+ self.pipe_file(self._strip_protocol(path), value, **kwargs)
+ elif isinstance(path, dict):
+ for k, v in path.items():
+ self.pipe_file(self._strip_protocol(k), v, **kwargs)
+ else:
+ raise ValueError("path must be str or dict")
+
+ def cat_ranges(
+ self, paths, starts, ends, max_gap=None, on_error="return", **kwargs
+ ):
+ lpaths = [self._check_file(p) for p in paths]
+ rpaths = [p for l, p in zip(lpaths, paths) if l is False]
+ lpaths = [l for l, p in zip(lpaths, paths) if l is False]
+ self.fs.get(rpaths, lpaths)
+ return super().cat_ranges(
+ paths, starts, ends, max_gap=max_gap, on_error=on_error, **kwargs
+ )
+
+ def _open(self, path, mode="rb", **kwargs):
+ path = self._strip_protocol(path)
+ sha = self._mapper(path)
+
+ if "r" not in mode:
+ fn = os.path.join(self.storage[-1], sha)
+ user_specified_kwargs = {
+ k: v
+ for k, v in kwargs.items()
+ if k not in ["autocommit", "block_size", "cache_options"]
+ } # those were added by open()
+ return LocalTempFile(
+ self,
+ path,
+ mode=mode,
+ autocommit=not self._intrans,
+ fn=fn,
+ **user_specified_kwargs,
+ )
+ fn = self._check_file(path)
+ if fn:
+ return open(fn, mode)
+
+ fn = os.path.join(self.storage[-1], sha)
+ logger.debug("Copying %s to local cache", path)
+ kwargs["mode"] = mode
+
+ self._mkcache()
+ self._cache_size = None
+ if self.compression:
+ with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2:
+ if isinstance(f, AbstractBufferedFile):
+ # want no type of caching if just downloading whole thing
+ f.cache = BaseCache(0, f.cache.fetcher, f.size)
+ comp = (
+ infer_compression(path)
+ if self.compression == "infer"
+ else self.compression
+ )
+ f = compr[comp](f, mode="rb")
+ data = True
+ while data:
+ block = getattr(f, "blocksize", 5 * 2**20)
+ data = f.read(block)
+ f2.write(data)
+ else:
+ self.fs.get_file(path, fn)
+ return self._open(path, mode)
+
class LocalTempFile:
"""A temporary local file, which will be uploaded on commit"""
- def __init__(self, fs, path, fn, mode='wb', autocommit=True, seek=0, **
- kwargs):
+ def __init__(self, fs, path, fn, mode="wb", autocommit=True, seek=0, **kwargs):
self.fn = fn
self.fh = open(fn, mode)
self.mode = mode
@@ -335,8 +889,11 @@ class LocalTempFile:
self.kwargs = kwargs
def __reduce__(self):
- return LocalTempFile, (self.fs, self.path, self.fn, 'r+b', self.
- autocommit, self.tell())
+ # always open in r+b to allow continuing writing at a location
+ return (
+ LocalTempFile,
+ (self.fs, self.path, self.fn, "r+b", self.autocommit, self.tell()),
+ )
def __enter__(self):
return self.fh
@@ -344,8 +901,29 @@ class LocalTempFile:
def __exit__(self, exc_type, exc_val, exc_tb):
self.close()
- def __repr__(self) ->str:
- return f'LocalTempFile: {self.path}'
+ def close(self):
+ # self.size = self.fh.tell()
+ if self.closed:
+ return
+ self.fh.close()
+ self.closed = True
+ if self.autocommit:
+ self.commit()
+
+ def discard(self):
+ self.fh.close()
+ os.remove(self.fn)
+
+ def commit(self):
+ self.fs.put(self.fn, self.path, **self.kwargs)
+ # we do not delete local copy - it's still in the cache
+
+ @property
+ def name(self):
+ return self.fn
+
+ def __repr__(self) -> str:
+ return f"LocalTempFile: {self.path}"
def __getattr__(self, item):
return getattr(self.fh, item)
diff --git a/fsspec/implementations/dask.py b/fsspec/implementations/dask.py
index ead2260..3e12764 100644
--- a/fsspec/implementations/dask.py
+++ b/fsspec/implementations/dask.py
@@ -1,11 +1,26 @@
import dask
from distributed.client import Client, _get_global_client
from distributed.worker import Worker
+
from fsspec import filesystem
from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
from fsspec.utils import infer_storage_options
+def _get_client(client):
+ if client is None:
+ return _get_global_client()
+ elif isinstance(client, Client):
+ return client
+ else:
+ # e.g., connection string
+ return Client(client)
+
+
+def _in_worker():
+ return bool(Worker._instances)
+
+
class DaskWorkerFileSystem(AbstractFileSystem):
"""View files accessible to a worker as any other remote file-system
@@ -15,13 +30,15 @@ class DaskWorkerFileSystem(AbstractFileSystem):
**Warning** this implementation is experimental, and read-only for now.
"""
- def __init__(self, target_protocol=None, target_options=None, fs=None,
- client=None, **kwargs):
+ def __init__(
+ self, target_protocol=None, target_options=None, fs=None, client=None, **kwargs
+ ):
super().__init__(**kwargs)
if not (fs is None) ^ (target_protocol is None):
raise ValueError(
- 'Please provide one of filesystem instance (fs) or target_protocol, not both'
- )
+ "Please provide one of filesystem instance (fs) or"
+ " target_protocol, not both"
+ )
self.target_protocol = target_protocol
self.target_options = target_options
self.worker = None
@@ -29,19 +46,107 @@ class DaskWorkerFileSystem(AbstractFileSystem):
self.fs = fs
self._determine_worker()
+ @staticmethod
+ def _get_kwargs_from_urls(path):
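+        # e.g. "dask://scheduler:8786/data.csv" -> {"client": "scheduler:8786"}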
+ so = infer_storage_options(path)
+ if "host" in so and "port" in so:
+ return {"client": f"{so['host']}:{so['port']}"}
+ else:
+ return {}
-class DaskFile(AbstractBufferedFile):
+ def _determine_worker(self):
+ if _in_worker():
+ self.worker = True
+ if self.fs is None:
+ self.fs = filesystem(
+ self.target_protocol, **(self.target_options or {})
+ )
+ else:
+ self.worker = False
+ self.client = _get_client(self.client)
+ self.rfs = dask.delayed(self)
- def __init__(self, mode='rb', **kwargs):
- if mode != 'rb':
- raise ValueError(
- 'Remote dask files can only be opened in "rb" mode')
+ def mkdir(self, *args, **kwargs):
+ if self.worker:
+ self.fs.mkdir(*args, **kwargs)
+ else:
+ self.rfs.mkdir(*args, **kwargs).compute()
+
+ def rm(self, *args, **kwargs):
+ if self.worker:
+ self.fs.rm(*args, **kwargs)
+ else:
+ self.rfs.rm(*args, **kwargs).compute()
+
+ def copy(self, *args, **kwargs):
+ if self.worker:
+ self.fs.copy(*args, **kwargs)
+ else:
+ self.rfs.copy(*args, **kwargs).compute()
+
+ def mv(self, *args, **kwargs):
+ if self.worker:
+ self.fs.mv(*args, **kwargs)
+ else:
+ self.rfs.mv(*args, **kwargs).compute()
+
+ def ls(self, *args, **kwargs):
+ if self.worker:
+ return self.fs.ls(*args, **kwargs)
+ else:
+ return self.rfs.ls(*args, **kwargs).compute()
+
+ def _open(
+ self,
+ path,
+ mode="rb",
+ block_size=None,
+ autocommit=True,
+ cache_options=None,
+ **kwargs,
+ ):
+ if self.worker:
+ return self.fs._open(
+ path,
+ mode=mode,
+ block_size=block_size,
+ autocommit=autocommit,
+ cache_options=cache_options,
+ **kwargs,
+ )
+ else:
+ return DaskFile(
+ fs=self,
+ path=path,
+ mode=mode,
+ block_size=block_size,
+ autocommit=autocommit,
+ cache_options=cache_options,
+ **kwargs,
+ )
+
+ def fetch_range(self, path, mode, start, end):
+ if self.worker:
+ with self._open(path, mode) as f:
+ f.seek(start)
+ return f.read(end - start)
+ else:
+ return self.rfs.fetch_range(path, mode, start, end).compute()
+
+
+class DaskFile(AbstractBufferedFile):
+ def __init__(self, mode="rb", **kwargs):
+ if mode != "rb":
+ raise ValueError('Remote dask files can only be opened in "rb" mode')
super().__init__(**kwargs)
+ def _upload_chunk(self, final=False):
+ pass
+
def _initiate_upload(self):
"""Create remote file/upload"""
pass
def _fetch_range(self, start, end):
"""Get the specified set of bytes from remote"""
- pass
+ return self.fs.fetch_range(self.path, self.mode, start, end)
diff --git a/fsspec/implementations/data.py b/fsspec/implementations/data.py
index 77435f6..5190323 100644
--- a/fsspec/implementations/data.py
+++ b/fsspec/implementations/data.py
@@ -2,6 +2,7 @@ import base64
import io
from typing import Optional
from urllib.parse import unquote
+
from fsspec import AbstractFileSystem
@@ -16,16 +17,42 @@ class DataFileSystem(AbstractFileSystem):
See https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs
"""
- protocol = 'data'
+
+ protocol = "data"
def __init__(self, **kwargs):
"""No parameters for this filesystem"""
super().__init__(**kwargs)
+ def cat_file(self, path, start=None, end=None, **kwargs):
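+        # e.g. "data:text/plain;base64,aGVsbG8=" -> b"hello"
+        #      "data:,Hello%20World"             -> b"Hello World"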
+ pref, data = path.split(",", 1)
+ if pref.endswith("base64"):
+ return base64.b64decode(data)[start:end]
+ return unquote(data).encode()[start:end]
+
+ def info(self, path, **kwargs):
+ pref, name = path.split(",", 1)
+ data = self.cat_file(path)
+ mime = pref.split(":", 1)[1].split(";", 1)[0]
+ return {"name": name, "size": len(data), "type": "file", "mimetype": mime}
+
+ def _open(
+ self,
+ path,
+ mode="rb",
+ block_size=None,
+ autocommit=True,
+ cache_options=None,
+ **kwargs,
+ ):
+ if "r" not in mode:
+ raise ValueError("Read only filesystem")
+ return io.BytesIO(self.cat_file(path))
+
@staticmethod
- def encode(data: bytes, mime: Optional[str]=None):
+ def encode(data: bytes, mime: Optional[str] = None):
"""Format the given data into data-URL syntax
This version always base64 encodes, even when the data is ascii/url-safe.
"""
- pass
+ return f"data:{mime or ''};base64,{base64.b64encode(data).decode()}"
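+        # e.g. encode(b"hello", "text/plain") -> "data:text/plain;base64,aGVsbG8="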
diff --git a/fsspec/implementations/dbfs.py b/fsspec/implementations/dbfs.py
index bbf4358..ce9f9ea 100644
--- a/fsspec/implementations/dbfs.py
+++ b/fsspec/implementations/dbfs.py
@@ -1,8 +1,10 @@
import base64
import urllib
+
import requests
import requests.exceptions
from requests.adapters import HTTPAdapter, Retry
+
from fsspec import AbstractFileSystem
from fsspec.spec import AbstractBufferedFile
@@ -15,6 +17,7 @@ class DatabricksException(Exception):
def __init__(self, error_code, message):
"""Create a new DatabricksException"""
super().__init__(message)
+
self.error_code = error_code
self.message = message
@@ -42,10 +45,15 @@ class DatabricksFileSystem(AbstractFileSystem):
self.instance = instance
self.token = token
self.session = requests.Session()
- self.retries = Retry(total=10, backoff_factor=0.05,
- status_forcelist=[408, 429, 500, 502, 503, 504])
- self.session.mount('https://', HTTPAdapter(max_retries=self.retries))
- self.session.headers.update({'Authorization': f'Bearer {self.token}'})
+ self.retries = Retry(
+ total=10,
+ backoff_factor=0.05,
+ status_forcelist=[408, 429, 500, 502, 503, 504],
+ )
+
+ self.session.mount("https://", HTTPAdapter(max_retries=self.retries))
+ self.session.headers.update({"Authorization": f"Bearer {self.token}"})
+
super().__init__(**kwargs)
def ls(self, path, detail=True, **kwargs):
@@ -61,7 +69,31 @@ class DatabricksFileSystem(AbstractFileSystem):
but also additional information on file sizes
and types.
"""
- pass
+ out = self._ls_from_cache(path)
+ if not out:
+ try:
+ r = self._send_to_api(
+ method="get", endpoint="list", json={"path": path}
+ )
+ except DatabricksException as e:
+ if e.error_code == "RESOURCE_DOES_NOT_EXIST":
+ raise FileNotFoundError(e.message)
+
+ raise e
+ files = r["files"]
+ out = [
+ {
+ "name": o["path"],
+ "type": "directory" if o["is_dir"] else "file",
+ "size": o["file_size"],
+ }
+ for o in files
+ ]
+ self.dircache[path] = out
+
+ if detail:
+ return out
+ return [o["name"] for o in out]
def makedirs(self, path, exist_ok=True):
"""
@@ -76,7 +108,25 @@ class DatabricksFileSystem(AbstractFileSystem):
exists before creating it (and raises an
Exception if this is the case)
"""
- pass
+ if not exist_ok:
+ try:
+ # If the following succeeds, the path is already present
+ self._send_to_api(
+ method="get", endpoint="get-status", json={"path": path}
+ )
+ raise FileExistsError(f"Path {path} already exists")
+ except DatabricksException as e:
+ if e.error_code == "RESOURCE_DOES_NOT_EXIST":
+ pass
+
+ try:
+ self._send_to_api(method="post", endpoint="mkdirs", json={"path": path})
+ except DatabricksException as e:
+ if e.error_code == "RESOURCE_ALREADY_EXISTS":
+ raise FileExistsError(e.message)
+
+ raise e
+ self.invalidate_cache(self._parent(path))
def mkdir(self, path, create_parents=True, **kwargs):
"""
@@ -90,7 +140,10 @@ class DatabricksFileSystem(AbstractFileSystem):
Whether to create all parents or not.
"False" is not implemented so far.
"""
- pass
+ if not create_parents:
+ raise NotImplementedError
+
+ self.mkdirs(path, **kwargs)
def rm(self, path, recursive=False, **kwargs):
"""
@@ -103,10 +156,27 @@ class DatabricksFileSystem(AbstractFileSystem):
recursive: bool
Recursively delete all files in a folder.
"""
- pass
-
- def mv(self, source_path, destination_path, recursive=False, maxdepth=
- None, **kwargs):
+ try:
+ self._send_to_api(
+ method="post",
+ endpoint="delete",
+ json={"path": path, "recursive": recursive},
+ )
+ except DatabricksException as e:
+ # This is not really an exception, it just means
+ # not everything was deleted so far
+ if e.error_code == "PARTIAL_DELETE":
+ self.rm(path=path, recursive=recursive)
+ elif e.error_code == "IO_ERROR":
+ # Using the same exception as the os module would use here
+ raise OSError(e.message)
+
+ raise e
+ self.invalidate_cache(self._parent(path))
+
+ def mv(
+ self, source_path, destination_path, recursive=False, maxdepth=None, **kwargs
+ ):
"""
Move a source to a destination path.
@@ -129,16 +199,35 @@ class DatabricksFileSystem(AbstractFileSystem):
maxdepth:
             Not implemented so far.
"""
- pass
-
- def _open(self, path, mode='rb', block_size='default', **kwargs):
+ if recursive:
+ raise NotImplementedError
+ if maxdepth:
+ raise NotImplementedError
+
+ try:
+ self._send_to_api(
+ method="post",
+ endpoint="move",
+ json={"source_path": source_path, "destination_path": destination_path},
+ )
+ except DatabricksException as e:
+ if e.error_code == "RESOURCE_DOES_NOT_EXIST":
+ raise FileNotFoundError(e.message)
+ elif e.error_code == "RESOURCE_ALREADY_EXISTS":
+ raise FileExistsError(e.message)
+
+ raise e
+ self.invalidate_cache(self._parent(source_path))
+ self.invalidate_cache(self._parent(destination_path))
+
+ def _open(self, path, mode="rb", block_size="default", **kwargs):
"""
Overwrite the base class method to make sure to create a DBFile.
All arguments are copied from the base method.
Only the default blocksize is allowed.
"""
- pass
+ return DatabricksFile(self, path, mode=mode, block_size=block_size, **kwargs)
def _send_to_api(self, method, endpoint, json):
"""
@@ -154,7 +243,32 @@ class DatabricksFileSystem(AbstractFileSystem):
json: dict
Dictionary of information to send
"""
- pass
+ if method == "post":
+ session_call = self.session.post
+ elif method == "get":
+ session_call = self.session.get
+ else:
+ raise ValueError(f"Do not understand method {method}")
+
+ url = urllib.parse.urljoin(f"https://{self.instance}/api/2.0/dbfs/", endpoint)
+
+ r = session_call(url, json=json)
+
+        # The DBFS API returns JSON, also in the case of an exception.
+        # We want to preserve this information as well as possible.
+ try:
+ r.raise_for_status()
+ except requests.HTTPError as e:
+ # try to extract json error message
+ # if that fails, fall back to the original exception
+ try:
+ exception_json = e.response.json()
+ except Exception:
+ raise e
+
+ raise DatabricksException(**exception_json)
+
+ return r.json()
def _create_handle(self, path, overwrite=True):
"""
@@ -174,7 +288,18 @@ class DatabricksFileSystem(AbstractFileSystem):
If a file already exist at this location, either overwrite
it or raise an exception.
"""
- pass
+ try:
+ r = self._send_to_api(
+ method="post",
+ endpoint="create",
+ json={"path": path, "overwrite": overwrite},
+ )
+ return r["handle"]
+ except DatabricksException as e:
+ if e.error_code == "RESOURCE_ALREADY_EXISTS":
+ raise FileExistsError(e.message)
+
+ raise e
def _close_handle(self, handle):
"""
@@ -185,7 +310,13 @@ class DatabricksFileSystem(AbstractFileSystem):
handle: str
Which handle to close.
"""
- pass
+ try:
+ self._send_to_api(method="post", endpoint="close", json={"handle": handle})
+ except DatabricksException as e:
+ if e.error_code == "RESOURCE_DOES_NOT_EXIST":
+ raise FileNotFoundError(e.message)
+
+ raise e
def _add_data(self, handle, data):
"""
@@ -202,7 +333,20 @@ class DatabricksFileSystem(AbstractFileSystem):
data: bytes
Block of data to add to the handle.
"""
- pass
+ data = base64.b64encode(data).decode()
+ try:
+ self._send_to_api(
+ method="post",
+ endpoint="add-block",
+ json={"handle": handle, "data": data},
+ )
+ except DatabricksException as e:
+ if e.error_code == "RESOURCE_DOES_NOT_EXIST":
+ raise FileNotFoundError(e.message)
+ elif e.error_code == "MAX_BLOCK_SIZE_EXCEEDED":
+ raise ValueError(e.message)
+
+ raise e
def _get_data(self, path, start, end):
"""
@@ -219,41 +363,105 @@ class DatabricksFileSystem(AbstractFileSystem):
end: int
End position of the block
"""
- pass
+ try:
+ r = self._send_to_api(
+ method="get",
+ endpoint="read",
+ json={"path": path, "offset": start, "length": end - start},
+ )
+ return base64.b64decode(r["data"])
+ except DatabricksException as e:
+ if e.error_code == "RESOURCE_DOES_NOT_EXIST":
+ raise FileNotFoundError(e.message)
+ elif e.error_code in ["INVALID_PARAMETER_VALUE", "MAX_READ_SIZE_EXCEEDED"]:
+ raise ValueError(e.message)
+
+ raise e
+
+ def invalidate_cache(self, path=None):
+ if path is None:
+ self.dircache.clear()
+ else:
+ self.dircache.pop(path, None)
+ super().invalidate_cache(path)
class DatabricksFile(AbstractBufferedFile):
"""
Helper class for files referenced in the DatabricksFileSystem.
"""
- DEFAULT_BLOCK_SIZE = 1 * 2 ** 20
- def __init__(self, fs, path, mode='rb', block_size='default',
- autocommit=True, cache_type='readahead', cache_options=None, **kwargs):
+ DEFAULT_BLOCK_SIZE = 1 * 2**20 # only allowed block size
+
+ def __init__(
+ self,
+ fs,
+ path,
+ mode="rb",
+ block_size="default",
+ autocommit=True,
+ cache_type="readahead",
+ cache_options=None,
+ **kwargs,
+ ):
"""
Create a new instance of the DatabricksFile.
The blocksize needs to be the default one.
"""
- if block_size is None or block_size == 'default':
+ if block_size is None or block_size == "default":
block_size = self.DEFAULT_BLOCK_SIZE
- assert block_size == self.DEFAULT_BLOCK_SIZE, f'Only the default block size is allowed, not {block_size}'
- super().__init__(fs, path, mode=mode, block_size=block_size,
- autocommit=autocommit, cache_type=cache_type, cache_options=
- cache_options or {}, **kwargs)
+
+ assert (
+ block_size == self.DEFAULT_BLOCK_SIZE
+ ), f"Only the default block size is allowed, not {block_size}"
+
+ super().__init__(
+ fs,
+ path,
+ mode=mode,
+ block_size=block_size,
+ autocommit=autocommit,
+ cache_type=cache_type,
+ cache_options=cache_options or {},
+ **kwargs,
+ )
def _initiate_upload(self):
"""Internal function to start a file upload"""
- pass
+ self.handle = self.fs._create_handle(self.path)
def _upload_chunk(self, final=False):
"""Internal function to add a chunk of data to a started upload"""
- pass
+ self.buffer.seek(0)
+ data = self.buffer.getvalue()
+
+ data_chunks = [
+ data[start:end] for start, end in self._to_sized_blocks(len(data))
+ ]
+
+ for data_chunk in data_chunks:
+ self.fs._add_data(handle=self.handle, data=data_chunk)
+
+ if final:
+ self.fs._close_handle(handle=self.handle)
+ return True
def _fetch_range(self, start, end):
"""Internal function to download a block of data"""
- pass
+ return_buffer = b""
+ length = end - start
+ for chunk_start, chunk_end in self._to_sized_blocks(length, start):
+ return_buffer += self.fs._get_data(
+ path=self.path, start=chunk_start, end=chunk_end
+ )
+
+ return return_buffer
def _to_sized_blocks(self, length, start=0):
"""Helper function to split a range from 0 to total_length into bloksizes"""
- pass
+ end = start + length
+ for data_chunk in range(start, end, self.blocksize):
+ data_start = data_chunk
+ data_end = min(end, data_chunk + self.blocksize)
+ yield data_start, data_end
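
For orientation (not part of the patch): both `_upload_chunk` and `_fetch_range` above route every DBFS request through `_to_sized_blocks`, which caps each call at the fixed 1 MiB block size. A minimal standalone sketch of the same splitting logic:

```python
DEFAULT_BLOCK_SIZE = 1 * 2**20  # 1 MiB, the only block size the DBFS API accepts

def to_sized_blocks(length, start=0, blocksize=DEFAULT_BLOCK_SIZE):
    """Yield (start, end) pairs covering [start, start + length) in blocksize steps."""
    end = start + length
    for chunk_start in range(start, end, blocksize):
        yield chunk_start, min(end, chunk_start + blocksize)

# A 2.5 MiB buffer becomes three add-block/read calls:
print(list(to_sized_blocks(int(2.5 * 2**20))))
# [(0, 1048576), (1048576, 2097152), (2097152, 2621440)]
```
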
diff --git a/fsspec/implementations/dirfs.py b/fsspec/implementations/dirfs.py
index 08a20a1..04f7479 100644
--- a/fsspec/implementations/dirfs.py
+++ b/fsspec/implementations/dirfs.py
@@ -9,10 +9,18 @@ class DirFileSystem(AsyncFileSystem):
is relative to the `path`. After performing the necessary paths operation it
delegates everything to the wrapped filesystem.
"""
- protocol = 'dir'
- def __init__(self, path=None, fs=None, fo=None, target_protocol=None,
- target_options=None, **storage_options):
+ protocol = "dir"
+
+ def __init__(
+ self,
+ path=None,
+ fs=None,
+ fo=None,
+ target_protocol=None,
+ target_options=None,
+ **storage_options,
+ ):
"""
Parameters
----------
@@ -27,18 +35,332 @@ class DirFileSystem(AsyncFileSystem):
"""
super().__init__(**storage_options)
if fs is None:
- fs = filesystem(protocol=target_protocol, **target_options or {})
+ fs = filesystem(protocol=target_protocol, **(target_options or {}))
if (path is not None) ^ (fo is not None) is False:
- raise ValueError('Provide path or fo, not both')
+ raise ValueError("Provide path or fo, not both")
path = path or fo
+
if self.asynchronous and not fs.async_impl:
raise ValueError("can't use asynchronous with non-async fs")
+
if fs.async_impl and self.asynchronous != fs.asynchronous:
- raise ValueError(
- 'both dirfs and fs should be in the same sync/async mode')
+ raise ValueError("both dirfs and fs should be in the same sync/async mode")
+
self.path = fs._strip_protocol(path)
self.fs = fs
+ def _join(self, path):
+ if isinstance(path, str):
+ if not self.path:
+ return path
+ if not path:
+ return self.path
+ return self.fs.sep.join((self.path, self._strip_protocol(path)))
+ if isinstance(path, dict):
+ return {self._join(_path): value for _path, value in path.items()}
+ return [self._join(_path) for _path in path]
+
+ def _relpath(self, path):
+ if isinstance(path, str):
+ if not self.path:
+ return path
+ if path == self.path:
+ return ""
+ prefix = self.path + self.fs.sep
+ assert path.startswith(prefix)
+ return path[len(prefix) :]
+ return [self._relpath(_path) for _path in path]
+
+ # Wrappers below
+
+ @property
+ def sep(self):
+ return self.fs.sep
+
+ async def set_session(self, *args, **kwargs):
+ return await self.fs.set_session(*args, **kwargs)
+
+ async def _rm_file(self, path, **kwargs):
+ return await self.fs._rm_file(self._join(path), **kwargs)
+
+ def rm_file(self, path, **kwargs):
+ return self.fs.rm_file(self._join(path), **kwargs)
+
+ async def _rm(self, path, *args, **kwargs):
+ return await self.fs._rm(self._join(path), *args, **kwargs)
+
+ def rm(self, path, *args, **kwargs):
+ return self.fs.rm(self._join(path), *args, **kwargs)
+
+ async def _cp_file(self, path1, path2, **kwargs):
+ return await self.fs._cp_file(self._join(path1), self._join(path2), **kwargs)
+
+ def cp_file(self, path1, path2, **kwargs):
+ return self.fs.cp_file(self._join(path1), self._join(path2), **kwargs)
+
+ async def _copy(
+ self,
+ path1,
+ path2,
+ *args,
+ **kwargs,
+ ):
+ return await self.fs._copy(
+ self._join(path1),
+ self._join(path2),
+ *args,
+ **kwargs,
+ )
+
+ def copy(self, path1, path2, *args, **kwargs):
+ return self.fs.copy(
+ self._join(path1),
+ self._join(path2),
+ *args,
+ **kwargs,
+ )
+
+ async def _pipe(self, path, *args, **kwargs):
+ return await self.fs._pipe(self._join(path), *args, **kwargs)
+
+ def pipe(self, path, *args, **kwargs):
+ return self.fs.pipe(self._join(path), *args, **kwargs)
+
+ async def _pipe_file(self, path, *args, **kwargs):
+ return await self.fs._pipe_file(self._join(path), *args, **kwargs)
+
+ def pipe_file(self, path, *args, **kwargs):
+ return self.fs.pipe_file(self._join(path), *args, **kwargs)
+
+ async def _cat_file(self, path, *args, **kwargs):
+ return await self.fs._cat_file(self._join(path), *args, **kwargs)
+
+ def cat_file(self, path, *args, **kwargs):
+ return self.fs.cat_file(self._join(path), *args, **kwargs)
+
+ async def _cat(self, path, *args, **kwargs):
+ ret = await self.fs._cat(
+ self._join(path),
+ *args,
+ **kwargs,
+ )
+
+ if isinstance(ret, dict):
+ return {self._relpath(key): value for key, value in ret.items()}
+
+ return ret
+
+ def cat(self, path, *args, **kwargs):
+ ret = self.fs.cat(
+ self._join(path),
+ *args,
+ **kwargs,
+ )
+
+ if isinstance(ret, dict):
+ return {self._relpath(key): value for key, value in ret.items()}
+
+ return ret
+
+ async def _put_file(self, lpath, rpath, **kwargs):
+ return await self.fs._put_file(lpath, self._join(rpath), **kwargs)
+
+ def put_file(self, lpath, rpath, **kwargs):
+ return self.fs.put_file(lpath, self._join(rpath), **kwargs)
+
+ async def _put(
+ self,
+ lpath,
+ rpath,
+ *args,
+ **kwargs,
+ ):
+ return await self.fs._put(
+ lpath,
+ self._join(rpath),
+ *args,
+ **kwargs,
+ )
+
+ def put(self, lpath, rpath, *args, **kwargs):
+ return self.fs.put(
+ lpath,
+ self._join(rpath),
+ *args,
+ **kwargs,
+ )
+
+ async def _get_file(self, rpath, lpath, **kwargs):
+ return await self.fs._get_file(self._join(rpath), lpath, **kwargs)
+
+ def get_file(self, rpath, lpath, **kwargs):
+ return self.fs.get_file(self._join(rpath), lpath, **kwargs)
+
+ async def _get(self, rpath, *args, **kwargs):
+ return await self.fs._get(self._join(rpath), *args, **kwargs)
+
+ def get(self, rpath, *args, **kwargs):
+ return self.fs.get(self._join(rpath), *args, **kwargs)
+
+ async def _isfile(self, path):
+ return await self.fs._isfile(self._join(path))
+
+ def isfile(self, path):
+ return self.fs.isfile(self._join(path))
+
+ async def _isdir(self, path):
+ return await self.fs._isdir(self._join(path))
+
+ def isdir(self, path):
+ return self.fs.isdir(self._join(path))
+
+ async def _size(self, path):
+ return await self.fs._size(self._join(path))
+
+ def size(self, path):
+ return self.fs.size(self._join(path))
+
+ async def _exists(self, path):
+ return await self.fs._exists(self._join(path))
+
+ def exists(self, path):
+ return self.fs.exists(self._join(path))
+
+ async def _info(self, path, **kwargs):
+ return await self.fs._info(self._join(path), **kwargs)
+
+ def info(self, path, **kwargs):
+ return self.fs.info(self._join(path), **kwargs)
+
+ async def _ls(self, path, detail=True, **kwargs):
+ ret = (await self.fs._ls(self._join(path), detail=detail, **kwargs)).copy()
+ if detail:
+ out = []
+ for entry in ret:
+ entry = entry.copy()
+ entry["name"] = self._relpath(entry["name"])
+ out.append(entry)
+ return out
+
+ return self._relpath(ret)
+
+ def ls(self, path, detail=True, **kwargs):
+ ret = self.fs.ls(self._join(path), detail=detail, **kwargs).copy()
+ if detail:
+ out = []
+ for entry in ret:
+ entry = entry.copy()
+ entry["name"] = self._relpath(entry["name"])
+ out.append(entry)
+ return out
+
+ return self._relpath(ret)
+
+ async def _walk(self, path, *args, **kwargs):
+ async for root, dirs, files in self.fs._walk(self._join(path), *args, **kwargs):
+ yield self._relpath(root), dirs, files
+
+ def walk(self, path, *args, **kwargs):
+ for root, dirs, files in self.fs.walk(self._join(path), *args, **kwargs):
+ yield self._relpath(root), dirs, files
+
+ async def _glob(self, path, **kwargs):
+ detail = kwargs.get("detail", False)
+ ret = await self.fs._glob(self._join(path), **kwargs)
+ if detail:
+ return {self._relpath(path): info for path, info in ret.items()}
+ return self._relpath(ret)
+
+ def glob(self, path, **kwargs):
+ detail = kwargs.get("detail", False)
+ ret = self.fs.glob(self._join(path), **kwargs)
+ if detail:
+ return {self._relpath(path): info for path, info in ret.items()}
+ return self._relpath(ret)
+
+ async def _du(self, path, *args, **kwargs):
+ total = kwargs.get("total", True)
+ ret = await self.fs._du(self._join(path), *args, **kwargs)
+ if total:
+ return ret
+
+ return {self._relpath(path): size for path, size in ret.items()}
+
+ def du(self, path, *args, **kwargs):
+ total = kwargs.get("total", True)
+ ret = self.fs.du(self._join(path), *args, **kwargs)
+ if total:
+ return ret
+
+ return {self._relpath(path): size for path, size in ret.items()}
+
+ async def _find(self, path, *args, **kwargs):
+ detail = kwargs.get("detail", False)
+ ret = await self.fs._find(self._join(path), *args, **kwargs)
+ if detail:
+ return {self._relpath(path): info for path, info in ret.items()}
+ return self._relpath(ret)
+
+ def find(self, path, *args, **kwargs):
+ detail = kwargs.get("detail", False)
+ ret = self.fs.find(self._join(path), *args, **kwargs)
+ if detail:
+ return {self._relpath(path): info for path, info in ret.items()}
+ return self._relpath(ret)
+
+ async def _expand_path(self, path, *args, **kwargs):
+ return self._relpath(
+ await self.fs._expand_path(self._join(path), *args, **kwargs)
+ )
+
+ def expand_path(self, path, *args, **kwargs):
+ return self._relpath(self.fs.expand_path(self._join(path), *args, **kwargs))
+
+ async def _mkdir(self, path, *args, **kwargs):
+ return await self.fs._mkdir(self._join(path), *args, **kwargs)
+
+ def mkdir(self, path, *args, **kwargs):
+ return self.fs.mkdir(self._join(path), *args, **kwargs)
+
+ async def _makedirs(self, path, *args, **kwargs):
+ return await self.fs._makedirs(self._join(path), *args, **kwargs)
+
+ def makedirs(self, path, *args, **kwargs):
+ return self.fs.makedirs(self._join(path), *args, **kwargs)
+
+ def rmdir(self, path):
+ return self.fs.rmdir(self._join(path))
+
+ def mv(self, path1, path2, **kwargs):
+ return self.fs.mv(
+ self._join(path1),
+ self._join(path2),
+ **kwargs,
+ )
+
+ def touch(self, path, **kwargs):
+ return self.fs.touch(self._join(path), **kwargs)
+
+ def created(self, path):
+ return self.fs.created(self._join(path))
+
+ def modified(self, path):
+ return self.fs.modified(self._join(path))
+
+ def sign(self, path, *args, **kwargs):
+ return self.fs.sign(self._join(path), *args, **kwargs)
+
def __repr__(self):
- return (
- f"{self.__class__.__qualname__}(path='{self.path}', fs={self.fs})")
+ return f"{self.__class__.__qualname__}(path='{self.path}', fs={self.fs})"
+
+ def open(
+ self,
+ path,
+ *args,
+ **kwargs,
+ ):
+ return self.fs.open(
+ self._join(path),
+ *args,
+ **kwargs,
+ )
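
A minimal usage sketch of the prefixing behaviour implemented above; the `/tmp/data` root and file names are placeholders, not values from the patch:

```python
import fsspec
from fsspec.implementations.dirfs import DirFileSystem

# Every path handed to the wrapper is prefixed with the root via _join()
# before delegation, and results are mapped back through _relpath().
dirfs = DirFileSystem(path="/tmp/data", fs=fsspec.filesystem("file"))

dirfs.makedirs("project/raw", exist_ok=True)    # acts on /tmp/data/project/raw
dirfs.pipe_file("project/raw/a.txt", b"hello")
print(dirfs.ls("project/raw", detail=False))    # names come back relative to the root
```
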
diff --git a/fsspec/implementations/ftp.py b/fsspec/implementations/ftp.py
index 0658887..415f484 100644
--- a/fsspec/implementations/ftp.py
+++ b/fsspec/implementations/ftp.py
@@ -4,19 +4,31 @@ import uuid
import warnings
from ftplib import FTP, Error, error_perm
from typing import Any
+
from ..spec import AbstractBufferedFile, AbstractFileSystem
from ..utils import infer_storage_options, isfilelike
class FTPFileSystem(AbstractFileSystem):
"""A filesystem over classic FTP"""
- root_marker = '/'
+
+ root_marker = "/"
cachable = False
- protocol = 'ftp'
+ protocol = "ftp"
- def __init__(self, host, port=21, username=None, password=None, acct=
- None, block_size=None, tempdir=None, timeout=30, encoding='utf-8',
- **kwargs):
+ def __init__(
+ self,
+ host,
+ port=21,
+ username=None,
+ password=None,
+ acct=None,
+ block_size=None,
+ tempdir=None,
+ timeout=30,
+ encoding="utf-8",
+ **kwargs,
+ ):
"""
You can use _get_kwargs_from_urls to get some kwargs from
a reasonable FTP url.
@@ -48,36 +60,243 @@ class FTPFileSystem(AbstractFileSystem):
super().__init__(**kwargs)
self.host = host
self.port = port
- self.tempdir = tempdir or '/tmp'
+ self.tempdir = tempdir or "/tmp"
self.cred = username, password, acct
self.timeout = timeout
self.encoding = encoding
if block_size is not None:
self.blocksize = block_size
else:
- self.blocksize = 2 ** 16
+ self.blocksize = 2**16
self._connect()
+ def _connect(self):
+ if sys.version_info >= (3, 9):
+ self.ftp = FTP(timeout=self.timeout, encoding=self.encoding)
+ elif self.encoding:
+ warnings.warn("`encoding` not supported for python<3.9, ignoring")
+ self.ftp = FTP(timeout=self.timeout)
+ else:
+ self.ftp = FTP(timeout=self.timeout)
+ self.ftp.connect(self.host, self.port)
+ self.ftp.login(*self.cred)
+
+ @classmethod
+ def _strip_protocol(cls, path):
+ return "/" + infer_storage_options(path)["path"].lstrip("/").rstrip("/")
+
+ @staticmethod
+ def _get_kwargs_from_urls(urlpath):
+ out = infer_storage_options(urlpath)
+ out.pop("path", None)
+ out.pop("protocol", None)
+ return out
+
+ def ls(self, path, detail=True, **kwargs):
+ path = self._strip_protocol(path)
+ out = []
+ if path not in self.dircache:
+ try:
+ try:
+ out = [
+ (fn, details)
+ for (fn, details) in self.ftp.mlsd(path)
+ if fn not in [".", ".."]
+ and details["type"] not in ["pdir", "cdir"]
+ ]
+ except error_perm:
+ out = _mlsd2(self.ftp, path) # Not platform independent
+ for fn, details in out:
+ if path == "/":
+ path = "" # just for forming the names, below
+ details["name"] = "/".join([path, fn.lstrip("/")])
+ if details["type"] == "file":
+ details["size"] = int(details["size"])
+ else:
+ details["size"] = 0
+ if details["type"] == "dir":
+ details["type"] = "directory"
+ self.dircache[path] = out
+ except Error:
+ try:
+ info = self.info(path)
+ if info["type"] == "file":
+ out = [(path, info)]
+ except (Error, IndexError):
+ raise FileNotFoundError(path)
+ files = self.dircache.get(path, out)
+ if not detail:
+ return sorted([fn for fn, details in files])
+ return [details for fn, details in files]
+
+ def info(self, path, **kwargs):
+ # implement with direct method
+ path = self._strip_protocol(path)
+ if path == "/":
+ # special case, since this dir has no real entry
+ return {"name": "/", "size": 0, "type": "directory"}
+ files = self.ls(self._parent(path).lstrip("/"), True)
+ try:
+ out = [f for f in files if f["name"] == path][0]
+ except IndexError:
+ raise FileNotFoundError(path)
+ return out
+
+ def get_file(self, rpath, lpath, **kwargs):
+ if self.isdir(rpath):
+ if not os.path.exists(lpath):
+ os.mkdir(lpath)
+ return
+ if isfilelike(lpath):
+ outfile = lpath
+ else:
+ outfile = open(lpath, "wb")
+
+ def cb(x):
+ outfile.write(x)
+
+ self.ftp.retrbinary(
+ f"RETR {rpath}",
+ blocksize=self.blocksize,
+ callback=cb,
+ )
+ if not isfilelike(lpath):
+ outfile.close()
+
+ def cat_file(self, path, start=None, end=None, **kwargs):
+ if end is not None:
+ return super().cat_file(path, start, end, **kwargs)
+ out = []
+
+ def cb(x):
+ out.append(x)
+
+ try:
+ self.ftp.retrbinary(
+ f"RETR {path}",
+ blocksize=self.blocksize,
+ rest=start,
+ callback=cb,
+ )
+ except (Error, error_perm) as orig_exc:
+ raise FileNotFoundError(path) from orig_exc
+ return b"".join(out)
+
+ def _open(
+ self,
+ path,
+ mode="rb",
+ block_size=None,
+ cache_options=None,
+ autocommit=True,
+ **kwargs,
+ ):
+ path = self._strip_protocol(path)
+ block_size = block_size or self.blocksize
+ return FTPFile(
+ self,
+ path,
+ mode=mode,
+ block_size=block_size,
+ tempdir=self.tempdir,
+ autocommit=autocommit,
+ cache_options=cache_options,
+ )
+
+ def _rm(self, path):
+ path = self._strip_protocol(path)
+ self.ftp.delete(path)
+ self.invalidate_cache(self._parent(path))
+
+ def rm(self, path, recursive=False, maxdepth=None):
+ paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
+ for p in reversed(paths):
+ if self.isfile(p):
+ self.rm_file(p)
+ else:
+ self.rmdir(p)
+
+ def mkdir(self, path: str, create_parents: bool = True, **kwargs: Any) -> None:
+ path = self._strip_protocol(path)
+ parent = self._parent(path)
+ if parent != self.root_marker and not self.exists(parent) and create_parents:
+ self.mkdir(parent, create_parents=create_parents)
+
+ self.ftp.mkd(path)
+ self.invalidate_cache(self._parent(path))
+
+ def makedirs(self, path: str, exist_ok: bool = False) -> None:
+ path = self._strip_protocol(path)
+ if self.exists(path):
+ # NB: "/" does not "exist" as it has no directory entry
+ if not exist_ok:
+ raise FileExistsError(f"{path} exists without `exist_ok`")
+            # exist_ok=True -> no-op
+ else:
+ self.mkdir(path, create_parents=True)
+
+ def rmdir(self, path):
+ path = self._strip_protocol(path)
+ self.ftp.rmd(path)
+ self.invalidate_cache(self._parent(path))
+
+ def mv(self, path1, path2, **kwargs):
+ path1 = self._strip_protocol(path1)
+ path2 = self._strip_protocol(path2)
+ self.ftp.rename(path1, path2)
+ self.invalidate_cache(self._parent(path1))
+ self.invalidate_cache(self._parent(path2))
+
def __del__(self):
self.ftp.close()
+ def invalidate_cache(self, path=None):
+ if path is None:
+ self.dircache.clear()
+ else:
+ self.dircache.pop(path, None)
+ super().invalidate_cache(path)
+
class TransferDone(Exception):
"""Internal exception to break out of transfer"""
+
pass
class FTPFile(AbstractBufferedFile):
"""Interact with a remote FTP file with read/write buffering"""
- def __init__(self, fs, path, mode='rb', block_size='default',
- autocommit=True, cache_type='readahead', cache_options=None, **kwargs):
- super().__init__(fs, path, mode=mode, block_size=block_size,
- autocommit=autocommit, cache_type=cache_type, cache_options=
- cache_options, **kwargs)
+ def __init__(
+ self,
+ fs,
+ path,
+ mode="rb",
+ block_size="default",
+ autocommit=True,
+ cache_type="readahead",
+ cache_options=None,
+ **kwargs,
+ ):
+ super().__init__(
+ fs,
+ path,
+ mode=mode,
+ block_size=block_size,
+ autocommit=autocommit,
+ cache_type=cache_type,
+ cache_options=cache_options,
+ **kwargs,
+ )
if not autocommit:
self.target = self.path
- self.path = '/'.join([kwargs['tempdir'], str(uuid.uuid4())])
+ self.path = "/".join([kwargs["tempdir"], str(uuid.uuid4())])
+
+ def commit(self):
+ self.fs.mv(self.path, self.target)
+
+ def discard(self):
+ self.fs.rm(self.path)
def _fetch_range(self, start, end):
"""Get bytes between given byte limits
@@ -88,10 +307,47 @@ class FTPFile(AbstractBufferedFile):
Will fail if the server does not respect the REST command on
retrieve requests.
"""
- pass
+ out = []
+ total = [0]
+
+ def callback(x):
+ total[0] += len(x)
+ if total[0] > end - start:
+ out.append(x[: (end - start) - total[0]])
+ if end < self.size:
+ raise TransferDone
+ else:
+ out.append(x)
+
+ if total[0] == end - start and end < self.size:
+ raise TransferDone
+
+ try:
+ self.fs.ftp.retrbinary(
+ f"RETR {self.path}",
+ blocksize=self.blocksize,
+ rest=start,
+ callback=callback,
+ )
+ except TransferDone:
+ try:
+ # stop transfer, we got enough bytes for this block
+ self.fs.ftp.abort()
+ self.fs.ftp.getmultiline()
+ except Error:
+ self.fs._connect()
+ return b"".join(out)
-def _mlsd2(ftp, path='.'):
+ def _upload_chunk(self, final=False):
+ self.buffer.seek(0)
+ self.fs.ftp.storbinary(
+ f"STOR {self.path}", self.buffer, blocksize=self.blocksize, rest=self.offset
+ )
+ return True
+
+
+def _mlsd2(ftp, path="."):
"""
Fall back to using `dir` instead of `mlsd` if not supported.
@@ -104,4 +360,26 @@ def _mlsd2(ftp, path='.'):
path: str
        Expects to be given a path, but defaults to ".".
"""
- pass
+ lines = []
+ minfo = []
+ ftp.dir(path, lines.append)
+ for line in lines:
+ split_line = line.split()
+ if len(split_line) < 9:
+ continue
+ this = (
+ split_line[-1],
+ {
+ "modify": " ".join(split_line[5:8]),
+ "unix.owner": split_line[2],
+ "unix.group": split_line[3],
+ "unix.mode": split_line[0],
+ "size": split_line[4],
+ },
+ )
+ if "d" == this[1]["unix.mode"][0]:
+ this[1]["type"] = "dir"
+ else:
+ this[1]["type"] = "file"
+ minfo.append(this)
+ return minfo
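
A hedged usage sketch of the FTP backend as implemented above; the hostname is a placeholder, not a real test server:

```python
from fsspec.implementations.ftp import FTPFileSystem

fs = FTPFileSystem(host="ftp.example.org", timeout=30)   # anonymous login

print(fs.ls("/", detail=False))      # MLSD listing, with _mlsd2/`dir` as the fallback
info = fs.info("/README")            # looked up from the parent directory listing
with fs.open("/README", block_size=2**16) as f:
    head = f.read(100)               # FTPFile fetches ranges via RETR with a REST offset
```
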
diff --git a/fsspec/implementations/git.py b/fsspec/implementations/git.py
index 760da5b..7c34d93 100644
--- a/fsspec/implementations/git.py
+++ b/fsspec/implementations/git.py
@@ -1,6 +1,9 @@
import os
+
import pygit2
+
from fsspec.spec import AbstractFileSystem
+
from .memory import MemoryFile
@@ -9,7 +12,8 @@ class GitFileSystem(AbstractFileSystem):
(experimental backend)
"""
- root_marker = ''
+
+ root_marker = ""
cachable = True
def __init__(self, path=None, fo=None, ref=None, **kwargs):
@@ -34,4 +38,90 @@ class GitFileSystem(AbstractFileSystem):
"""
super().__init__(**kwargs)
self.repo = pygit2.Repository(fo or path or os.getcwd())
- self.ref = ref or 'master'
+ self.ref = ref or "master"
+
+ @classmethod
+ def _strip_protocol(cls, path):
+ path = super()._strip_protocol(path).lstrip("/")
+ if ":" in path:
+ path = path.split(":", 1)[1]
+ if "@" in path:
+ path = path.split("@", 1)[1]
+ return path.lstrip("/")
+
+ def _path_to_object(self, path, ref):
+ comm, ref = self.repo.resolve_refish(ref or self.ref)
+ parts = path.split("/")
+ tree = comm.tree
+ for part in parts:
+ if part and isinstance(tree, pygit2.Tree):
+ tree = tree[part]
+ return tree
+
+ @staticmethod
+ def _get_kwargs_from_urls(path):
+ if path.startswith("git://"):
+ path = path[6:]
+ out = {}
+ if ":" in path:
+ out["path"], path = path.split(":", 1)
+ if "@" in path:
+ out["ref"], path = path.split("@", 1)
+ return out
+
+ def ls(self, path, detail=True, ref=None, **kwargs):
+ path = self._strip_protocol(path)
+ tree = self._path_to_object(path, ref)
+ if isinstance(tree, pygit2.Tree):
+ out = []
+ for obj in tree:
+ if isinstance(obj, pygit2.Tree):
+ out.append(
+ {
+ "type": "directory",
+ "name": "/".join([path, obj.name]).lstrip("/"),
+ "hex": obj.hex,
+ "mode": f"{obj.filemode:o}",
+ "size": 0,
+ }
+ )
+ else:
+ out.append(
+ {
+ "type": "file",
+ "name": "/".join([path, obj.name]).lstrip("/"),
+ "hex": obj.hex,
+ "mode": f"{obj.filemode:o}",
+ "size": obj.size,
+ }
+ )
+ else:
+ obj = tree
+ out = [
+ {
+ "type": "file",
+ "name": obj.name,
+ "hex": obj.hex,
+ "mode": f"{obj.filemode:o}",
+ "size": obj.size,
+ }
+ ]
+ if detail:
+ return out
+ return [o["name"] for o in out]
+
+ def ukey(self, path, ref=None):
+ return self.info(path, ref=ref)["hex"]
+
+ def _open(
+ self,
+ path,
+ mode="rb",
+ block_size=None,
+ autocommit=True,
+ cache_options=None,
+ ref=None,
+ **kwargs,
+ ):
+ obj = self._path_to_object(path, ref or self.ref)
+ return MemoryFile(data=obj.data)
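
For orientation, a minimal sketch of the git backend added above. It assumes pygit2 is installed, that the protocol is registered as `git`, and that `/path/to/repo` and the `main` ref are stand-ins for a repository you actually have:

```python
import fsspec

fs = fsspec.filesystem("git", path="/path/to/repo", ref="main")

print(fs.ls("", detail=False))       # entries of the commit tree pointed to by `ref`
with fs.open("README.md") as f:      # contents are returned as an in-memory MemoryFile
    first_line = f.readline()
```
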
diff --git a/fsspec/implementations/github.py b/fsspec/implementations/github.py
index 27f9ccd..3650b8e 100644
--- a/fsspec/implementations/github.py
+++ b/fsspec/implementations/github.py
@@ -1,9 +1,13 @@
import requests
+
import fsspec
+
from ..spec import AbstractFileSystem
from ..utils import infer_storage_options
from .memory import MemoryFile
+# TODO: add GIST backend, would be very similar
+
class GithubFileSystem(AbstractFileSystem):
"""Interface to files in github
@@ -30,30 +34,41 @@ class GithubFileSystem(AbstractFileSystem):
For authorised access, you must provide username and token, which can be made
at https://github.com/settings/tokens
"""
- url = 'https://api.github.com/repos/{org}/{repo}/git/trees/{sha}'
- rurl = 'https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}'
- protocol = 'github'
- timeout = 60, 60
- def __init__(self, org, repo, sha=None, username=None, token=None,
- timeout=None, **kwargs):
+ url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
+ rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}"
+ protocol = "github"
+ timeout = (60, 60) # connect, read timeouts
+
+ def __init__(
+ self, org, repo, sha=None, username=None, token=None, timeout=None, **kwargs
+ ):
super().__init__(**kwargs)
self.org = org
self.repo = repo
if (username is None) ^ (token is None):
- raise ValueError('Auth required both username and token')
+ raise ValueError("Auth required both username and token")
self.username = username
self.token = token
if timeout is not None:
self.timeout = timeout
if sha is None:
- u = 'https://api.github.com/repos/{org}/{repo}'
- r = requests.get(u.format(org=org, repo=repo), timeout=self.
- timeout, **self.kw)
+ # look up default branch (not necessarily "master")
+ u = "https://api.github.com/repos/{org}/{repo}"
+ r = requests.get(
+ u.format(org=org, repo=repo), timeout=self.timeout, **self.kw
+ )
r.raise_for_status()
- sha = r.json()['default_branch']
+ sha = r.json()["default_branch"]
+
self.root = sha
- self.ls('')
+ self.ls("")
+
+ @property
+ def kw(self):
+ if self.username:
+ return {"auth": (self.username, self.token)}
+ return {}
@classmethod
def repos(cls, org_or_user, is_org=True):
@@ -72,22 +87,39 @@ class GithubFileSystem(AbstractFileSystem):
-------
List of string
"""
- pass
+ r = requests.get(
+ f"https://api.github.com/{['users', 'orgs'][is_org]}/{org_or_user}/repos",
+ timeout=cls.timeout,
+ )
+ r.raise_for_status()
+ return [repo["name"] for repo in r.json()]
@property
def tags(self):
"""Names of tags in the repo"""
- pass
+ r = requests.get(
+ f"https://api.github.com/repos/{self.org}/{self.repo}/tags",
+ timeout=self.timeout,
+ **self.kw,
+ )
+ r.raise_for_status()
+ return [t["name"] for t in r.json()]
@property
def branches(self):
"""Names of branches in the repo"""
- pass
+ r = requests.get(
+ f"https://api.github.com/repos/{self.org}/{self.repo}/branches",
+ timeout=self.timeout,
+ **self.kw,
+ )
+ r.raise_for_status()
+ return [t["name"] for t in r.json()]
@property
def refs(self):
"""Named references, tags and branches"""
- pass
+ return {"tags": self.tags, "branches": self.branches}
def ls(self, path, detail=False, sha=None, _sha=None, **kwargs):
"""List files at given path
@@ -105,4 +137,103 @@ class GithubFileSystem(AbstractFileSystem):
_sha: str (optional)
List this specific tree object (used internally to descend into trees)
"""
- pass
+ path = self._strip_protocol(path)
+ if path == "":
+ _sha = sha or self.root
+ if _sha is None:
+ parts = path.rstrip("/").split("/")
+ so_far = ""
+ _sha = sha or self.root
+ for part in parts:
+ out = self.ls(so_far, True, sha=sha, _sha=_sha)
+ so_far += "/" + part if so_far else part
+ out = [o for o in out if o["name"] == so_far]
+ if not out:
+ raise FileNotFoundError(path)
+ out = out[0]
+ if out["type"] == "file":
+ if detail:
+ return [out]
+ else:
+ return path
+ _sha = out["sha"]
+ if path not in self.dircache or sha not in [self.root, None]:
+ r = requests.get(
+ self.url.format(org=self.org, repo=self.repo, sha=_sha),
+ timeout=self.timeout,
+ **self.kw,
+ )
+ if r.status_code == 404:
+ raise FileNotFoundError(path)
+ r.raise_for_status()
+ types = {"blob": "file", "tree": "directory"}
+ out = [
+ {
+ "name": path + "/" + f["path"] if path else f["path"],
+ "mode": f["mode"],
+ "type": types[f["type"]],
+ "size": f.get("size", 0),
+ "sha": f["sha"],
+ }
+ for f in r.json()["tree"]
+ if f["type"] in types
+ ]
+ if sha in [self.root, None]:
+ self.dircache[path] = out
+ else:
+ out = self.dircache[path]
+ if detail:
+ return out
+ else:
+ return sorted([f["name"] for f in out])
+
+ def invalidate_cache(self, path=None):
+ self.dircache.clear()
+
+ @classmethod
+ def _strip_protocol(cls, path):
+ opts = infer_storage_options(path)
+ if "username" not in opts:
+ return super()._strip_protocol(path)
+ return opts["path"].lstrip("/")
+
+ @staticmethod
+ def _get_kwargs_from_urls(path):
+ opts = infer_storage_options(path)
+ if "username" not in opts:
+ return {}
+ out = {"org": opts["username"], "repo": opts["password"]}
+ if opts["host"]:
+ out["sha"] = opts["host"]
+ return out
+
+ def _open(
+ self,
+ path,
+ mode="rb",
+ block_size=None,
+ autocommit=True,
+ cache_options=None,
+ sha=None,
+ **kwargs,
+ ):
+ if mode != "rb":
+ raise NotImplementedError
+ url = self.rurl.format(
+ org=self.org, repo=self.repo, path=path, sha=sha or self.root
+ )
+ r = requests.get(url, timeout=self.timeout, **self.kw)
+ if r.status_code == 404:
+ raise FileNotFoundError(path)
+ r.raise_for_status()
+ return MemoryFile(None, None, r.content)
+
+ def cat(self, path, recursive=False, on_error="raise", **kwargs):
+ paths = self.expand_path(path, recursive=recursive)
+ urls = [
+ self.rurl.format(org=self.org, repo=self.repo, path=u, sha=self.root)
+ for u, sh in paths
+ ]
+ fs = fsspec.filesystem("http")
+ data = fs.cat(urls, on_error="return")
+ return {u: v for ((k, v), u) in zip(data.items(), urls)}
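
A read-only usage sketch of the GitHub backend above; `fsspec/filesystem_spec` is used purely as an example target, and anonymous access is rate-limited:

```python
import fsspec

fs = fsspec.filesystem("github", org="fsspec", repo="filesystem_spec")

print(fs.ls("", detail=False)[:5])   # top-level tree of the default branch
print(fs.refs["branches"][:3])       # branch names via the GitHub API
with fs.open("README.md") as f:      # served from raw.githubusercontent.com
    print(f.read(60))
```
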
diff --git a/fsspec/implementations/http.py b/fsspec/implementations/http.py
index 94a6f71..c9ab177 100644
--- a/fsspec/implementations/http.py
+++ b/fsspec/implementations/http.py
@@ -5,17 +5,32 @@ import re
import weakref
from copy import copy
from urllib.parse import urlparse
+
import aiohttp
import yarl
+
from fsspec.asyn import AbstractAsyncStreamedFile, AsyncFileSystem, sync, sync_wrapper
from fsspec.callbacks import DEFAULT_CALLBACK
from fsspec.exceptions import FSTimeoutError
from fsspec.spec import AbstractBufferedFile
-from fsspec.utils import DEFAULT_BLOCK_SIZE, glob_translate, isfilelike, nullcontext, tokenize
+from fsspec.utils import (
+ DEFAULT_BLOCK_SIZE,
+ glob_translate,
+ isfilelike,
+ nullcontext,
+ tokenize,
+)
+
from ..caching import AllBytes
-ex = re.compile('<(a|A)\\s+(?:[^>]*?\\s+)?(href|HREF)=["\'](?P<url>[^"\']+)')
-ex2 = re.compile('(?P<url>http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)')
-logger = logging.getLogger('fsspec.http')
+
+# https://stackoverflow.com/a/15926317/3821154
+ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P<url>[^"']+)""")
+ex2 = re.compile(r"""(?P<url>http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
+logger = logging.getLogger("fsspec.http")
+
+
+async def get_client(**kwargs):
+ return aiohttp.ClientSession(**kwargs)
class HTTPFileSystem(AsyncFileSystem):
@@ -27,12 +42,24 @@ class HTTPFileSystem(AsyncFileSystem):
"http(s)://server.com/stuff?thing=other"; otherwise only links within
HTML href tags will be used.
"""
- sep = '/'
- def __init__(self, simple_links=True, block_size=None, same_scheme=True,
- size_policy=None, cache_type='bytes', cache_options=None,
- asynchronous=False, loop=None, client_kwargs=None, get_client=
- get_client, encoded=False, **storage_options):
+ sep = "/"
+
+ def __init__(
+ self,
+ simple_links=True,
+ block_size=None,
+ same_scheme=True,
+ size_policy=None,
+ cache_type="bytes",
+ cache_options=None,
+ asynchronous=False,
+ loop=None,
+ client_kwargs=None,
+ get_client=get_client,
+ encoded=False,
+ **storage_options,
+ ):
"""
        NB: if this is called async, you must await set_session
@@ -60,10 +87,8 @@ class HTTPFileSystem(AsyncFileSystem):
Any other parameters passed on to requests
cache_type, cache_options: defaults used in open
"""
- super().__init__(self, asynchronous=asynchronous, loop=loop, **
- storage_options)
- self.block_size = (block_size if block_size is not None else
- DEFAULT_BLOCK_SIZE)
+ super().__init__(self, asynchronous=asynchronous, loop=loop, **storage_options)
+ self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
self.simple_links = simple_links
self.same_schema = same_scheme
self.cache_type = cache_type
@@ -73,28 +98,246 @@ class HTTPFileSystem(AsyncFileSystem):
self.encoded = encoded
self.kwargs = storage_options
self._session = None
+
+ # Clean caching-related parameters from `storage_options`
+ # before propagating them as `request_options` through `self.kwargs`.
+ # TODO: Maybe rename `self.kwargs` to `self.request_options` to make
+ # it clearer.
request_options = copy(storage_options)
- self.use_listings_cache = request_options.pop('use_listings_cache',
- False)
- request_options.pop('listings_expiry_time', None)
- request_options.pop('max_paths', None)
- request_options.pop('skip_instance_cache', None)
+ self.use_listings_cache = request_options.pop("use_listings_cache", False)
+ request_options.pop("listings_expiry_time", None)
+ request_options.pop("max_paths", None)
+ request_options.pop("skip_instance_cache", None)
self.kwargs = request_options
+ @property
+ def fsid(self):
+ return "http"
+
+ def encode_url(self, url):
+ return yarl.URL(url, encoded=self.encoded)
+
+ @staticmethod
+ def close_session(loop, session):
+ if loop is not None and loop.is_running():
+ try:
+ sync(loop, session.close, timeout=0.1)
+ return
+ except (TimeoutError, FSTimeoutError, NotImplementedError):
+ pass
+ connector = getattr(session, "_connector", None)
+ if connector is not None:
+ # close after loop is dead
+ connector._close()
+
+ async def set_session(self):
+ if self._session is None:
+ self._session = await self.get_client(loop=self.loop, **self.client_kwargs)
+ if not self.asynchronous:
+ weakref.finalize(self, self.close_session, self.loop, self._session)
+ return self._session
+
@classmethod
def _strip_protocol(cls, path):
"""For HTTP, we always want to keep the full URL"""
- pass
+ return path
+
+ @classmethod
+ def _parent(cls, path):
+ # override, since _strip_protocol is different for URLs
+ par = super()._parent(path)
+ if len(par) > 7: # "http://..."
+ return par
+ return ""
+
+ async def _ls_real(self, url, detail=True, **kwargs):
+ # ignoring URL-encoded arguments
+ kw = self.kwargs.copy()
+ kw.update(kwargs)
+ logger.debug(url)
+ session = await self.set_session()
+ async with session.get(self.encode_url(url), **self.kwargs) as r:
+ self._raise_not_found_for_status(r, url)
+ try:
+ text = await r.text()
+ if self.simple_links:
+ links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
+ else:
+ links = [u[2] for u in ex.findall(text)]
+ except UnicodeDecodeError:
+ links = [] # binary, not HTML
+ out = set()
+ parts = urlparse(url)
+ for l in links:
+ if isinstance(l, tuple):
+ l = l[1]
+ if l.startswith("/") and len(l) > 1:
+ # absolute URL on this server
+ l = f"{parts.scheme}://{parts.netloc}{l}"
+ if l.startswith("http"):
+ if self.same_schema and l.startswith(url.rstrip("/") + "/"):
+ out.add(l)
+ elif l.replace("https", "http").startswith(
+ url.replace("https", "http").rstrip("/") + "/"
+ ):
+ # allowed to cross http <-> https
+ out.add(l)
+ else:
+ if l not in ["..", "../"]:
+ # Ignore FTP-like "parent"
+ out.add("/".join([url.rstrip("/"), l.lstrip("/")]))
+ if not out and url.endswith("/"):
+ out = await self._ls_real(url.rstrip("/"), detail=False)
+ if detail:
+ return [
+ {
+ "name": u,
+ "size": None,
+ "type": "directory" if u.endswith("/") else "file",
+ }
+ for u in out
+ ]
+ else:
+ return sorted(out)
+
+ async def _ls(self, url, detail=True, **kwargs):
+ if self.use_listings_cache and url in self.dircache:
+ out = self.dircache[url]
+ else:
+ out = await self._ls_real(url, detail=detail, **kwargs)
+ self.dircache[url] = out
+ return out
+
ls = sync_wrapper(_ls)
def _raise_not_found_for_status(self, response, url):
"""
Raises FileNotFoundError for 404s, otherwise uses raise_for_status.
"""
- pass
-
- def _open(self, path, mode='rb', block_size=None, autocommit=None,
- cache_type=None, cache_options=None, size=None, **kwargs):
+ if response.status == 404:
+ raise FileNotFoundError(url)
+ response.raise_for_status()
+
+ async def _cat_file(self, url, start=None, end=None, **kwargs):
+ kw = self.kwargs.copy()
+ kw.update(kwargs)
+ logger.debug(url)
+
+ if start is not None or end is not None:
+ if start == end:
+ return b""
+ headers = kw.pop("headers", {}).copy()
+
+ headers["Range"] = await self._process_limits(url, start, end)
+ kw["headers"] = headers
+ session = await self.set_session()
+ async with session.get(self.encode_url(url), **kw) as r:
+ out = await r.read()
+ self._raise_not_found_for_status(r, url)
+ return out
+
+ async def _get_file(
+ self, rpath, lpath, chunk_size=5 * 2**20, callback=DEFAULT_CALLBACK, **kwargs
+ ):
+ kw = self.kwargs.copy()
+ kw.update(kwargs)
+ logger.debug(rpath)
+ session = await self.set_session()
+ async with session.get(self.encode_url(rpath), **kw) as r:
+ try:
+ size = int(r.headers["content-length"])
+ except (ValueError, KeyError):
+ size = None
+
+ callback.set_size(size)
+ self._raise_not_found_for_status(r, rpath)
+ if isfilelike(lpath):
+ outfile = lpath
+ else:
+ outfile = open(lpath, "wb") # noqa: ASYNC101
+
+ try:
+ chunk = True
+ while chunk:
+ chunk = await r.content.read(chunk_size)
+ outfile.write(chunk)
+ callback.relative_update(len(chunk))
+ finally:
+ if not isfilelike(lpath):
+ outfile.close()
+
+ async def _put_file(
+ self,
+ lpath,
+ rpath,
+ chunk_size=5 * 2**20,
+ callback=DEFAULT_CALLBACK,
+ method="post",
+ **kwargs,
+ ):
+ async def gen_chunks():
+ # Support passing arbitrary file-like objects
+ # and use them instead of streams.
+ if isinstance(lpath, io.IOBase):
+ context = nullcontext(lpath)
+ use_seek = False # might not support seeking
+ else:
+ context = open(lpath, "rb") # noqa: ASYNC101
+ use_seek = True
+
+ with context as f:
+ if use_seek:
+ callback.set_size(f.seek(0, 2))
+ f.seek(0)
+ else:
+ callback.set_size(getattr(f, "size", None))
+
+ chunk = f.read(chunk_size)
+ while chunk:
+ yield chunk
+ callback.relative_update(len(chunk))
+ chunk = f.read(chunk_size)
+
+ kw = self.kwargs.copy()
+ kw.update(kwargs)
+ session = await self.set_session()
+
+ method = method.lower()
+ if method not in ("post", "put"):
+ raise ValueError(
+ f"method has to be either 'post' or 'put', not: {method!r}"
+ )
+
+ meth = getattr(session, method)
+ async with meth(self.encode_url(rpath), data=gen_chunks(), **kw) as resp:
+ self._raise_not_found_for_status(resp, rpath)
+
+ async def _exists(self, path, **kwargs):
+ kw = self.kwargs.copy()
+ kw.update(kwargs)
+ try:
+ logger.debug(path)
+ session = await self.set_session()
+ r = await session.get(self.encode_url(path), **kw)
+ async with r:
+ return r.status < 400
+ except aiohttp.ClientError:
+ return False
+
+ async def _isfile(self, path, **kwargs):
+ return await self._exists(path, **kwargs)
+
+ def _open(
+ self,
+ path,
+ mode="rb",
+ block_size=None,
+ autocommit=None, # XXX: This differs from the base class.
+ cache_type=None,
+ cache_options=None,
+ size=None,
+ **kwargs,
+ ):
"""Make a file-like object
Parameters
@@ -109,11 +352,56 @@ class HTTPFileSystem(AsyncFileSystem):
kwargs: key-value
Any other parameters, passed to requests calls
"""
- pass
+ if mode != "rb":
+ raise NotImplementedError
+ block_size = block_size if block_size is not None else self.block_size
+ kw = self.kwargs.copy()
+ kw["asynchronous"] = self.asynchronous
+ kw.update(kwargs)
+ size = size or self.info(path, **kwargs)["size"]
+ session = sync(self.loop, self.set_session)
+ if block_size and size:
+ return HTTPFile(
+ self,
+ path,
+ session=session,
+ block_size=block_size,
+ mode=mode,
+ size=size,
+ cache_type=cache_type or self.cache_type,
+ cache_options=cache_options or self.cache_options,
+ loop=self.loop,
+ **kw,
+ )
+ else:
+ return HTTPStreamFile(
+ self,
+ path,
+ mode=mode,
+ loop=self.loop,
+ session=session,
+ **kw,
+ )
+
+ async def open_async(self, path, mode="rb", size=None, **kwargs):
+ session = await self.set_session()
+ if size is None:
+ try:
+ size = (await self._info(path, **kwargs))["size"]
+ except FileNotFoundError:
+ pass
+ return AsyncStreamFile(
+ self,
+ path,
+ loop=self.loop,
+ session=session,
+ size=size,
+ **kwargs,
+ )
def ukey(self, url):
"""Unique identifier; assume HTTP files are static, unchanging"""
- pass
+ return tokenize(url, self.kwargs, self.protocol)
async def _info(self, url, **kwargs):
"""Get info of URL
@@ -125,7 +413,29 @@ class HTTPFileSystem(AsyncFileSystem):
which case size will be given as None (and certain operations on the
corresponding file will not work).
"""
- pass
+ info = {}
+ session = await self.set_session()
+
+ for policy in ["head", "get"]:
+ try:
+ info.update(
+ await _file_info(
+ self.encode_url(url),
+ size_policy=policy,
+ session=session,
+ **self.kwargs,
+ **kwargs,
+ )
+ )
+ if info.get("size") is not None:
+ break
+ except Exception as exc:
+ if policy == "get":
+ # If get failed, then raise a FileNotFoundError
+ raise FileNotFoundError(url) from exc
+ logger.debug("", exc_info=exc)
+
+ return {"name": url, "size": None, **info, "type": "file"}
async def _glob(self, path, maxdepth=None, **kwargs):
"""
@@ -135,7 +445,77 @@ class HTTPFileSystem(AsyncFileSystem):
but "?" is not considered as a character for globbing, because it is
so common in URLs, often identifying the "query" part.
"""
- pass
+ if maxdepth is not None and maxdepth < 1:
+ raise ValueError("maxdepth must be at least 1")
+ import re
+
+ ends_with_slash = path.endswith("/") # _strip_protocol strips trailing slash
+ path = self._strip_protocol(path)
+ append_slash_to_dirname = ends_with_slash or path.endswith(("/**", "/*"))
+ idx_star = path.find("*") if path.find("*") >= 0 else len(path)
+ idx_brace = path.find("[") if path.find("[") >= 0 else len(path)
+
+ min_idx = min(idx_star, idx_brace)
+
+ detail = kwargs.pop("detail", False)
+
+ if not has_magic(path):
+ if await self._exists(path, **kwargs):
+ if not detail:
+ return [path]
+ else:
+ return {path: await self._info(path, **kwargs)}
+ else:
+ if not detail:
+ return [] # glob of non-existent returns empty
+ else:
+ return {}
+ elif "/" in path[:min_idx]:
+ min_idx = path[:min_idx].rindex("/")
+ root = path[: min_idx + 1]
+ depth = path[min_idx + 1 :].count("/") + 1
+ else:
+ root = ""
+ depth = path[min_idx + 1 :].count("/") + 1
+
+ if "**" in path:
+ if maxdepth is not None:
+ idx_double_stars = path.find("**")
+ depth_double_stars = path[idx_double_stars:].count("/") + 1
+ depth = depth - depth_double_stars + maxdepth
+ else:
+ depth = None
+
+ allpaths = await self._find(
+ root, maxdepth=depth, withdirs=True, detail=True, **kwargs
+ )
+
+ pattern = glob_translate(path + ("/" if ends_with_slash else ""))
+ pattern = re.compile(pattern)
+
+ out = {
+ (
+ p.rstrip("/")
+ if not append_slash_to_dirname
+ and info["type"] == "directory"
+ and p.endswith("/")
+ else p
+ ): info
+ for p, info in sorted(allpaths.items())
+ if pattern.match(p.rstrip("/"))
+ }
+
+ if detail:
+ return out
+ else:
+ return list(out)
+
+ async def _isdir(self, path):
+ # override, since all URLs are (also) files
+ try:
+ return bool(await self._ls(path))
+ except (FileNotFoundError, ValueError):
+ return False
class HTTPFile(AbstractBufferedFile):
@@ -163,18 +543,36 @@ class HTTPFile(AbstractBufferedFile):
kwargs: all other key-values are passed to requests calls.
"""
- def __init__(self, fs, url, session=None, block_size=None, mode='rb',
- cache_type='bytes', cache_options=None, size=None, loop=None,
- asynchronous=False, **kwargs):
- if mode != 'rb':
- raise NotImplementedError('File mode not supported')
+ def __init__(
+ self,
+ fs,
+ url,
+ session=None,
+ block_size=None,
+ mode="rb",
+ cache_type="bytes",
+ cache_options=None,
+ size=None,
+ loop=None,
+ asynchronous=False,
+ **kwargs,
+ ):
+ if mode != "rb":
+ raise NotImplementedError("File mode not supported")
self.asynchronous = asynchronous
self.loop = loop
self.url = url
self.session = session
- self.details = {'name': url, 'size': size, 'type': 'file'}
- super().__init__(fs=fs, path=url, mode=mode, block_size=block_size,
- cache_type=cache_type, cache_options=cache_options, **kwargs)
+ self.details = {"name": url, "size": size, "type": "file"}
+ super().__init__(
+ fs=fs,
+ path=url,
+ mode=mode,
+ block_size=block_size,
+ cache_type=cache_type,
+ cache_options=cache_options,
+ **kwargs,
+ )
def read(self, length=-1):
"""Read bytes from file
@@ -186,7 +584,18 @@ class HTTPFile(AbstractBufferedFile):
file. If the server has not supplied the filesize, attempting to
read only part of the data will raise a ValueError.
"""
- pass
+ if (
+ (length < 0 and self.loc == 0) # explicit read all
+ # but not when the size is known and fits into a block anyways
+ and not (self.size is not None and self.size <= self.blocksize)
+ ):
+ self._fetch_all()
+ if self.size is None:
+ if length < 0:
+ self._fetch_all()
+ else:
+ length = min(self.size - self.loc, length)
+ return super().read(length)
async def async_fetch_all(self):
"""Read whole file in one shot, without caching
@@ -194,12 +603,32 @@ class HTTPFile(AbstractBufferedFile):
This is only called when position is still at zero,
and read() is called without a byte-count.
"""
- pass
+ logger.debug(f"Fetch all for {self}")
+ if not isinstance(self.cache, AllBytes):
+ r = await self.session.get(self.fs.encode_url(self.url), **self.kwargs)
+ async with r:
+ r.raise_for_status()
+ out = await r.read()
+ self.cache = AllBytes(
+ size=len(out), fetcher=None, blocksize=None, data=out
+ )
+ self.size = len(out)
+
_fetch_all = sync_wrapper(async_fetch_all)
def _parse_content_range(self, headers):
"""Parse the Content-Range header"""
- pass
+ s = headers.get("Content-Range", "")
+ m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", s)
+ if not m:
+ return None, None, None
+
+ if m[1] == "*":
+ start = end = None
+ else:
+ start, end = [int(x) for x in m[1].split("-")]
+ total = None if m[2] == "*" else int(m[2])
+ return start, end, total
async def async_fetch_range(self, start, end):
"""Download a block of data
@@ -209,66 +638,235 @@ class HTTPFile(AbstractBufferedFile):
and then stream the output - if the data size is bigger than we
requested, an exception is raised.
"""
- pass
+ logger.debug(f"Fetch range for {self}: {start}-{end}")
+ kwargs = self.kwargs.copy()
+ headers = kwargs.pop("headers", {}).copy()
+ headers["Range"] = f"bytes={start}-{end - 1}"
+ logger.debug(f"{self.url} : {headers['Range']}")
+ r = await self.session.get(
+ self.fs.encode_url(self.url), headers=headers, **kwargs
+ )
+ async with r:
+ if r.status == 416:
+ # range request outside file
+ return b""
+ r.raise_for_status()
+
+ # If the server has handled the range request, it should reply
+ # with status 206 (partial content). But we'll guess that a suitable
+ # Content-Range header or a Content-Length no more than the
+ # requested range also mean we have got the desired range.
+ response_is_range = (
+ r.status == 206
+ or self._parse_content_range(r.headers)[0] == start
+ or int(r.headers.get("Content-Length", end + 1)) <= end - start
+ )
+
+ if response_is_range:
+ # partial content, as expected
+ out = await r.read()
+ elif start > 0:
+ raise ValueError(
+ "The HTTP server doesn't appear to support range requests. "
+ "Only reading this file from the beginning is supported. "
+ "Open with block_size=0 for a streaming file interface."
+ )
+ else:
+ # Response is not a range, but we want the start of the file,
+ # so we can read the required amount anyway.
+ cl = 0
+ out = []
+ while True:
+ chunk = await r.content.read(2**20)
+ # data size unknown, let's read until we have enough
+ if chunk:
+ out.append(chunk)
+ cl += len(chunk)
+ if cl > end - start:
+ break
+ else:
+ break
+ out = b"".join(out)[: end - start]
+ return out
+
_fetch_range = sync_wrapper(async_fetch_range)
def __reduce__(self):
- return reopen, (self.fs, self.url, self.mode, self.blocksize, self.
- cache.name if self.cache else 'none', self.size)
+ return (
+ reopen,
+ (
+ self.fs,
+ self.url,
+ self.mode,
+ self.blocksize,
+ self.cache.name if self.cache else "none",
+ self.size,
+ ),
+ )
-magic_check = re.compile('([*[])')
+def reopen(fs, url, mode, blocksize, cache_type, size=None):
+ return fs.open(
+ url, mode=mode, block_size=blocksize, cache_type=cache_type, size=size
+ )
-class HTTPStreamFile(AbstractBufferedFile):
+magic_check = re.compile("([*[])")
+
+
+def has_magic(s):
+ match = magic_check.search(s)
+ return match is not None
+
- def __init__(self, fs, url, mode='rb', loop=None, session=None, **kwargs):
- self.asynchronous = kwargs.pop('asynchronous', False)
+class HTTPStreamFile(AbstractBufferedFile):
+ def __init__(self, fs, url, mode="rb", loop=None, session=None, **kwargs):
+ self.asynchronous = kwargs.pop("asynchronous", False)
self.url = url
self.loop = loop
self.session = session
- if mode != 'rb':
+ if mode != "rb":
raise ValueError
- self.details = {'name': url, 'size': None}
- super().__init__(fs=fs, path=url, mode=mode, cache_type='none', **
- kwargs)
+ self.details = {"name": url, "size": None}
+ super().__init__(fs=fs, path=url, mode=mode, cache_type="none", **kwargs)
async def cor():
- r = await self.session.get(self.fs.encode_url(url), **kwargs
- ).__aenter__()
+ r = await self.session.get(self.fs.encode_url(url), **kwargs).__aenter__()
self.fs._raise_not_found_for_status(r, url)
return r
+
self.r = sync(self.loop, cor)
self.loop = fs.loop
+
+ def seek(self, loc, whence=0):
+ if loc == 0 and whence == 1:
+ return
+ if loc == self.loc and whence == 0:
+ return
+ raise ValueError("Cannot seek streaming HTTP file")
+
+ async def _read(self, num=-1):
+ out = await self.r.content.read(num)
+ self.loc += len(out)
+ return out
+
read = sync_wrapper(_read)
+ async def _close(self):
+ self.r.close()
+
+ def close(self):
+ asyncio.run_coroutine_threadsafe(self._close(), self.loop)
+ super().close()
+
def __reduce__(self):
- return reopen, (self.fs, self.url, self.mode, self.blocksize, self.
- cache.name)
+ return reopen, (self.fs, self.url, self.mode, self.blocksize, self.cache.name)
class AsyncStreamFile(AbstractAsyncStreamedFile):
-
- def __init__(self, fs, url, mode='rb', loop=None, session=None, size=
- None, **kwargs):
+ def __init__(
+ self, fs, url, mode="rb", loop=None, session=None, size=None, **kwargs
+ ):
self.url = url
self.session = session
self.r = None
- if mode != 'rb':
+ if mode != "rb":
raise ValueError
- self.details = {'name': url, 'size': None}
+ self.details = {"name": url, "size": None}
self.kwargs = kwargs
- super().__init__(fs=fs, path=url, mode=mode, cache_type='none')
+ super().__init__(fs=fs, path=url, mode=mode, cache_type="none")
self.size = size
-
-async def _file_info(url, session, size_policy='head', **kwargs):
+ async def read(self, num=-1):
+ if self.r is None:
+ r = await self.session.get(
+ self.fs.encode_url(self.url), **self.kwargs
+ ).__aenter__()
+ self.fs._raise_not_found_for_status(r, self.url)
+ self.r = r
+ out = await self.r.content.read(num)
+ self.loc += len(out)
+ return out
+
+ async def close(self):
+ if self.r is not None:
+ self.r.close()
+ self.r = None
+ await super().close()
+
+
+async def get_range(session, url, start, end, file=None, **kwargs):
+ # explicit get a range when we know it must be safe
+ kwargs = kwargs.copy()
+ headers = kwargs.pop("headers", {}).copy()
+ headers["Range"] = f"bytes={start}-{end - 1}"
+ r = await session.get(url, headers=headers, **kwargs)
+ r.raise_for_status()
+ async with r:
+ out = await r.read()
+ if file:
+ with open(file, "r+b") as f: # noqa: ASYNC101
+ f.seek(start)
+ f.write(out)
+ else:
+ return out
+
+
+async def _file_info(url, session, size_policy="head", **kwargs):
"""Call HEAD on the server to get details about the file (size/checksum etc.)
Default operation is to explicitly allow redirects and use encoding
'identity' (no compression) to get the true size of the target.
"""
- pass
+ logger.debug("Retrieve file size for %s", url)
+ kwargs = kwargs.copy()
+ ar = kwargs.pop("allow_redirects", True)
+ head = kwargs.get("headers", {}).copy()
+ head["Accept-Encoding"] = "identity"
+ kwargs["headers"] = head
+
+ info = {}
+ if size_policy == "head":
+ r = await session.head(url, allow_redirects=ar, **kwargs)
+ elif size_policy == "get":
+ r = await session.get(url, allow_redirects=ar, **kwargs)
+ else:
+ raise TypeError(f'size_policy must be "head" or "get", got {size_policy}')
+ async with r:
+ r.raise_for_status()
+
+ # TODO:
+ # recognise lack of 'Accept-Ranges',
+ # or 'Accept-Ranges': 'none' (not 'bytes')
+ # to mean streaming only, no random access => return None
+ if "Content-Length" in r.headers:
+ # Some servers may choose to ignore Accept-Encoding and return
+ # compressed content, in which case the returned size is unreliable.
+ if "Content-Encoding" not in r.headers or r.headers["Content-Encoding"] in [
+ "identity",
+ "",
+ ]:
+ info["size"] = int(r.headers["Content-Length"])
+ elif "Content-Range" in r.headers:
+ info["size"] = int(r.headers["Content-Range"].split("/")[1])
+
+ if "Content-Type" in r.headers:
+ info["mimetype"] = r.headers["Content-Type"].partition(";")[0]
+
+ info["url"] = str(r.url)
+
+ for checksum_field in ["ETag", "Content-MD5", "Digest"]:
+ if r.headers.get(checksum_field):
+ info[checksum_field] = r.headers[checksum_field]
+
+ return info
+
+
+async def _file_size(url, session=None, *args, **kwargs):
+ if session is None:
+ session = await get_client()
+ info = await _file_info(url, session=session, *args, **kwargs)
+ return info.get("size")
file_size = sync_wrapper(_file_size)
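
A sketch of how the HTTP pieces above fit together; the URL is a placeholder and nothing is actually fetched here:

```python
import fsspec

fs = fsspec.filesystem("http")                 # "https" URLs use the same class

url = "https://example.com/data.bin"           # placeholder
info = fs.info(url)                            # tries HEAD first, then GET (_file_info)
print(info["size"], info.get("mimetype"))

with fs.open(url, block_size=2**20) as f:      # HTTPFile issues Range requests per block
    first_kb = f.read(1024)

# With block_size=0, or when no size can be determined, a streaming,
# non-seekable HTTPStreamFile is returned instead.
```
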
diff --git a/fsspec/implementations/jupyter.py b/fsspec/implementations/jupyter.py
index 7da1be6..2839f4c 100644
--- a/fsspec/implementations/jupyter.py
+++ b/fsspec/implementations/jupyter.py
@@ -1,13 +1,16 @@
import base64
import io
import re
+
import requests
+
import fsspec
class JupyterFileSystem(fsspec.AbstractFileSystem):
"""View of the files as seen by a Jupyter server (notebook or lab)"""
- protocol = 'jupyter', 'jlab'
+
+ protocol = ("jupyter", "jlab")
def __init__(self, url, tok=None, **kwargs):
"""
@@ -21,25 +24,101 @@ class JupyterFileSystem(fsspec.AbstractFileSystem):
If the token is obtained separately, can be given here
kwargs
"""
- if '?' in url:
+ if "?" in url:
if tok is None:
try:
- tok = re.findall('token=([a-z0-9]+)', url)[0]
+ tok = re.findall("token=([a-z0-9]+)", url)[0]
except IndexError as e:
- raise ValueError('Could not determine token') from e
- url = url.split('?', 1)[0]
- self.url = url.rstrip('/') + '/api/contents'
+ raise ValueError("Could not determine token") from e
+ url = url.split("?", 1)[0]
+ self.url = url.rstrip("/") + "/api/contents"
self.session = requests.Session()
if tok:
- self.session.headers['Authorization'] = f'token {tok}'
+ self.session.headers["Authorization"] = f"token {tok}"
+
super().__init__(**kwargs)
+ def ls(self, path, detail=True, **kwargs):
+ path = self._strip_protocol(path)
+ r = self.session.get(f"{self.url}/{path}")
+ if r.status_code == 404:
+ return FileNotFoundError(path)
+ r.raise_for_status()
+ out = r.json()
-class SimpleFileWriter(fsspec.spec.AbstractBufferedFile):
+ if out["type"] == "directory":
+ out = out["content"]
+ else:
+ out = [out]
+ for o in out:
+ o["name"] = o.pop("path")
+ o.pop("content")
+ if o["type"] == "notebook":
+ o["type"] = "file"
+ if detail:
+ return out
+ return [o["name"] for o in out]
+
+ def cat_file(self, path, start=None, end=None, **kwargs):
+ path = self._strip_protocol(path)
+ r = self.session.get(f"{self.url}/{path}")
+ if r.status_code == 404:
+ return FileNotFoundError(path)
+ r.raise_for_status()
+ out = r.json()
+ if out["format"] == "text":
+ # data should be binary
+ b = out["content"].encode()
+ else:
+ b = base64.b64decode(out["content"])
+ return b[start:end]
+ def pipe_file(self, path, value, **_):
+ path = self._strip_protocol(path)
+ json = {
+ "name": path.rsplit("/", 1)[-1],
+ "path": path,
+ "size": len(value),
+ "content": base64.b64encode(value).decode(),
+ "format": "base64",
+ "type": "file",
+ }
+ self.session.put(f"{self.url}/{path}", json=json)
+
+ def mkdir(self, path, create_parents=True, **kwargs):
+ path = self._strip_protocol(path)
+ if create_parents and "/" in path:
+ self.mkdir(path.rsplit("/", 1)[0], True)
+ json = {
+ "name": path.rsplit("/", 1)[-1],
+ "path": path,
+ "size": None,
+ "content": None,
+ "type": "directory",
+ }
+ self.session.put(f"{self.url}/{path}", json=json)
+
+ def _rm(self, path):
+ path = self._strip_protocol(path)
+ self.session.delete(f"{self.url}/{path}")
+
+ def _open(self, path, mode="rb", **kwargs):
+ path = self._strip_protocol(path)
+ if mode == "rb":
+ data = self.cat_file(path)
+ return io.BytesIO(data)
+ else:
+ return SimpleFileWriter(self, path, mode="wb")
+
+
+class SimpleFileWriter(fsspec.spec.AbstractBufferedFile):
def _upload_chunk(self, final=False):
"""Never uploads a chunk until file is done
Not suitable for large files
"""
- pass
+ if final is False:
+ return False
+ self.buffer.seek(0)
+ data = self.buffer.read()
+ self.fs.pipe_file(self.path, data)
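
As a usage illustration of the contents-API round trip above, the following hedged sketch assumes a Jupyter server reachable at a hypothetical URL/token; the file and directory names are made up.

```python
import fsspec

# Hypothetical server URL and token; a running Jupyter server is assumed.
fs = fsspec.filesystem("jupyter", url="http://localhost:8888/?token=abc123")

fs.mkdir("demo")                       # PUT a directory entry via /api/contents
fs.pipe_file("demo/hello.txt", b"hi")  # payload is base64-encoded into the JSON body
print(fs.ls("demo", detail=False))     # e.g. ["demo/hello.txt"]
print(fs.cat_file("demo/hello.txt"))   # b"hi"

# Writes buffer in memory and are uploaded once on close via SimpleFileWriter
with fs.open("demo/blob.bin", "wb") as f:
    f.write(b"\x00" * 1024)
```
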
diff --git a/fsspec/implementations/libarchive.py b/fsspec/implementations/libarchive.py
index c2101dc..eb6f145 100644
--- a/fsspec/implementations/libarchive.py
+++ b/fsspec/implementations/libarchive.py
@@ -1,25 +1,72 @@
from contextlib import contextmanager
-from ctypes import CFUNCTYPE, POINTER, c_int, c_longlong, c_void_p, cast, create_string_buffer
+from ctypes import (
+ CFUNCTYPE,
+ POINTER,
+ c_int,
+ c_longlong,
+ c_void_p,
+ cast,
+ create_string_buffer,
+)
+
import libarchive
import libarchive.ffi as ffi
+
from fsspec import open_files
from fsspec.archive import AbstractArchiveFileSystem
from fsspec.implementations.memory import MemoryFile
from fsspec.utils import DEFAULT_BLOCK_SIZE
+
+# Libarchive requires seekable files or memory only for certain archive
+# types. However, since we read the directory first to cache the contents
+# and also allow random access to any file, the file-like object needs
+# to be seekable no matter what.
+
+# Seek call-backs (not provided in the libarchive python wrapper)
SEEK_CALLBACK = CFUNCTYPE(c_longlong, c_int, c_void_p, c_longlong, c_int)
-read_set_seek_callback = ffi.ffi('read_set_seek_callback', [ffi.c_archive_p,
- SEEK_CALLBACK], c_int, ffi.check_int)
-new_api = hasattr(ffi, 'NO_OPEN_CB')
+read_set_seek_callback = ffi.ffi(
+ "read_set_seek_callback", [ffi.c_archive_p, SEEK_CALLBACK], c_int, ffi.check_int
+)
+new_api = hasattr(ffi, "NO_OPEN_CB")
@contextmanager
-def custom_reader(file, format_name='all', filter_name='all', block_size=
- ffi.page_size):
+def custom_reader(file, format_name="all", filter_name="all", block_size=ffi.page_size):
"""Read an archive from a seekable file-like object.
The `file` object must support the standard `readinto` and 'seek' methods.
"""
- pass
+ buf = create_string_buffer(block_size)
+ buf_p = cast(buf, c_void_p)
+
+ def read_func(archive_p, context, ptrptr):
+ # readinto the buffer, returns number of bytes read
+ length = file.readinto(buf)
+ # write the address of the buffer into the pointer
+ ptrptr = cast(ptrptr, POINTER(c_void_p))
+ ptrptr[0] = buf_p
+ # tell libarchive how much data was written into the buffer
+ return length
+
+ def seek_func(archive_p, context, offset, whence):
+ file.seek(offset, whence)
+        # tell libarchive the current position
+ return file.tell()
+
+ read_cb = ffi.READ_CALLBACK(read_func)
+ seek_cb = SEEK_CALLBACK(seek_func)
+
+ if new_api:
+ open_cb = ffi.NO_OPEN_CB
+ close_cb = ffi.NO_CLOSE_CB
+ else:
+ open_cb = libarchive.read.OPEN_CALLBACK(ffi.VOID_CB)
+ close_cb = libarchive.read.CLOSE_CALLBACK(ffi.VOID_CB)
+
+ with libarchive.read.new_archive_read(format_name, filter_name) as archive_p:
+ read_set_seek_callback(archive_p, seek_cb)
+ ffi.read_open(archive_p, None, open_cb, read_cb, close_cb)
+ yield libarchive.read.ArchiveRead(archive_p)
class LibArchiveFileSystem(AbstractArchiveFileSystem):
@@ -39,12 +86,20 @@ class LibArchiveFileSystem(AbstractArchiveFileSystem):
This class is pickleable, but not necessarily thread-safe (depends on the
platform). See libarchive documentation for details.
"""
- root_marker = ''
- protocol = 'libarchive'
+
+ root_marker = ""
+ protocol = "libarchive"
cachable = False
- def __init__(self, fo='', mode='r', target_protocol=None,
- target_options=None, block_size=DEFAULT_BLOCK_SIZE, **kwargs):
+ def __init__(
+ self,
+ fo="",
+ mode="r",
+ target_protocol=None,
+ target_options=None,
+ block_size=DEFAULT_BLOCK_SIZE,
+ **kwargs,
+ ):
"""
Parameters
----------
@@ -61,17 +116,98 @@ class LibArchiveFileSystem(AbstractArchiveFileSystem):
a string.
"""
super().__init__(self, **kwargs)
- if mode != 'r':
- raise ValueError('Only read from archive files accepted')
+ if mode != "r":
+ raise ValueError("Only read from archive files accepted")
if isinstance(fo, str):
- files = open_files(fo, protocol=target_protocol, **
- target_options or {})
+ files = open_files(fo, protocol=target_protocol, **(target_options or {}))
if len(files) != 1:
raise ValueError(
f'Path "{fo}" did not resolve to exactly one file: "{files}"'
- )
+ )
fo = files[0]
self.of = fo
- self.fo = fo.__enter__()
+ self.fo = fo.__enter__() # the whole instance is a context
self.block_size = block_size
self.dir_cache = None
+
+ @contextmanager
+ def _open_archive(self):
+ self.fo.seek(0)
+ with custom_reader(self.fo, block_size=self.block_size) as arc:
+ yield arc
+
+ @classmethod
+ def _strip_protocol(cls, path):
+ # file paths are always relative to the archive root
+ return super()._strip_protocol(path).lstrip("/")
+
+ def _get_dirs(self):
+ fields = {
+ "name": "pathname",
+ "size": "size",
+ "created": "ctime",
+ "mode": "mode",
+ "uid": "uid",
+ "gid": "gid",
+ "mtime": "mtime",
+ }
+
+ if self.dir_cache is not None:
+ return
+
+ self.dir_cache = {}
+ list_names = []
+ with self._open_archive() as arc:
+ for entry in arc:
+ if not entry.isdir and not entry.isfile:
+ # Skip symbolic links, fifo entries, etc.
+ continue
+ self.dir_cache.update(
+ {
+ dirname: {"name": dirname, "size": 0, "type": "directory"}
+ for dirname in self._all_dirnames(set(entry.name))
+ }
+ )
+ f = {key: getattr(entry, fields[key]) for key in fields}
+ f["type"] = "directory" if entry.isdir else "file"
+ list_names.append(entry.name)
+
+ self.dir_cache[f["name"]] = f
+ # libarchive does not seem to return an entry for the directories (at least
+        # not in all formats), so derive the directory names from the file names
+ self.dir_cache.update(
+ {
+ dirname: {"name": dirname, "size": 0, "type": "directory"}
+ for dirname in self._all_dirnames(list_names)
+ }
+ )
+
+ def _open(
+ self,
+ path,
+ mode="rb",
+ block_size=None,
+ autocommit=True,
+ cache_options=None,
+ **kwargs,
+ ):
+ path = self._strip_protocol(path)
+ if mode != "rb":
+ raise NotImplementedError
+
+ data = bytes()
+ with self._open_archive() as arc:
+ for entry in arc:
+ if entry.pathname != path:
+ continue
+
+ if entry.size == 0:
+ # empty file, so there are no blocks
+ break
+
+ for block in entry.get_blocks(entry.size):
+ data = block
+ break
+ else:
+ raise ValueError
+ return MemoryFile(fs=self, path=path, data=data)
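
A short, hedged sketch of how the read-only archive access above is typically used; it assumes the optional `libarchive-c` dependency is installed and that `archive.tar` and the member path exist (both are hypothetical here).

```python
import fsspec

# Assumes the `libarchive-c` package plus a local "archive.tar" (hypothetical).
fs = fsspec.filesystem("libarchive", fo="archive.tar")

print(fs.ls("", detail=False))         # member paths, relative to the archive root
with fs.open("docs/readme.txt") as f:  # hypothetical member; loaded fully into memory
    print(f.read())
```
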
diff --git a/fsspec/implementations/local.py b/fsspec/implementations/local.py
index af01bea..9881606 100644
--- a/fsspec/implementations/local.py
+++ b/fsspec/implementations/local.py
@@ -6,11 +6,13 @@ import os.path as osp
import shutil
import stat
import tempfile
+
from fsspec import AbstractFileSystem
from fsspec.compression import compr
from fsspec.core import get_compression
from fsspec.utils import isfilelike, stringify_path
-logger = logging.getLogger('fsspec.local')
+
+logger = logging.getLogger("fsspec.local")
class LocalFileSystem(AbstractFileSystem):
@@ -23,18 +25,307 @@ class LocalFileSystem(AbstractFileSystem):
be created (if it doesn't already exist). This is assumed by pyarrow
code.
"""
- root_marker = '/'
- protocol = 'file', 'local'
+
+ root_marker = "/"
+ protocol = "file", "local"
local_file = True
def __init__(self, auto_mkdir=False, **kwargs):
super().__init__(**kwargs)
self.auto_mkdir = auto_mkdir
+ @property
+ def fsid(self):
+ return "local"
+
+ def mkdir(self, path, create_parents=True, **kwargs):
+ path = self._strip_protocol(path)
+ if self.exists(path):
+ raise FileExistsError(path)
+ if create_parents:
+ self.makedirs(path, exist_ok=True)
+ else:
+ os.mkdir(path, **kwargs)
+
+ def makedirs(self, path, exist_ok=False):
+ path = self._strip_protocol(path)
+ os.makedirs(path, exist_ok=exist_ok)
+
+ def rmdir(self, path):
+ path = self._strip_protocol(path)
+ os.rmdir(path)
+
+ def ls(self, path, detail=False, **kwargs):
+ path = self._strip_protocol(path)
+ info = self.info(path)
+ if info["type"] == "directory":
+ with os.scandir(path) as it:
+ infos = [self.info(f) for f in it]
+ else:
+ infos = [info]
+
+ if not detail:
+ return [i["name"] for i in infos]
+ return infos
+
+ def info(self, path, **kwargs):
+ if isinstance(path, os.DirEntry):
+ # scandir DirEntry
+ out = path.stat(follow_symlinks=False)
+ link = path.is_symlink()
+ if path.is_dir(follow_symlinks=False):
+ t = "directory"
+ elif path.is_file(follow_symlinks=False):
+ t = "file"
+ else:
+ t = "other"
+ path = self._strip_protocol(path.path)
+ else:
+ # str or path-like
+ path = self._strip_protocol(path)
+ out = os.stat(path, follow_symlinks=False)
+ link = stat.S_ISLNK(out.st_mode)
+ if link:
+ out = os.stat(path, follow_symlinks=True)
+ if stat.S_ISDIR(out.st_mode):
+ t = "directory"
+ elif stat.S_ISREG(out.st_mode):
+ t = "file"
+ else:
+ t = "other"
+ result = {
+ "name": path,
+ "size": out.st_size,
+ "type": t,
+ "created": out.st_ctime,
+ "islink": link,
+ }
+ for field in ["mode", "uid", "gid", "mtime", "ino", "nlink"]:
+ result[field] = getattr(out, f"st_{field}")
+ if result["islink"]:
+ result["destination"] = os.readlink(path)
+ try:
+ out2 = os.stat(path, follow_symlinks=True)
+ result["size"] = out2.st_size
+ except OSError:
+ result["size"] = 0
+ return result
+
+ def lexists(self, path, **kwargs):
+ return osp.lexists(path)
+
+ def cp_file(self, path1, path2, **kwargs):
+ path1 = self._strip_protocol(path1)
+ path2 = self._strip_protocol(path2)
+ if self.auto_mkdir:
+ self.makedirs(self._parent(path2), exist_ok=True)
+ if self.isfile(path1):
+ shutil.copyfile(path1, path2)
+ elif self.isdir(path1):
+ self.mkdirs(path2, exist_ok=True)
+ else:
+ raise FileNotFoundError(path1)
+
+ def isfile(self, path):
+ path = self._strip_protocol(path)
+ return os.path.isfile(path)
+
+ def isdir(self, path):
+ path = self._strip_protocol(path)
+ return os.path.isdir(path)
+
+ def get_file(self, path1, path2, callback=None, **kwargs):
+ if isfilelike(path2):
+ with open(path1, "rb") as f:
+ shutil.copyfileobj(f, path2)
+ else:
+ return self.cp_file(path1, path2, **kwargs)
+
+ def put_file(self, path1, path2, callback=None, **kwargs):
+ return self.cp_file(path1, path2, **kwargs)
+
+ def mv(self, path1, path2, **kwargs):
+ path1 = self._strip_protocol(path1)
+ path2 = self._strip_protocol(path2)
+ shutil.move(path1, path2)
+
+ def link(self, src, dst, **kwargs):
+ src = self._strip_protocol(src)
+ dst = self._strip_protocol(dst)
+ os.link(src, dst, **kwargs)
+
+ def symlink(self, src, dst, **kwargs):
+ src = self._strip_protocol(src)
+ dst = self._strip_protocol(dst)
+ os.symlink(src, dst, **kwargs)
+
+ def islink(self, path) -> bool:
+ return os.path.islink(self._strip_protocol(path))
+
+ def rm_file(self, path):
+ os.remove(self._strip_protocol(path))
+
+ def rm(self, path, recursive=False, maxdepth=None):
+ if not isinstance(path, list):
+ path = [path]
+
+ for p in path:
+ p = self._strip_protocol(p)
+ if self.isdir(p):
+ if not recursive:
+ raise ValueError("Cannot delete directory, set recursive=True")
+ if osp.abspath(p) == os.getcwd():
+ raise ValueError("Cannot delete current working directory")
+ shutil.rmtree(p)
+ else:
+ os.remove(p)
+
+ def unstrip_protocol(self, name):
+ name = self._strip_protocol(name) # normalise for local/win/...
+ return f"file://{name}"
+
+ def _open(self, path, mode="rb", block_size=None, **kwargs):
+ path = self._strip_protocol(path)
+ if self.auto_mkdir and "w" in mode:
+ self.makedirs(self._parent(path), exist_ok=True)
+ return LocalFileOpener(path, mode, fs=self, **kwargs)
+
+ def touch(self, path, truncate=True, **kwargs):
+ path = self._strip_protocol(path)
+ if self.auto_mkdir:
+ self.makedirs(self._parent(path), exist_ok=True)
+ if self.exists(path):
+ os.utime(path, None)
+ else:
+ open(path, "a").close()
+ if truncate:
+ os.truncate(path, 0)
+
+ def created(self, path):
+ info = self.info(path=path)
+ return datetime.datetime.fromtimestamp(
+ info["created"], tz=datetime.timezone.utc
+ )
+
+ def modified(self, path):
+ info = self.info(path=path)
+ return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
+
+ @classmethod
+ def _parent(cls, path):
+ path = cls._strip_protocol(path)
+ if os.sep == "/":
+ # posix native
+ return path.rsplit("/", 1)[0] or "/"
+ else:
+ # NT
+ path_ = path.rsplit("/", 1)[0]
+ if len(path_) <= 3:
+ if path_[1:2] == ":":
+ # nt root (something like c:/)
+ return path_[0] + ":/"
+ # More cases may be required here
+ return path_
+
+ @classmethod
+ def _strip_protocol(cls, path):
+ path = stringify_path(path)
+ if path.startswith("file://"):
+ path = path[7:]
+ elif path.startswith("file:"):
+ path = path[5:]
+ elif path.startswith("local://"):
+ path = path[8:]
+ elif path.startswith("local:"):
+ path = path[6:]
+
+ path = make_path_posix(path)
+ if os.sep != "/":
+ # This code-path is a stripped down version of
+ # > drive, path = ntpath.splitdrive(path)
+ if path[1:2] == ":":
+ # Absolute drive-letter path, e.g. X:\Windows
+ # Relative path with drive, e.g. X:Windows
+ drive, path = path[:2], path[2:]
+ elif path[:2] == "//":
+ # UNC drives, e.g. \\server\share or \\?\UNC\server\share
+ # Device drives, e.g. \\.\device or \\?\device
+ if (index1 := path.find("/", 2)) == -1 or (
+ index2 := path.find("/", index1 + 1)
+ ) == -1:
+ drive, path = path, ""
+ else:
+ drive, path = path[:index2], path[index2:]
+ else:
+ # Relative path, e.g. Windows
+ drive = ""
+
+ path = path.rstrip("/") or cls.root_marker
+ return drive + path
+
+ else:
+ return path.rstrip("/") or cls.root_marker
+
+ def _isfilestore(self):
+        # Inheriting from DaskFileSystem makes this False (S3, etc. were the
+        # original motivation), but we are a posix-like file system.
+ # See https://github.com/dask/dask/issues/5526
+ return True
+
+ def chmod(self, path, mode):
+ path = stringify_path(path)
+ return os.chmod(path, mode)
+
def make_path_posix(path):
"""Make path generic and absolute for current OS"""
- pass
+ if not isinstance(path, str):
+ if isinstance(path, (list, set, tuple)):
+ return type(path)(make_path_posix(p) for p in path)
+ else:
+ path = stringify_path(path)
+ if not isinstance(path, str):
+ raise TypeError(f"could not convert {path!r} to string")
+ if os.sep == "/":
+ # Native posix
+ if path.startswith("/"):
+ # most common fast case for posix
+ return path
+ elif path.startswith("~"):
+ return osp.expanduser(path)
+ elif path.startswith("./"):
+ path = path[2:]
+ elif path == ".":
+ path = ""
+ return f"{os.getcwd()}/{path}"
+ else:
+ # NT handling
+ if path[0:1] == "/" and path[2:3] == ":":
+ # path is like "/c:/local/path"
+ path = path[1:]
+ if path[1:2] == ":":
+ # windows full path like "C:\\local\\path"
+ if len(path) <= 3:
+ # nt root (something like c:/)
+ return path[0] + ":/"
+ path = path.replace("\\", "/")
+ return path
+ elif path[0:1] == "~":
+ return make_path_posix(osp.expanduser(path))
+ elif path.startswith(("\\\\", "//")):
+ # windows UNC/DFS-style paths
+ return "//" + path[2:].replace("\\", "/")
+ elif path.startswith(("\\", "/")):
+ # windows relative path with root
+ path = path.replace("\\", "/")
+ return f"{osp.splitdrive(os.getcwd())[0]}{path}"
+ else:
+ path = path.replace("\\", "/")
+ if path.startswith("./"):
+ path = path[2:]
+ elif path == ".":
+ path = ""
+ return f"{make_path_posix(os.getcwd())}/{path}"
def trailing_sep(path):
@@ -43,14 +334,17 @@ def trailing_sep(path):
A forward slash is always considered a path separator, even on Operating
Systems that normally use a backslash.
"""
- pass
+ # TODO: if all incoming paths were posix-compliant then separator would
+ # always be a forward slash, simplifying this function.
+ # See https://github.com/fsspec/filesystem_spec/pull/1250
+ return path.endswith(os.sep) or (os.altsep is not None and path.endswith(os.altsep))
class LocalFileOpener(io.IOBase):
-
- def __init__(self, path, mode, autocommit=True, fs=None, compression=
- None, **kwargs):
- logger.debug('open file: %s', path)
+ def __init__(
+ self, path, mode, autocommit=True, fs=None, compression=None, **kwargs
+ ):
+ logger.debug("open file: %s", path)
self.path = path
self.mode = mode
self.fs = fs
@@ -60,24 +354,104 @@ class LocalFileOpener(io.IOBase):
self.blocksize = io.DEFAULT_BUFFER_SIZE
self._open()
+ def _open(self):
+ if self.f is None or self.f.closed:
+ if self.autocommit or "w" not in self.mode:
+ self.f = open(self.path, mode=self.mode)
+ if self.compression:
+ compress = compr[self.compression]
+ self.f = compress(self.f, mode=self.mode)
+ else:
+ # TODO: check if path is writable?
+ i, name = tempfile.mkstemp()
+ os.close(i) # we want normal open and normal buffered file
+ self.temp = name
+ self.f = open(name, mode=self.mode)
+ if "w" not in self.mode:
+ self.size = self.f.seek(0, 2)
+ self.f.seek(0)
+ self.f.size = self.size
+
+ def _fetch_range(self, start, end):
+ # probably only used by cached FS
+ if "r" not in self.mode:
+ raise ValueError
+ self._open()
+ self.f.seek(start)
+ return self.f.read(end - start)
+
def __setstate__(self, state):
self.f = None
- loc = state.pop('loc', None)
+ loc = state.pop("loc", None)
self.__dict__.update(state)
- if 'r' in state['mode']:
+ if "r" in state["mode"]:
self.f = None
self._open()
self.f.seek(loc)
def __getstate__(self):
d = self.__dict__.copy()
- d.pop('f')
- if 'r' in self.mode:
- d['loc'] = self.f.tell()
- elif not self.f.closed:
- raise ValueError('Cannot serialise open write-mode local file')
+ d.pop("f")
+ if "r" in self.mode:
+ d["loc"] = self.f.tell()
+ else:
+ if not self.f.closed:
+ raise ValueError("Cannot serialise open write-mode local file")
return d
+ def commit(self):
+ if self.autocommit:
+ raise RuntimeError("Can only commit if not already set to autocommit")
+ shutil.move(self.temp, self.path)
+
+ def discard(self):
+ if self.autocommit:
+ raise RuntimeError("Cannot discard if set to autocommit")
+ os.remove(self.temp)
+
+ def readable(self) -> bool:
+ return True
+
+ def writable(self) -> bool:
+ return "r" not in self.mode
+
+ def read(self, *args, **kwargs):
+ return self.f.read(*args, **kwargs)
+
+ def write(self, *args, **kwargs):
+ return self.f.write(*args, **kwargs)
+
+ def tell(self, *args, **kwargs):
+ return self.f.tell(*args, **kwargs)
+
+ def seek(self, *args, **kwargs):
+ return self.f.seek(*args, **kwargs)
+
+ def seekable(self, *args, **kwargs):
+ return self.f.seekable(*args, **kwargs)
+
+ def readline(self, *args, **kwargs):
+ return self.f.readline(*args, **kwargs)
+
+ def readlines(self, *args, **kwargs):
+ return self.f.readlines(*args, **kwargs)
+
+ def close(self):
+ return self.f.close()
+
+ def truncate(self, size=None) -> int:
+ return self.f.truncate(size)
+
+ @property
+ def closed(self):
+ return self.f.closed
+
+ def fileno(self):
+ return self.raw.fileno()
+
+ def flush(self) -> None:
+ self.f.flush()
+
def __iter__(self):
return self.f.__iter__()
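
The following self-contained sketch exercises a few of the LocalFileSystem pieces implemented above (`auto_mkdir`, `info`, `ls`) together with `make_path_posix`; it only writes inside a temporary directory.

```python
import os
import tempfile

import fsspec
from fsspec.implementations.local import make_path_posix

fs = fsspec.filesystem("file", auto_mkdir=True)

with tempfile.TemporaryDirectory() as d:
    target = os.path.join(d, "sub", "data.bin")
    with fs.open(target, "wb") as f:   # auto_mkdir creates the "sub" directory
        f.write(b"abc")
    info = fs.info(target)
    print(info["type"], info["size"])  # file 3
    print(fs.ls(d, detail=False))      # the "sub" directory, as an absolute posix-style path

# make_path_posix normalises separators and makes relative paths absolute
print(make_path_posix("./relative/path"))
```
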
diff --git a/fsspec/implementations/memory.py b/fsspec/implementations/memory.py
index e1fdbd3..83e7e74 100644
--- a/fsspec/implementations/memory.py
+++ b/fsspec/implementations/memory.py
@@ -1,14 +1,17 @@
from __future__ import annotations
+
import logging
from datetime import datetime, timezone
from errno import ENOTEMPTY
from io import BytesIO
from pathlib import PurePath, PureWindowsPath
from typing import Any, ClassVar
+
from fsspec import AbstractFileSystem
from fsspec.implementations.local import LocalFileSystem
from fsspec.utils import stringify_path
-logger = logging.getLogger('fsspec.memoryfs')
+
+logger = logging.getLogger("fsspec.memoryfs")
class MemoryFileSystem(AbstractFileSystem):
@@ -17,17 +20,251 @@ class MemoryFileSystem(AbstractFileSystem):
This is a global filesystem so instances of this class all point to the same
in memory filesystem.
"""
- store: ClassVar[dict[str, Any]] = {}
- pseudo_dirs = ['']
- protocol = 'memory'
- root_marker = '/'
+
+ store: ClassVar[dict[str, Any]] = {} # global, do not overwrite!
+ pseudo_dirs = [""] # global, do not overwrite!
+ protocol = "memory"
+ root_marker = "/"
+
+ @classmethod
+ def _strip_protocol(cls, path):
+ if isinstance(path, PurePath):
+ if isinstance(path, PureWindowsPath):
+ return LocalFileSystem._strip_protocol(path)
+ else:
+ path = stringify_path(path)
+
+ if path.startswith("memory://"):
+ path = path[len("memory://") :]
+ if "::" in path or "://" in path:
+ return path.rstrip("/")
+ path = path.lstrip("/").rstrip("/")
+ return "/" + path if path else ""
+
+ def ls(self, path, detail=True, **kwargs):
+ path = self._strip_protocol(path)
+ if path in self.store:
+ # there is a key with this exact name
+ if not detail:
+ return [path]
+ return [
+ {
+ "name": path,
+ "size": self.store[path].size,
+ "type": "file",
+ "created": self.store[path].created.timestamp(),
+ }
+ ]
+ paths = set()
+ starter = path + "/"
+ out = []
+ for p2 in tuple(self.store):
+ if p2.startswith(starter):
+ if "/" not in p2[len(starter) :]:
+ # exact child
+ out.append(
+ {
+ "name": p2,
+ "size": self.store[p2].size,
+ "type": "file",
+ "created": self.store[p2].created.timestamp(),
+ }
+ )
+ elif len(p2) > len(starter):
+ # implied child directory
+ ppath = starter + p2[len(starter) :].split("/", 1)[0]
+ if ppath not in paths:
+ out = out or []
+ out.append(
+ {
+ "name": ppath,
+ "size": 0,
+ "type": "directory",
+ }
+ )
+ paths.add(ppath)
+ for p2 in self.pseudo_dirs:
+ if p2.startswith(starter):
+ if "/" not in p2[len(starter) :]:
+ # exact child pdir
+ if p2 not in paths:
+ out.append({"name": p2, "size": 0, "type": "directory"})
+ paths.add(p2)
+ else:
+ # directory implied by deeper pdir
+ ppath = starter + p2[len(starter) :].split("/", 1)[0]
+ if ppath not in paths:
+ out.append({"name": ppath, "size": 0, "type": "directory"})
+ paths.add(ppath)
+ if not out:
+ if path in self.pseudo_dirs:
+ # empty dir
+ return []
+ raise FileNotFoundError(path)
+ if detail:
+ return out
+ return sorted([f["name"] for f in out])
+
+ def mkdir(self, path, create_parents=True, **kwargs):
+ path = self._strip_protocol(path)
+ if path in self.store or path in self.pseudo_dirs:
+ raise FileExistsError(path)
+ if self._parent(path).strip("/") and self.isfile(self._parent(path)):
+ raise NotADirectoryError(self._parent(path))
+ if create_parents and self._parent(path).strip("/"):
+ try:
+ self.mkdir(self._parent(path), create_parents, **kwargs)
+ except FileExistsError:
+ pass
+ if path and path not in self.pseudo_dirs:
+ self.pseudo_dirs.append(path)
+
+ def makedirs(self, path, exist_ok=False):
+ try:
+ self.mkdir(path, create_parents=True)
+ except FileExistsError:
+ if not exist_ok:
+ raise
def pipe_file(self, path, value, **kwargs):
"""Set the bytes of given file
Avoids copies of the data if possible
"""
- pass
+ self.open(path, "wb", data=value)
+
+ def rmdir(self, path):
+ path = self._strip_protocol(path)
+ if path == "":
+ # silently avoid deleting FS root
+ return
+ if path in self.pseudo_dirs:
+ if not self.ls(path):
+ self.pseudo_dirs.remove(path)
+ else:
+ raise OSError(ENOTEMPTY, "Directory not empty", path)
+ else:
+ raise FileNotFoundError(path)
+
+ def info(self, path, **kwargs):
+ logger.debug("info: %s", path)
+ path = self._strip_protocol(path)
+ if path in self.pseudo_dirs or any(
+ p.startswith(path + "/") for p in list(self.store) + self.pseudo_dirs
+ ):
+ return {
+ "name": path,
+ "size": 0,
+ "type": "directory",
+ }
+ elif path in self.store:
+ filelike = self.store[path]
+ return {
+ "name": path,
+ "size": filelike.size,
+ "type": "file",
+ "created": getattr(filelike, "created", None),
+ }
+ else:
+ raise FileNotFoundError(path)
+
+ def _open(
+ self,
+ path,
+ mode="rb",
+ block_size=None,
+ autocommit=True,
+ cache_options=None,
+ **kwargs,
+ ):
+ path = self._strip_protocol(path)
+ if path in self.pseudo_dirs:
+ raise IsADirectoryError(path)
+ parent = path
+ while len(parent) > 1:
+ parent = self._parent(parent)
+ if self.isfile(parent):
+ raise FileExistsError(parent)
+ if mode in ["rb", "ab", "r+b"]:
+ if path in self.store:
+ f = self.store[path]
+ if mode == "ab":
+ # position at the end of file
+ f.seek(0, 2)
+ else:
+ # position at the beginning of file
+ f.seek(0)
+ return f
+ else:
+ raise FileNotFoundError(path)
+ elif mode == "wb":
+ m = MemoryFile(self, path, kwargs.get("data"))
+ if not self._intrans:
+ m.commit()
+ return m
+ else:
+ name = self.__class__.__name__
+ raise ValueError(f"unsupported file mode for {name}: {mode!r}")
+
+ def cp_file(self, path1, path2, **kwargs):
+ path1 = self._strip_protocol(path1)
+ path2 = self._strip_protocol(path2)
+ if self.isfile(path1):
+ self.store[path2] = MemoryFile(
+ self, path2, self.store[path1].getvalue()
+ ) # implicit copy
+ elif self.isdir(path1):
+ if path2 not in self.pseudo_dirs:
+ self.pseudo_dirs.append(path2)
+ else:
+ raise FileNotFoundError(path1)
+
+ def cat_file(self, path, start=None, end=None, **kwargs):
+ logger.debug("cat: %s", path)
+ path = self._strip_protocol(path)
+ try:
+ return bytes(self.store[path].getbuffer()[start:end])
+ except KeyError:
+ raise FileNotFoundError(path)
+
+ def _rm(self, path):
+ path = self._strip_protocol(path)
+ try:
+ del self.store[path]
+ except KeyError as e:
+ raise FileNotFoundError(path) from e
+
+ def modified(self, path):
+ path = self._strip_protocol(path)
+ try:
+ return self.store[path].modified
+ except KeyError:
+ raise FileNotFoundError(path)
+
+ def created(self, path):
+ path = self._strip_protocol(path)
+ try:
+ return self.store[path].created
+ except KeyError:
+ raise FileNotFoundError(path)
+
+ def rm(self, path, recursive=False, maxdepth=None):
+ if isinstance(path, str):
+ path = self._strip_protocol(path)
+ else:
+ path = [self._strip_protocol(p) for p in path]
+ paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
+ for p in reversed(paths):
+ # If the expanded path doesn't exist, it is only because the expanded
+ # path was a directory that does not exist in self.pseudo_dirs. This
+ # is possible if you directly create files without making the
+ # directories first.
+ if not self.exists(p):
+ continue
+ if self.isfile(p):
+ self.rm_file(p)
+ else:
+ self.rmdir(p)
class MemoryFile(BytesIO):
@@ -39,7 +276,7 @@ class MemoryFile(BytesIO):
"""
def __init__(self, fs=None, path=None, data=None):
- logger.debug('open file %s', path)
+ logger.debug("open file %s", path)
self.fs = fs
self.path = path
self.created = datetime.now(tz=timezone.utc)
@@ -48,5 +285,19 @@ class MemoryFile(BytesIO):
super().__init__(data)
self.seek(0)
+ @property
+ def size(self):
+ return self.getbuffer().nbytes
+
def __enter__(self):
return self
+
+ def close(self):
+ pass
+
+ def discard(self):
+ pass
+
+ def commit(self):
+ self.fs.store[self.path] = self
+ self.modified = datetime.now(tz=timezone.utc)
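
A brief sketch of the in-memory semantics implemented above; the store is class-level, so every "memory" instance in a process sees the same files.

```python
import fsspec

fs = fsspec.filesystem("memory")

fs.pipe_file("/data/a.txt", b"hello")   # stored as a MemoryFile in the global store
print(fs.cat_file("/data/a.txt"))       # b"hello"
print(fs.ls("/data", detail=False))     # ["/data/a.txt"]
print(fs.info("/data")["type"])         # "directory" (implied by the key prefix)

fs.rm("/data", recursive=True)
print(fs.exists("/data/a.txt"))         # False
```
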
diff --git a/fsspec/implementations/reference.py b/fsspec/implementations/reference.py
index 608bb67..981e698 100644
--- a/fsspec/implementations/reference.py
+++ b/fsspec/implementations/reference.py
@@ -7,34 +7,54 @@ import math
import os
from functools import lru_cache
from typing import TYPE_CHECKING
+
import fsspec.core
+
try:
import ujson as json
except ImportError:
if not TYPE_CHECKING:
import json
+
from ..asyn import AsyncFileSystem
from ..callbacks import DEFAULT_CALLBACK
from ..core import filesystem, open, split_protocol
from ..utils import isfilelike, merge_offset_ranges, other_paths
-logger = logging.getLogger('fsspec.reference')
+logger = logging.getLogger("fsspec.reference")
-class ReferenceNotReachable(RuntimeError):
+class ReferenceNotReachable(RuntimeError):
def __init__(self, reference, target, *args):
super().__init__(*args)
self.reference = reference
self.target = target
def __str__(self):
- return (
- f'Reference "{self.reference}" failed to fetch target {self.target}'
- )
+ return f'Reference "{self.reference}" failed to fetch target {self.target}'
-class RefsValuesView(collections.abc.ValuesView):
+def _first(d):
+ return list(d.values())[0]
+
+def _prot_in_references(path, references):
+ ref = references.get(path)
+ if isinstance(ref, (list, tuple)):
+ return split_protocol(ref[0])[0] if ref[0] else ref[0]
+
+
+def _protocol_groups(paths, references):
+ if isinstance(paths, str):
+ return {_prot_in_references(paths, references): [paths]}
+ out = {}
+ for path in paths:
+ protocol = _prot_in_references(path, references)
+ out.setdefault(protocol, []).append(path)
+ return out
+
+
+class RefsValuesView(collections.abc.ValuesView):
def __iter__(self):
for val in self._mapping.zmetadata.values():
yield json.dumps(val).encode()
@@ -42,17 +62,25 @@ class RefsValuesView(collections.abc.ValuesView):
for field in self._mapping.listdir():
chunk_sizes = self._mapping._get_chunk_sizes(field)
if len(chunk_sizes) == 0:
- yield self._mapping[field + '/0']
+ yield self._mapping[field + "/0"]
continue
yield from self._mapping._generate_all_records(field)
class RefsItemsView(collections.abc.ItemsView):
-
def __iter__(self):
return zip(self._mapping.keys(), self._mapping.values())
+def ravel_multi_index(idx, sizes):
+ val = 0
+ mult = 1
+ for i, s in zip(idx[::-1], sizes[::-1]):
+ val += i * mult
+ mult *= s
+ return val
+
+
class LazyReferenceMapper(collections.abc.MutableMapping):
"""This interface can be used to read/write references from Parquet stores.
It is not intended for other types of references.
@@ -61,8 +89,22 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
Examples of this use-case can be found here:
https://fsspec.github.io/kerchunk/advanced.html?highlight=parquet#parquet-storage"""
- def __init__(self, root, fs=None, out_root=None, cache_size=128,
- categorical_threshold=10):
+ # import is class level to prevent numpy dep requirement for fsspec
+ @property
+ def np(self):
+ import numpy as np
+
+ return np
+
+ @property
+ def pd(self):
+ import pandas as pd
+
+ return pd
+
+ def __init__(
+ self, root, fs=None, out_root=None, cache_size=128, categorical_threshold=10
+ ):
"""
This instance will be writable, storing changes in memory until full partitions
@@ -90,18 +132,40 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
self.cat_thresh = categorical_threshold
self.cache_size = cache_size
self.dirs = None
- self.url = self.root + '/{field}/refs.{record}.parq'
- self.fs = fsspec.filesystem('file') if fs is None else fs
+ self.url = self.root + "/{field}/refs.{record}.parq"
+ # TODO: derive fs from `root`
+ self.fs = fsspec.filesystem("file") if fs is None else fs
def __getattr__(self, item):
- if item in ('_items', 'record_size', 'zmetadata'):
+ if item in ("_items", "record_size", "zmetadata"):
self.setup()
+ # avoid possible recursion if setup fails somehow
return self.__dict__[item]
raise AttributeError(item)
+ def setup(self):
+ self._items = {}
+ self._items[".zmetadata"] = self.fs.cat_file(
+ "/".join([self.root, ".zmetadata"])
+ )
+ met = json.loads(self._items[".zmetadata"])
+ self.record_size = met["record_size"]
+ self.zmetadata = met["metadata"]
+
+ # Define function to open and decompress refs
+ @lru_cache(maxsize=self.cache_size)
+ def open_refs(field, record):
+ """cached parquet file loader"""
+ path = self.url.format(field=field, record=record)
+ data = io.BytesIO(self.fs.cat_file(path))
+ df = self.pd.read_parquet(data, engine="fastparquet")
+ refs = {c: df[c].values for c in df.columns}
+ return refs
+
+ self.open_refs = open_refs
+
@staticmethod
- def create(root, storage_options=None, fs=None, record_size=10000, **kwargs
- ):
+ def create(root, storage_options=None, fs=None, record_size=10000, **kwargs):
"""Make empty parquet reference set
First deletes the contents of the given directory, if it exists.
@@ -122,39 +186,177 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
-------
LazyReferenceMapper instance
"""
- pass
+ met = {"metadata": {}, "record_size": record_size}
+ if fs is None:
+ fs, root = fsspec.core.url_to_fs(root, **(storage_options or {}))
+ if fs.exists(root):
+ fs.rm(root, recursive=True)
+ fs.makedirs(root, exist_ok=True)
+ fs.pipe("/".join([root, ".zmetadata"]), json.dumps(met).encode())
+ return LazyReferenceMapper(root, fs, **kwargs)
def listdir(self, basename=True):
"""List top-level directories"""
- pass
+ # cache me?
+ if self.dirs is None:
+ dirs = [p.split("/", 1)[0] for p in self.zmetadata]
+ self.dirs = {p for p in dirs if p and not p.startswith(".")}
+ listing = self.dirs
+ if basename:
+ listing = [os.path.basename(path) for path in listing]
+ return listing
- def ls(self, path='', detail=True):
+ def ls(self, path="", detail=True):
"""Shortcut file listings"""
- pass
+ if not path:
+ dirnames = self.listdir()
+ others = set(
+ [".zmetadata"]
+ + [name for name in self.zmetadata if "/" not in name]
+ + [name for name in self._items if "/" not in name]
+ )
+ if detail is False:
+ others.update(dirnames)
+ return sorted(others)
+ dirinfo = [
+ {"name": name, "type": "directory", "size": 0} for name in dirnames
+ ]
+ fileinfo = [
+ {
+ "name": name,
+ "type": "file",
+ "size": len(
+ json.dumps(self.zmetadata[name])
+ if name in self.zmetadata
+ else self._items[name]
+ ),
+ }
+ for name in others
+ ]
+ return sorted(dirinfo + fileinfo, key=lambda s: s["name"])
+ parts = path.split("/", 1)
+ if len(parts) > 1:
+ raise FileNotFoundError("Cannot list within directories right now")
+ field = parts[0]
+ others = set(
+ [name for name in self.zmetadata if name.startswith(f"{path}/")]
+ + [name for name in self._items if name.startswith(f"{path}/")]
+ )
+ fileinfo = [
+ {
+ "name": name,
+ "type": "file",
+ "size": len(
+ json.dumps(self.zmetadata[name])
+ if name in self.zmetadata
+ else self._items[name]
+ ),
+ }
+ for name in others
+ ]
+ keys = self._keys_in_field(field)
+
+ if detail is False:
+ return list(others) + list(keys)
+ recs = self._generate_all_records(field)
+ recinfo = [
+ {"name": name, "type": "file", "size": rec[-1]}
+ for name, rec in zip(keys, recs)
+ if rec[0] # filters out path==None, deleted/missing
+ ]
+ return fileinfo + recinfo
def _load_one_key(self, key):
"""Get the reference for one key
Returns bytes, one-element list or three-element list.
"""
- pass
+ if key in self._items:
+ return self._items[key]
+ elif key in self.zmetadata:
+ return json.dumps(self.zmetadata[key]).encode()
+ elif "/" not in key or self._is_meta(key):
+ raise KeyError(key)
+ field, _ = key.rsplit("/", 1)
+ record, ri, chunk_size = self._key_to_record(key)
+ maybe = self._items.get((field, record), {}).get(ri, False)
+ if maybe is None:
+ # explicitly deleted
+ raise KeyError
+ elif maybe:
+ return maybe
+ elif chunk_size == 0:
+ return b""
+
+ # Chunk keys can be loaded from row group and cached in LRU cache
+ try:
+ refs = self.open_refs(field, record)
+ except (ValueError, TypeError, FileNotFoundError):
+ raise KeyError(key)
+ columns = ["path", "offset", "size", "raw"]
+ selection = [refs[c][ri] if c in refs else None for c in columns]
+ raw = selection[-1]
+ if raw is not None:
+ return raw
+ if selection[0] is None:
+ raise KeyError("This reference does not exist or has been deleted")
+ if selection[1:3] == [0, 0]:
+ # URL only
+ return selection[:1]
+ # URL, offset, size
+ return selection[:3]
@lru_cache(4096)
def _key_to_record(self, key):
"""Details needed to construct a reference for one key"""
- pass
+ field, chunk = key.rsplit("/", 1)
+ chunk_sizes = self._get_chunk_sizes(field)
+ if len(chunk_sizes) == 0:
+ return 0, 0, 0
+ chunk_idx = [int(c) for c in chunk.split(".")]
+ chunk_number = ravel_multi_index(chunk_idx, chunk_sizes)
+ record = chunk_number // self.record_size
+ ri = chunk_number % self.record_size
+ return record, ri, len(chunk_sizes)
def _get_chunk_sizes(self, field):
"""The number of chunks along each axis for a given field"""
- pass
+ if field not in self.chunk_sizes:
+ zarray = self.zmetadata[f"{field}/.zarray"]
+ size_ratio = [
+ math.ceil(s / c) for s, c in zip(zarray["shape"], zarray["chunks"])
+ ]
+ self.chunk_sizes[field] = size_ratio or [1]
+ return self.chunk_sizes[field]
def _generate_record(self, field, record):
"""The references for a given parquet file of a given field"""
- pass
+ refs = self.open_refs(field, record)
+ it = iter(zip(*refs.values()))
+ if len(refs) == 3:
+ # All urls
+ return (list(t) for t in it)
+ elif len(refs) == 1:
+ # All raws
+ return refs["raw"]
+ else:
+ # Mix of urls and raws
+ return (list(t[:3]) if not t[3] else t[3] for t in it)
def _generate_all_records(self, field):
"""Load all the references within a field by iterating over the parquet files"""
- pass
+ nrec = 1
+ for ch in self._get_chunk_sizes(field):
+ nrec *= ch
+ nrec = math.ceil(nrec / self.record_size)
+ for record in range(nrec):
+ yield from self._generate_record(field, record)
+
+ def values(self):
+ return RefsValuesView(self)
+
+ def items(self):
+ return RefsItemsView(self)
def __hash__(self):
return id(self)
@@ -163,33 +365,117 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
return self._load_one_key(key)
def __setitem__(self, key, value):
- if '/' in key and not self._is_meta(key):
- field, chunk = key.rsplit('/', 1)
+ if "/" in key and not self._is_meta(key):
+ field, chunk = key.rsplit("/", 1)
record, i, _ = self._key_to_record(key)
subdict = self._items.setdefault((field, record), {})
subdict[i] = value
if len(subdict) == self.record_size:
self.write(field, record)
else:
+ # metadata or top-level
self._items[key] = value
- new_value = json.loads(value.decode() if isinstance(value,
- bytes) else value)
+ new_value = json.loads(
+ value.decode() if isinstance(value, bytes) else value
+ )
self.zmetadata[key] = {**self.zmetadata.get(key, {}), **new_value}
+ @staticmethod
+ def _is_meta(key):
+ return key.startswith(".z") or "/.z" in key
+
def __delitem__(self, key):
if key in self._items:
del self._items[key]
elif key in self.zmetadata:
del self.zmetadata[key]
- elif '/' in key and not self._is_meta(key):
- field, _ = key.rsplit('/', 1)
- record, i, _ = self._key_to_record(key)
- subdict = self._items.setdefault((field, record), {})
- subdict[i] = None
- if len(subdict) == self.record_size:
- self.write(field, record)
else:
- self._items[key] = None
+ if "/" in key and not self._is_meta(key):
+ field, _ = key.rsplit("/", 1)
+ record, i, _ = self._key_to_record(key)
+ subdict = self._items.setdefault((field, record), {})
+ subdict[i] = None
+ if len(subdict) == self.record_size:
+ self.write(field, record)
+ else:
+ # metadata or top-level
+ self._items[key] = None
+
+ def write(self, field, record, base_url=None, storage_options=None):
+ # extra requirements if writing
+ import kerchunk.df
+ import numpy as np
+ import pandas as pd
+
+ partition = self._items[(field, record)]
+ original = False
+ if len(partition) < self.record_size:
+ try:
+ original = self.open_refs(field, record)
+ except IOError:
+ pass
+
+ if original:
+ paths = original["path"]
+ offsets = original["offset"]
+ sizes = original["size"]
+ raws = original["raw"]
+ else:
+ paths = np.full(self.record_size, np.nan, dtype="O")
+ offsets = np.zeros(self.record_size, dtype="int64")
+ sizes = np.zeros(self.record_size, dtype="int64")
+ raws = np.full(self.record_size, np.nan, dtype="O")
+ for j, data in partition.items():
+ if isinstance(data, list):
+ if (
+ str(paths.dtype) == "category"
+ and data[0] not in paths.dtype.categories
+ ):
+ paths = paths.add_categories(data[0])
+ paths[j] = data[0]
+ if len(data) > 1:
+ offsets[j] = data[1]
+ sizes[j] = data[2]
+ elif data is None:
+ # delete
+ paths[j] = None
+ offsets[j] = 0
+ sizes[j] = 0
+ raws[j] = None
+ else:
+ # this is the only call into kerchunk, could remove
+ raws[j] = kerchunk.df._proc_raw(data)
+ # TODO: only save needed columns
+ df = pd.DataFrame(
+ {
+ "path": paths,
+ "offset": offsets,
+ "size": sizes,
+ "raw": raws,
+ },
+ copy=False,
+ )
+ if df.path.count() / (df.path.nunique() or 1) > self.cat_thresh:
+ df["path"] = df["path"].astype("category")
+ object_encoding = {"raw": "bytes", "path": "utf8"}
+ has_nulls = ["path", "raw"]
+
+ fn = f"{base_url or self.out_root}/{field}/refs.{record}.parq"
+ self.fs.mkdirs(f"{base_url or self.out_root}/{field}", exist_ok=True)
+ df.to_parquet(
+ fn,
+ engine="fastparquet",
+ storage_options=storage_options
+ or getattr(self.fs, "storage_options", None),
+ compression="zstd",
+ index=False,
+ stats=False,
+ object_encoding=object_encoding,
+ has_nulls=has_nulls,
+ # **kwargs,
+ )
+ partition.clear()
+ self._items.pop((field, record))
def flush(self, base_url=None, storage_options=None):
"""Output any modified or deleted keys
@@ -199,20 +485,47 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
base_url: str
Location of the output
"""
- pass
+ # write what we have so far and clear sub chunks
+ for thing in list(self._items):
+ if isinstance(thing, tuple):
+ field, record = thing
+ self.write(
+ field,
+ record,
+ base_url=base_url,
+ storage_options=storage_options,
+ )
+
+ # gather .zmetadata from self._items and write that too
+ for k in list(self._items):
+ if k != ".zmetadata" and ".z" in k:
+ self.zmetadata[k] = json.loads(self._items.pop(k))
+ met = {"metadata": self.zmetadata, "record_size": self.record_size}
+ self._items[".zmetadata"] = json.dumps(met).encode()
+ self.fs.pipe(
+ "/".join([base_url or self.out_root, ".zmetadata"]),
+ self._items[".zmetadata"],
+ )
+
+ # TODO: only clear those that we wrote to?
+ self.open_refs.cache_clear()
def __len__(self):
+ # Caveat: This counts expected references, not actual - but is fast
count = 0
for field in self.listdir():
- if field.startswith('.'):
+ if field.startswith("."):
count += 1
else:
count += math.prod(self._get_chunk_sizes(field))
- count += len(self.zmetadata)
+ count += len(self.zmetadata) # all metadata keys
+ # any other files not in reference partitions
count += sum(1 for _ in self._items if not isinstance(_, tuple))
return count
def __iter__(self):
+ # Caveat: returns only existing keys, so the number of these does not
+ # match len(self)
metas = set(self.zmetadata)
metas.update(self._items)
for bit in metas:
@@ -235,7 +548,13 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
Produces strings like "field/x.y" appropriate from the chunking of the array
"""
- pass
+ chunk_sizes = self._get_chunk_sizes(field)
+ if len(chunk_sizes) == 0:
+ yield field + "/0"
+ return
+ inds = itertools.product(*(range(i) for i in chunk_sizes))
+ for ind in inds:
+ yield field + "/" + ".".join([str(c) for c in ind])
class ReferenceFileSystem(AsyncFileSystem):
@@ -253,13 +572,26 @@ class ReferenceFileSystem(AsyncFileSystem):
{path0: bytes_data, path1: (target_url, offset, size)}
https://github.com/fsspec/kerchunk/blob/main/README.md
"""
- protocol = 'reference'
- def __init__(self, fo, target=None, ref_storage_args=None,
- target_protocol=None, target_options=None, remote_protocol=None,
- remote_options=None, fs=None, template_overrides=None,
- simple_templates=True, max_gap=64000, max_block=256000000,
- cache_size=128, **kwargs):
+ protocol = "reference"
+
+ def __init__(
+ self,
+ fo,
+ target=None,
+ ref_storage_args=None,
+ target_protocol=None,
+ target_options=None,
+ remote_protocol=None,
+ remote_options=None,
+ fs=None,
+ template_overrides=None,
+ simple_templates=True,
+ max_gap=64_000,
+ max_block=256_000_000,
+ cache_size=128,
+ **kwargs,
+ ):
"""
Parameters
----------
@@ -326,61 +658,512 @@ class ReferenceFileSystem(AsyncFileSystem):
self.max_gap = max_gap
self.max_block = max_block
if isinstance(fo, str):
- dic = dict(**ref_storage_args or target_options or {}, protocol
- =target_protocol)
+ dic = dict(
+ **(ref_storage_args or target_options or {}), protocol=target_protocol
+ )
ref_fs, fo2 = fsspec.core.url_to_fs(fo, **dic)
if ref_fs.isfile(fo2):
- with fsspec.open(fo, 'rb', **dic) as f:
- logger.info('Read reference from URL %s', fo)
+ # text JSON
+ with fsspec.open(fo, "rb", **dic) as f:
+ logger.info("Read reference from URL %s", fo)
text = json.load(f)
self._process_references(text, template_overrides)
else:
- logger.info('Open lazy reference dict from URL %s', fo)
- self.references = LazyReferenceMapper(fo2, fs=ref_fs,
- cache_size=cache_size)
+ # Lazy parquet refs
+ logger.info("Open lazy reference dict from URL %s", fo)
+ self.references = LazyReferenceMapper(
+ fo2,
+ fs=ref_fs,
+ cache_size=cache_size,
+ )
else:
+ # dictionaries
self._process_references(fo, template_overrides)
if isinstance(fs, dict):
- self.fss = {k: (fsspec.filesystem(k.split(':', 1)[0], **opts) if
- isinstance(opts, dict) else opts) for k, opts in fs.items()}
+ self.fss = {
+ k: (
+ fsspec.filesystem(k.split(":", 1)[0], **opts)
+ if isinstance(opts, dict)
+ else opts
+ )
+ for k, opts in fs.items()
+ }
if None not in self.fss:
- self.fss[None] = filesystem('file')
+ self.fss[None] = filesystem("file")
return
if fs is not None:
- remote_protocol = fs.protocol[0] if isinstance(fs.protocol, tuple
- ) else fs.protocol
+ # single remote FS
+ remote_protocol = (
+ fs.protocol[0] if isinstance(fs.protocol, tuple) else fs.protocol
+ )
self.fss[remote_protocol] = fs
+
if remote_protocol is None:
+ # get single protocol from any templates
for ref in self.templates.values():
if callable(ref):
ref = ref()
protocol, _ = fsspec.core.split_protocol(ref)
if protocol and protocol not in self.fss:
- fs = filesystem(protocol, **remote_options or {})
+ fs = filesystem(protocol, **(remote_options or {}))
self.fss[protocol] = fs
if remote_protocol is None:
+ # get single protocol from references
+ # TODO: warning here, since this can be very expensive?
for ref in self.references.values():
if callable(ref):
ref = ref()
if isinstance(ref, list) and ref[0]:
protocol, _ = fsspec.core.split_protocol(ref[0])
if protocol not in self.fss:
- fs = filesystem(protocol, **remote_options or {})
+ fs = filesystem(protocol, **(remote_options or {}))
self.fss[protocol] = fs
+ # only use first remote URL
break
+
if remote_protocol and remote_protocol not in self.fss:
- fs = filesystem(remote_protocol, **remote_options or {})
+ fs = filesystem(remote_protocol, **(remote_options or {}))
self.fss[remote_protocol] = fs
- self.fss[None] = fs or filesystem('file')
+
+ self.fss[None] = fs or filesystem("file") # default one
+
+ def _cat_common(self, path, start=None, end=None):
+ path = self._strip_protocol(path)
+ logger.debug(f"cat: {path}")
+ try:
+ part = self.references[path]
+ except KeyError:
+ raise FileNotFoundError(path)
+ if isinstance(part, str):
+ part = part.encode()
+ if isinstance(part, bytes):
+ logger.debug(f"Reference: {path}, type bytes")
+ if part.startswith(b"base64:"):
+ part = base64.b64decode(part[7:])
+ return part, None, None
+
+ if len(part) == 1:
+ logger.debug(f"Reference: {path}, whole file => {part}")
+ url = part[0]
+ start1, end1 = start, end
+ else:
+ url, start0, size = part
+ logger.debug(f"Reference: {path} => {url}, offset {start0}, size {size}")
+ end0 = start0 + size
+
+ if start is not None:
+ if start >= 0:
+ start1 = start0 + start
+ else:
+ start1 = end0 + start
+ else:
+ start1 = start0
+ if end is not None:
+ if end >= 0:
+ end1 = start0 + end
+ else:
+ end1 = end0 + end
+ else:
+ end1 = end0
+ if url is None:
+ url = self.target
+ return url, start1, end1
+
+ async def _cat_file(self, path, start=None, end=None, **kwargs):
+ part_or_url, start0, end0 = self._cat_common(path, start=start, end=end)
+ if isinstance(part_or_url, bytes):
+ return part_or_url[start:end]
+ protocol, _ = split_protocol(part_or_url)
+ try:
+            return await self.fss[protocol]._cat_file(
+                part_or_url, start=start0, end=end0
+            )
+ except Exception as e:
+ raise ReferenceNotReachable(path, part_or_url) from e
+
+ def cat_file(self, path, start=None, end=None, **kwargs):
+ part_or_url, start0, end0 = self._cat_common(path, start=start, end=end)
+ if isinstance(part_or_url, bytes):
+ return part_or_url[start:end]
+ protocol, _ = split_protocol(part_or_url)
+ try:
+ return self.fss[protocol].cat_file(part_or_url, start=start0, end=end0)
+ except Exception as e:
+ raise ReferenceNotReachable(path, part_or_url) from e
def pipe_file(self, path, value, **_):
"""Temporarily add binary data or reference as a file"""
- pass
+ self.references[path] = value
+
+ async def _get_file(self, rpath, lpath, **kwargs):
+ if self.isdir(rpath):
+ return os.makedirs(lpath, exist_ok=True)
+ data = await self._cat_file(rpath)
+ with open(lpath, "wb") as f:
+ f.write(data)
+
+ def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, **kwargs):
+ if self.isdir(rpath):
+ return os.makedirs(lpath, exist_ok=True)
+ data = self.cat_file(rpath, **kwargs)
+ callback.set_size(len(data))
+ if isfilelike(lpath):
+ lpath.write(data)
+ else:
+ with open(lpath, "wb") as f:
+ f.write(data)
+ callback.absolute_update(len(data))
+
+ def get(self, rpath, lpath, recursive=False, **kwargs):
+ if recursive:
+ # trigger directory build
+ self.ls("")
+ rpath = self.expand_path(rpath, recursive=recursive)
+ fs = fsspec.filesystem("file", auto_mkdir=True)
+ targets = other_paths(rpath, lpath)
+ if recursive:
+ data = self.cat([r for r in rpath if not self.isdir(r)])
+ else:
+ data = self.cat(rpath)
+ for remote, local in zip(rpath, targets):
+ if remote in data:
+ fs.pipe_file(local, data[remote])
+
+ def cat(self, path, recursive=False, on_error="raise", **kwargs):
+ if isinstance(path, str) and recursive:
+ raise NotImplementedError
+ if isinstance(path, list) and (recursive or any("*" in p for p in path)):
+ raise NotImplementedError
+ # TODO: if references is lazy, pre-fetch all paths in batch before access
+ proto_dict = _protocol_groups(path, self.references)
+ out = {}
+ for proto, paths in proto_dict.items():
+ fs = self.fss[proto]
+ urls, starts, ends, valid_paths = [], [], [], []
+ for p in paths:
+ # find references or label not-found. Early exit if any not
+ # found and on_error is "raise"
+ try:
+ u, s, e = self._cat_common(p)
+ except FileNotFoundError as err:
+ if on_error == "raise":
+ raise
+ if on_error != "omit":
+ out[p] = err
+ else:
+ urls.append(u)
+ starts.append(s)
+ ends.append(e)
+ valid_paths.append(p)
+
+ # process references into form for merging
+ urls2 = []
+ starts2 = []
+ ends2 = []
+ paths2 = []
+ whole_files = set()
+ for u, s, e, p in zip(urls, starts, ends, valid_paths):
+ if isinstance(u, bytes):
+ # data
+ out[p] = u
+ elif s is None:
+ # whole file - limits are None, None, but no further
+ # entries take for this file
+ whole_files.add(u)
+ urls2.append(u)
+ starts2.append(s)
+ ends2.append(e)
+ paths2.append(p)
+ for u, s, e, p in zip(urls, starts, ends, valid_paths):
+ # second run to account for files that are to be loaded whole
+ if s is not None and u not in whole_files:
+ urls2.append(u)
+ starts2.append(s)
+ ends2.append(e)
+ paths2.append(p)
+
+ # merge and fetch consolidated ranges
+ new_paths, new_starts, new_ends = merge_offset_ranges(
+ list(urls2),
+ list(starts2),
+ list(ends2),
+ sort=True,
+ max_gap=self.max_gap,
+ max_block=self.max_block,
+ )
+ bytes_out = fs.cat_ranges(new_paths, new_starts, new_ends)
+
+ # unbundle from merged bytes - simple approach
+ for u, s, e, p in zip(urls, starts, ends, valid_paths):
+ if p in out:
+ continue # was bytes, already handled
+ for np, ns, ne, b in zip(new_paths, new_starts, new_ends, bytes_out):
+ if np == u and (ns is None or ne is None):
+ if isinstance(b, Exception):
+ out[p] = b
+ else:
+ out[p] = b[s:e]
+ elif np == u and s >= ns and e <= ne:
+ if isinstance(b, Exception):
+ out[p] = b
+ else:
+ out[p] = b[s - ns : (e - ne) or None]
+
+ for k, v in out.copy().items():
+ # these were valid references, but fetch failed, so transform exc
+ if isinstance(v, Exception) and k in self.references:
+ ex = out[k]
+ new_ex = ReferenceNotReachable(k, self.references[k])
+ new_ex.__cause__ = ex
+ if on_error == "raise":
+ raise new_ex
+ elif on_error != "omit":
+ out[k] = new_ex
+
+ if len(out) == 1 and isinstance(path, str) and "*" not in path:
+ return _first(out)
+ return out
+
+ def _process_references(self, references, template_overrides=None):
+ vers = references.get("version", None)
+ if vers is None:
+ self._process_references0(references)
+ elif vers == 1:
+ self._process_references1(references, template_overrides=template_overrides)
+ else:
+ raise ValueError(f"Unknown reference spec version: {vers}")
+ # TODO: we make dircache by iterating over all entries, but for Spec >= 1,
+ # can replace with programmatic. Is it even needed for mapper interface?
def _process_references0(self, references):
"""Make reference dict for Spec Version 0"""
- pass
+ if isinstance(references, dict):
+ # do not do this for lazy/parquet backend, which will not make dicts,
+ # but must remain writable in the original object
+ references = {
+ key: json.dumps(val) if isinstance(val, dict) else val
+ for key, val in references.items()
+ }
+ self.references = references
+
+ def _process_references1(self, references, template_overrides=None):
+ if not self.simple_templates or self.templates:
+ import jinja2
+ self.references = {}
+ self._process_templates(references.get("templates", {}))
+
+ @lru_cache(1000)
+ def _render_jinja(u):
+ return jinja2.Template(u).render(**self.templates)
+
+ for k, v in references.get("refs", {}).items():
+ if isinstance(v, str):
+ if v.startswith("base64:"):
+ self.references[k] = base64.b64decode(v[7:])
+ self.references[k] = v
+ elif isinstance(v, dict):
+ self.references[k] = json.dumps(v)
+ elif self.templates:
+ u = v[0]
+ if "{{" in u:
+ if self.simple_templates:
+ u = (
+ u.replace("{{", "{")
+ .replace("}}", "}")
+ .format(**self.templates)
+ )
+ else:
+ u = _render_jinja(u)
+ self.references[k] = [u] if len(v) == 1 else [u, v[1], v[2]]
+ else:
+ self.references[k] = v
+ self.references.update(self._process_gen(references.get("gen", [])))
+
+ def _process_templates(self, tmp):
+ self.templates = {}
+ if self.template_overrides is not None:
+ tmp.update(self.template_overrides)
+ for k, v in tmp.items():
+ if "{{" in v:
+ import jinja2
+
+ self.templates[k] = lambda temp=v, **kwargs: jinja2.Template(
+ temp
+ ).render(**kwargs)
+ else:
+ self.templates[k] = v
+
+ def _process_gen(self, gens):
+ out = {}
+ for gen in gens:
+ dimension = {
+ k: v
+ if isinstance(v, list)
+ else range(v.get("start", 0), v["stop"], v.get("step", 1))
+ for k, v in gen["dimensions"].items()
+ }
+ products = (
+ dict(zip(dimension.keys(), values))
+ for values in itertools.product(*dimension.values())
+ )
+ for pr in products:
+ import jinja2
+
+ key = jinja2.Template(gen["key"]).render(**pr, **self.templates)
+ url = jinja2.Template(gen["url"]).render(**pr, **self.templates)
+ if ("offset" in gen) and ("length" in gen):
+ offset = int(
+ jinja2.Template(gen["offset"]).render(**pr, **self.templates)
+ )
+ length = int(
+ jinja2.Template(gen["length"]).render(**pr, **self.templates)
+ )
+ out[key] = [url, offset, length]
+ elif ("offset" in gen) ^ ("length" in gen):
+ raise ValueError(
+ "Both 'offset' and 'length' are required for a "
+ "reference generator entry if either is provided."
+ )
+ else:
+ out[key] = [url]
+ return out
+
+ def _dircache_from_items(self):
+ self.dircache = {"": []}
+ it = self.references.items()
+ for path, part in it:
+ if isinstance(part, (bytes, str)):
+ size = len(part)
+ elif len(part) == 1:
+ size = None
+ else:
+ _, _, size = part
+ par = path.rsplit("/", 1)[0] if "/" in path else ""
+ par0 = par
+ subdirs = [par0]
+ while par0 and par0 not in self.dircache:
+ # collect parent directories
+ par0 = self._parent(par0)
+ subdirs.append(par0)
+
+ subdirs.reverse()
+ for parent, child in zip(subdirs, subdirs[1:]):
+ # register newly discovered directories
+ assert child not in self.dircache
+ assert parent in self.dircache
+ self.dircache[parent].append(
+ {"name": child, "type": "directory", "size": 0}
+ )
+ self.dircache[child] = []
+
+ self.dircache[par].append({"name": path, "type": "file", "size": size})
+
+ def _open(self, path, mode="rb", block_size=None, cache_options=None, **kwargs):
+ data = self.cat_file(path) # load whole chunk into memory
+ return io.BytesIO(data)
+
+ def ls(self, path, detail=True, **kwargs):
+ path = self._strip_protocol(path)
+ if isinstance(self.references, LazyReferenceMapper):
+ try:
+ return self.references.ls(path, detail)
+ except KeyError:
+ pass
+ raise FileNotFoundError(f"'{path}' is not a known key")
+ if not self.dircache:
+ self._dircache_from_items()
+ out = self._ls_from_cache(path)
+ if out is None:
+ raise FileNotFoundError(path)
+ if detail:
+ return out
+ return [o["name"] for o in out]
+
+ def exists(self, path, **kwargs): # overwrite auto-sync version
+ return self.isdir(path) or self.isfile(path)
+
+ def isdir(self, path): # overwrite auto-sync version
+ if self.dircache:
+ return path in self.dircache
+ elif isinstance(self.references, LazyReferenceMapper):
+ return path in self.references.listdir("")
+ else:
+ # this may be faster than building dircache for single calls, but
+ # by looping will be slow for many calls; could cache it?
+ return any(_.startswith(f"{path}/") for _ in self.references)
+
+ def isfile(self, path): # overwrite auto-sync version
+ return path in self.references
+
+ async def _ls(self, path, detail=True, **kwargs): # calls fast sync code
+ return self.ls(path, detail, **kwargs)
+
+ def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
+ if withdirs:
+ return super().find(
+ path, maxdepth=maxdepth, withdirs=withdirs, detail=detail, **kwargs
+ )
+ if path:
+ path = self._strip_protocol(path)
+ r = sorted(k for k in self.references if k.startswith(path))
+ else:
+ r = sorted(self.references)
+ if detail:
+ if not self.dircache:
+ self._dircache_from_items()
+ return {k: self._ls_from_cache(k)[0] for k in r}
+ else:
+ return r
+
+ def info(self, path, **kwargs):
+ out = self.references.get(path)
+ if out is not None:
+ if isinstance(out, (str, bytes)):
+ # decode base64 here
+ return {"name": path, "type": "file", "size": len(out)}
+ elif len(out) > 1:
+ return {"name": path, "type": "file", "size": out[2]}
+ else:
+ out0 = [{"name": path, "type": "file", "size": None}]
+ else:
+ out = self.ls(path, True)
+ out0 = [o for o in out if o["name"] == path]
+ if not out0:
+ return {"name": path, "type": "directory", "size": 0}
+ if out0[0]["size"] is None:
+ # if this is a whole remote file, update size using remote FS
+ prot, _ = split_protocol(self.references[path][0])
+ out0[0]["size"] = self.fss[prot].size(self.references[path][0])
+ return out0[0]
+
+ async def _info(self, path, **kwargs): # calls fast sync code
+ return self.info(path)
+
+ async def _rm_file(self, path, **kwargs):
+ self.references.pop(
+ path, None
+ ) # ignores FileNotFound, just as well for directories
+ self.dircache.clear() # this is a bit heavy handed
+
+ async def _pipe_file(self, path, data):
+ # can be str or bytes
+ self.references[path] = data
+ self.dircache.clear() # this is a bit heavy handed
+
+ async def _put_file(self, lpath, rpath, **kwargs):
+ # puts binary
+ with open(lpath, "rb") as f:
+ self.references[rpath] = f.read()
+ self.dircache.clear() # this is a bit heavy handed
def save_json(self, url, **storage_options):
"""Write modified references into new location"""
- pass
+ out = {}
+ for k, v in self.references.items():
+ if isinstance(v, bytes):
+ try:
+ out[k] = v.decode("ascii")
+ except UnicodeDecodeError:
+ out[k] = (b"base64:" + base64.b64encode(v)).decode()
+ else:
+ out[k] = v
+ with fsspec.open(url, "wb", **storage_options) as f:
+ f.write(json.dumps({"version": 1, "refs": out}).encode())
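
As a quick sanity check of the reference-filesystem code restored above, the following sketch builds a tiny version-0 reference set against an in-memory target. The keys `whole` and `slice` and the `memory://data.bin` URL are invented for the demo and are not part of the patch.

```python
import fsspec

# Target file that the references point at.
mem = fsspec.filesystem("memory")
with mem.open("/data.bin", "wb") as f:
    f.write(b"0123456789")

# Version-0 references: a whole-file entry and an offset/length entry.
fs = fsspec.filesystem(
    "reference",
    fo={"whole": ["memory://data.bin"], "slice": ["memory://data.bin", 2, 4]},
    remote_protocol="memory",
)
assert fs.cat("whole") == b"0123456789"
assert fs.cat("slice") == b"2345"                     # bytes 2..6 of the target
assert sorted(fs.ls("", detail=False)) == ["slice", "whole"]
```
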
diff --git a/fsspec/implementations/sftp.py b/fsspec/implementations/sftp.py
index 95b7f25..77f7b37 100644
--- a/fsspec/implementations/sftp.py
+++ b/fsspec/implementations/sftp.py
@@ -4,10 +4,13 @@ import os
import types
import uuid
from stat import S_ISDIR, S_ISLNK
+
import paramiko
+
from .. import AbstractFileSystem
from ..utils import infer_storage_options
-logger = logging.getLogger('fsspec.sftp')
+
+logger = logging.getLogger("fsspec.sftp")
class SFTPFileSystem(AbstractFileSystem):
@@ -19,7 +22,8 @@ class SFTPFileSystem(AbstractFileSystem):
there is no way to tell if a path is relative, so all paths are assumed
to be absolute.
"""
- protocol = 'sftp', 'ssh'
+
+ protocol = "sftp", "ssh"
def __init__(self, host, **ssh_kwargs):
"""
@@ -38,15 +42,139 @@ class SFTPFileSystem(AbstractFileSystem):
if self._cached:
return
super().__init__(**ssh_kwargs)
- self.temppath = ssh_kwargs.pop('temppath', '/tmp')
+ self.temppath = ssh_kwargs.pop("temppath", "/tmp") # remote temp directory
self.host = host
self.ssh_kwargs = ssh_kwargs
self._connect()
- def _open(self, path, mode='rb', block_size=None, **kwargs):
+ def _connect(self):
+ logger.debug("Connecting to SFTP server %s", self.host)
+ self.client = paramiko.SSHClient()
+ self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+ self.client.connect(self.host, **self.ssh_kwargs)
+ self.ftp = self.client.open_sftp()
+
+ @classmethod
+ def _strip_protocol(cls, path):
+ return infer_storage_options(path)["path"]
+
+ @staticmethod
+ def _get_kwargs_from_urls(urlpath):
+ out = infer_storage_options(urlpath)
+ out.pop("path", None)
+ out.pop("protocol", None)
+ return out
+
+ def mkdir(self, path, create_parents=True, mode=511):
+ logger.debug("Creating folder %s", path)
+ if self.exists(path):
+ raise FileExistsError(f"File exists: {path}")
+
+ if create_parents:
+ self.makedirs(path)
+ else:
+ self.ftp.mkdir(path, mode)
+
+ def makedirs(self, path, exist_ok=False, mode=511):
+ if self.exists(path) and not exist_ok:
+ raise FileExistsError(f"File exists: {path}")
+
+ parts = path.split("/")
+ new_path = "/" if path[:1] == "/" else ""
+
+ for part in parts:
+ if part:
+ new_path = f"{new_path}/{part}" if new_path else part
+ if not self.exists(new_path):
+ self.ftp.mkdir(new_path, mode)
+
+ def rmdir(self, path):
+ logger.debug("Removing folder %s", path)
+ self.ftp.rmdir(path)
+
+ def info(self, path):
+ stat = self._decode_stat(self.ftp.stat(path))
+ stat["name"] = path
+ return stat
+
+ @staticmethod
+ def _decode_stat(stat, parent_path=None):
+ if S_ISDIR(stat.st_mode):
+ t = "directory"
+ elif S_ISLNK(stat.st_mode):
+ t = "link"
+ else:
+ t = "file"
+ out = {
+ "name": "",
+ "size": stat.st_size,
+ "type": t,
+ "uid": stat.st_uid,
+ "gid": stat.st_gid,
+ "time": datetime.datetime.fromtimestamp(
+ stat.st_atime, tz=datetime.timezone.utc
+ ),
+ "mtime": datetime.datetime.fromtimestamp(
+ stat.st_mtime, tz=datetime.timezone.utc
+ ),
+ }
+ if parent_path:
+ out["name"] = "/".join([parent_path.rstrip("/"), stat.filename])
+ return out
+
+ def ls(self, path, detail=False):
+ logger.debug("Listing folder %s", path)
+ stats = [self._decode_stat(stat, path) for stat in self.ftp.listdir_iter(path)]
+ if detail:
+ return stats
+ else:
+ paths = [stat["name"] for stat in stats]
+ return sorted(paths)
+
+ def put(self, lpath, rpath, callback=None, **kwargs):
+ logger.debug("Put file %s into %s", lpath, rpath)
+ self.ftp.put(lpath, rpath)
+
+ def get_file(self, rpath, lpath, **kwargs):
+ if self.isdir(rpath):
+ os.makedirs(lpath, exist_ok=True)
+ else:
+ self.ftp.get(self._strip_protocol(rpath), lpath)
+
+ def _open(self, path, mode="rb", block_size=None, **kwargs):
"""
block_size: int or None
If 0, no buffering, if 1, line buffering, if >1, buffer that many
bytes, if None use default from paramiko.
"""
- pass
+ logger.debug("Opening file %s", path)
+ if kwargs.get("autocommit", True) is False:
+ # writes to temporary file, move on commit
+ path2 = "/".join([self.temppath, str(uuid.uuid4())])
+ f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1)
+ f.temppath = path2
+ f.targetpath = path
+ f.fs = self
+ f.commit = types.MethodType(commit_a_file, f)
+ f.discard = types.MethodType(discard_a_file, f)
+ else:
+ f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1)
+ return f
+
+ def _rm(self, path):
+ if self.isdir(path):
+ self.ftp.rmdir(path)
+ else:
+ self.ftp.remove(path)
+
+ def mv(self, old, new):
+ logger.debug("Renaming %s into %s", old, new)
+ self.ftp.posix_rename(old, new)
+
+
+def commit_a_file(self):
+ self.fs.mv(self.temppath, self.targetpath)
+
+
+def discard_a_file(self):
+ self.fs._rm(self.temppath)
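
The restored SFTP implementation is exercised against a live server in the test suite; as a hedged sketch (the host, credentials and paths below are placeholders, not from the patch), typical use looks like this:

```python
import fsspec

# Placeholder connection details -- any reachable SSH/SFTP server will do.
fs = fsspec.filesystem(
    "sftp", host="sftp.example.com", username="user", password="secret"
)

with fs.open("/tmp/hello.txt", "wb") as f:   # autocommit path of _open
    f.write(b"hello")

print(fs.info("/tmp/hello.txt")["size"])     # 5
print(fs.ls("/tmp", detail=False))           # sorted child paths
fs.mv("/tmp/hello.txt", "/tmp/hello2.txt")   # posix_rename under the hood
```
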
diff --git a/fsspec/implementations/smb.py b/fsspec/implementations/smb.py
index a4da1d4..bcd13a6 100644
--- a/fsspec/implementations/smb.py
+++ b/fsspec/implementations/smb.py
@@ -2,13 +2,18 @@
This module contains SMBFileSystem class responsible for handling access to
Windows Samba network shares by using package smbprotocol
"""
+
import datetime
import uuid
from stat import S_ISDIR, S_ISLNK
+
import smbclient
+
from .. import AbstractFileSystem
from ..utils import infer_storage_options
+# ! pylint: disable=bad-continuation
+
class SMBFileSystem(AbstractFileSystem):
"""Allow reading and writing to Windows and Samba network shares.
@@ -49,11 +54,23 @@ class SMBFileSystem(AbstractFileSystem):
there is no way to tell if a path is relative, so all paths are assumed
to be absolute.
"""
- protocol = 'smb'
- def __init__(self, host, port=None, username=None, password=None,
- timeout=60, encrypt=None, share_access=None,
- register_session_retries=5, auto_mkdir=False, **kwargs):
+ protocol = "smb"
+
+ # pylint: disable=too-many-arguments
+ def __init__(
+ self,
+ host,
+ port=None,
+ username=None,
+ password=None,
+ timeout=60,
+ encrypt=None,
+ share_access=None,
+ register_session_retries=5,
+ auto_mkdir=False,
+ **kwargs,
+ ):
"""
You can use _get_kwargs_from_urls to get some kwargs from
a reasonable SMB url.
@@ -98,22 +115,112 @@ class SMBFileSystem(AbstractFileSystem):
self.password = password
self.timeout = timeout
self.encrypt = encrypt
- self.temppath = kwargs.pop('temppath', '')
+ self.temppath = kwargs.pop("temppath", "")
self.share_access = share_access
self.register_session_retries = register_session_retries
self.auto_mkdir = auto_mkdir
self._connect()
+ @property
+ def _port(self):
+ return 445 if self.port is None else self.port
+
+ def _connect(self):
+ import time
+
+ for _ in range(self.register_session_retries):
+ try:
+ smbclient.register_session(
+ self.host,
+ username=self.username,
+ password=self.password,
+ port=self._port,
+ encrypt=self.encrypt,
+ connection_timeout=self.timeout,
+ )
+ break
+ except Exception:
+ time.sleep(0.1)
+
+ @classmethod
+ def _strip_protocol(cls, path):
+ return infer_storage_options(path)["path"]
+
+ @staticmethod
+ def _get_kwargs_from_urls(path):
+ # smb://workgroup;user:password@host:port/share/folder/file.csv
+ out = infer_storage_options(path)
+ out.pop("path", None)
+ out.pop("protocol", None)
+ return out
+
+ def mkdir(self, path, create_parents=True, **kwargs):
+ wpath = _as_unc_path(self.host, path)
+ if create_parents:
+ smbclient.makedirs(wpath, exist_ok=False, port=self._port, **kwargs)
+ else:
+ smbclient.mkdir(wpath, port=self._port, **kwargs)
+
+ def makedirs(self, path, exist_ok=False):
+ if _share_has_path(path):
+ wpath = _as_unc_path(self.host, path)
+ smbclient.makedirs(wpath, exist_ok=exist_ok, port=self._port)
+
+ def rmdir(self, path):
+ if _share_has_path(path):
+ wpath = _as_unc_path(self.host, path)
+ smbclient.rmdir(wpath, port=self._port)
+
+ def info(self, path, **kwargs):
+ wpath = _as_unc_path(self.host, path)
+ stats = smbclient.stat(wpath, port=self._port, **kwargs)
+ if S_ISDIR(stats.st_mode):
+ stype = "directory"
+ elif S_ISLNK(stats.st_mode):
+ stype = "link"
+ else:
+ stype = "file"
+ res = {
+ "name": path + "/" if stype == "directory" else path,
+ "size": stats.st_size,
+ "type": stype,
+ "uid": stats.st_uid,
+ "gid": stats.st_gid,
+ "time": stats.st_atime,
+ "mtime": stats.st_mtime,
+ }
+ return res
+
def created(self, path):
"""Return the created timestamp of a file as a datetime.datetime"""
- pass
+ wpath = _as_unc_path(self.host, path)
+ stats = smbclient.stat(wpath, port=self._port)
+ return datetime.datetime.fromtimestamp(stats.st_ctime, tz=datetime.timezone.utc)
def modified(self, path):
"""Return the modified timestamp of a file as a datetime.datetime"""
- pass
-
- def _open(self, path, mode='rb', block_size=-1, autocommit=True,
- cache_options=None, **kwargs):
+ wpath = _as_unc_path(self.host, path)
+ stats = smbclient.stat(wpath, port=self._port)
+ return datetime.datetime.fromtimestamp(stats.st_mtime, tz=datetime.timezone.utc)
+
+ def ls(self, path, detail=True, **kwargs):
+ unc = _as_unc_path(self.host, path)
+ listed = smbclient.listdir(unc, port=self._port, **kwargs)
+ dirs = ["/".join([path.rstrip("/"), p]) for p in listed]
+ if detail:
+ dirs = [self.info(d) for d in dirs]
+ return dirs
+
+ # pylint: disable=too-many-arguments
+ def _open(
+ self,
+ path,
+ mode="rb",
+ block_size=-1,
+ autocommit=True,
+ cache_options=None,
+ **kwargs,
+ ):
"""
block_size: int or None
If 0, no buffering, 1, line buffering, >1, buffer that many bytes
@@ -123,11 +230,66 @@ class SMBFileSystem(AbstractFileSystem):
By specifying 'share_access' in 'kwargs' it is possible to override the
default shared access setting applied in the constructor of this object.
"""
- pass
+ if self.auto_mkdir and "w" in mode:
+ self.makedirs(self._parent(path), exist_ok=True)
+ bls = block_size if block_size is not None and block_size >= 0 else -1
+ wpath = _as_unc_path(self.host, path)
+ share_access = kwargs.pop("share_access", self.share_access)
+ if "w" in mode and autocommit is False:
+ temp = _as_temp_path(self.host, path, self.temppath)
+ return SMBFileOpener(
+ wpath, temp, mode, port=self._port, block_size=bls, **kwargs
+ )
+ return smbclient.open_file(
+ wpath,
+ mode,
+ buffering=bls,
+ share_access=share_access,
+ port=self._port,
+ **kwargs,
+ )
def copy(self, path1, path2, **kwargs):
"""Copy within two locations in the same filesystem"""
- pass
+ wpath1 = _as_unc_path(self.host, path1)
+ wpath2 = _as_unc_path(self.host, path2)
+ if self.auto_mkdir:
+ self.makedirs(self._parent(path2), exist_ok=True)
+ smbclient.copyfile(wpath1, wpath2, port=self._port, **kwargs)
+
+ def _rm(self, path):
+ if _share_has_path(path):
+ wpath = _as_unc_path(self.host, path)
+ stats = smbclient.stat(wpath, port=self._port)
+ if S_ISDIR(stats.st_mode):
+ smbclient.rmdir(wpath, port=self._port)
+ else:
+ smbclient.remove(wpath, port=self._port)
+
+ def mv(self, path1, path2, recursive=None, maxdepth=None, **kwargs):
+ wpath1 = _as_unc_path(self.host, path1)
+ wpath2 = _as_unc_path(self.host, path2)
+ smbclient.rename(wpath1, wpath2, port=self._port, **kwargs)
+
+
+def _as_unc_path(host, path):
+ rpath = path.replace("/", "\\")
+ unc = f"\\\\{host}{rpath}"
+ return unc
+
+
+def _as_temp_path(host, path, temppath):
+ share = path.split("/")[1]
+ temp_file = f"/{share}{temppath}/{uuid.uuid4()}"
+ unc = _as_unc_path(host, temp_file)
+ return unc
+
+
+def _share_has_path(path):
+ parts = path.count("/")
+ if path.endswith("/"):
+ return parts > 2
+ return parts > 1
class SMBFileOpener:
@@ -144,13 +306,24 @@ class SMBFileOpener:
self.port = port
self._open()
+ def _open(self):
+ if self.smbfile is None or self.smbfile.closed:
+ self.smbfile = smbclient.open_file(
+ self.temp,
+ self.mode,
+ port=self.port,
+ buffering=self.block_size,
+ **self.kwargs,
+ )
+
def commit(self):
"""Move temp file to definitive on success."""
- pass
+ # TODO: use transaction support in SMB protocol
+ smbclient.replace(self.temp, self.path, port=self.port)
def discard(self):
"""Remove the temp file on failure."""
- pass
+ smbclient.remove(self.temp, port=self.port)
def __fspath__(self):
return self.path
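
Similarly for the SMB implementation, a minimal sketch assuming a reachable Samba share (`samba.local` and `/myshare/...` are placeholders) and the `smbprotocol` package:

```python
import fsspec

fs = fsspec.filesystem(
    "smb",
    host="samba.local",
    username="user",
    password="secret",
    auto_mkdir=True,          # let _open/copy create missing parent directories
)

with fs.open("/myshare/reports/out.txt", "wb") as f:
    f.write(b"hello")

print(fs.info("/myshare/reports/out.txt")["type"])   # "file"
print(fs.modified("/myshare/reports/out.txt"))       # timezone-aware datetime
```
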
diff --git a/fsspec/implementations/tar.py b/fsspec/implementations/tar.py
index acb812a..412e5ba 100644
--- a/fsspec/implementations/tar.py
+++ b/fsspec/implementations/tar.py
@@ -1,11 +1,14 @@
import logging
import tarfile
+
import fsspec
from fsspec.archive import AbstractArchiveFileSystem
from fsspec.compression import compr
from fsspec.utils import infer_compression
-typemap = {b'0': 'file', b'5': 'directory'}
-logger = logging.getLogger('tar')
+
+typemap = {b"0": "file", b"5": "directory"}
+
+logger = logging.getLogger("tar")
class TarFileSystem(AbstractArchiveFileSystem):
@@ -14,44 +17,108 @@ class TarFileSystem(AbstractArchiveFileSystem):
Supports the following formats:
tar.gz, tar.bz2, tar.xz
"""
- root_marker = ''
- protocol = 'tar'
+
+ root_marker = ""
+ protocol = "tar"
cachable = False
- def __init__(self, fo='', index_store=None, target_options=None,
- target_protocol=None, compression=None, **kwargs):
+ def __init__(
+ self,
+ fo="",
+ index_store=None,
+ target_options=None,
+ target_protocol=None,
+ compression=None,
+ **kwargs,
+ ):
super().__init__(**kwargs)
target_options = target_options or {}
+
if isinstance(fo, str):
- self.of = fsspec.open(fo, protocol=target_protocol, **
- target_options)
- fo = self.of.open()
+ self.of = fsspec.open(fo, protocol=target_protocol, **target_options)
+ fo = self.of.open() # keep the reference
+
+ # Try to infer compression.
if compression is None:
name = None
+
+ # Try different ways to get hold of the filename. `fo` might either
+ # be a `fsspec.LocalFileOpener`, an `io.BufferedReader` or an
+ # `fsspec.AbstractFileSystem` instance.
try:
- if hasattr(fo, 'original'):
+ # Amended io.BufferedReader or similar.
+ # This uses a "protocol extension" where original filenames are
+ # propagated to archive-like filesystems in order to let them
+ # infer the right compression appropriately.
+ if hasattr(fo, "original"):
name = fo.original
- elif hasattr(fo, 'path'):
+
+ # fsspec.LocalFileOpener
+ elif hasattr(fo, "path"):
name = fo.path
- elif hasattr(fo, 'name'):
+
+ # io.BufferedReader
+ elif hasattr(fo, "name"):
name = fo.name
- elif hasattr(fo, 'info'):
- name = fo.info()['name']
+
+ # fsspec.AbstractFileSystem
+ elif hasattr(fo, "info"):
+ name = fo.info()["name"]
+
except Exception as ex:
logger.warning(
- f'Unable to determine file name, not inferring compression: {ex}'
- )
+ f"Unable to determine file name, not inferring compression: {ex}"
+ )
+
if name is not None:
compression = infer_compression(name)
- logger.info(
- f'Inferred compression {compression} from file name {name}'
- )
+ logger.info(f"Inferred compression {compression} from file name {name}")
+
if compression is not None:
+ # TODO: tarfile already implements compression with modes like "'r:gz'",
+ # but then would seek to offset in the file work?
fo = compr[compression](fo)
+
self._fo_ref = fo
- self.fo = fo
+ self.fo = fo # the whole instance is a context
self.tar = tarfile.TarFile(fileobj=self.fo)
self.dir_cache = None
+
self.index_store = index_store
self.index = None
self._index()
+
+ def _index(self):
+ # TODO: load and set saved index, if exists
+ out = {}
+ for ti in self.tar:
+ info = ti.get_info()
+ info["type"] = typemap.get(info["type"], "file")
+ name = ti.get_info()["name"].rstrip("/")
+ out[name] = (info, ti.offset_data)
+
+ self.index = out
+ # TODO: save index to self.index_store here, if set
+
+ def _get_dirs(self):
+ if self.dir_cache is not None:
+ return
+
+ # This enables ls to get directories as children as well as files
+ self.dir_cache = {
+ dirname: {"name": dirname, "size": 0, "type": "directory"}
+ for dirname in self._all_dirnames(self.tar.getnames())
+ }
+ for member in self.tar.getmembers():
+ info = member.get_info()
+ info["name"] = info["name"].rstrip("/")
+ info["type"] = typemap.get(info["type"], "file")
+ self.dir_cache[info["name"]] = info
+
+ def _open(self, path, mode="rb", **kwargs):
+ if mode != "rb":
+ raise ValueError("Read-only filesystem implementation")
+ details, offset = self.index[path]
+ if details["type"] != "file":
+ raise ValueError("Can only handle regular files")
+ return self.tar.extractfile(path)
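
The tar filesystem above can be exercised locally without any external service; this sketch (file names invented for the demo) also shows the compression inference from the `.tar.gz` suffix:

```python
import os
import tarfile
import tempfile
from io import BytesIO

import fsspec

# Build a throwaway .tar.gz with a single nested member.
fn = tempfile.mkstemp(suffix=".tar.gz")[1]
with tarfile.open(fn, mode="w:gz") as t:
    payload = b"hello tar"
    info = tarfile.TarInfo(name="dir/inner.txt")
    info.size = len(payload)
    t.addfile(info, BytesIO(payload))

fs = fsspec.filesystem("tar", fo=fn)       # compression inferred from the file name
assert fs.ls("", detail=False) == ["dir"]
assert fs.cat("dir/inner.txt") == b"hello tar"
os.remove(fn)
```
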
diff --git a/fsspec/implementations/tests/local/local_fixtures.py b/fsspec/implementations/tests/local/local_fixtures.py
index a549f6d..bafff60 100644
--- a/fsspec/implementations/tests/local/local_fixtures.py
+++ b/fsspec/implementations/tests/local/local_fixtures.py
@@ -1,7 +1,18 @@
import pytest
+
from fsspec.implementations.local import LocalFileSystem, make_path_posix
from fsspec.tests.abstract import AbstractFixtures
class LocalFixtures(AbstractFixtures):
- pass
+ @pytest.fixture(scope="class")
+ def fs(self):
+ return LocalFileSystem(auto_mkdir=True)
+
+ @pytest.fixture
+ def fs_path(self, tmpdir):
+ return str(tmpdir)
+
+ @pytest.fixture
+ def fs_sanitize_path(self):
+ return make_path_posix
diff --git a/fsspec/implementations/tests/memory/memory_fixtures.py b/fsspec/implementations/tests/memory/memory_fixtures.py
index 26f59cd..27d0252 100644
--- a/fsspec/implementations/tests/memory/memory_fixtures.py
+++ b/fsspec/implementations/tests/memory/memory_fixtures.py
@@ -1,7 +1,27 @@
import pytest
+
from fsspec import filesystem
from fsspec.tests.abstract import AbstractFixtures
class MemoryFixtures(AbstractFixtures):
- pass
+ @pytest.fixture(scope="class")
+ def fs(self):
+ m = filesystem("memory")
+ m.store.clear()
+ m.pseudo_dirs.clear()
+ m.pseudo_dirs.append("")
+ try:
+ yield m
+ finally:
+ m.store.clear()
+ m.pseudo_dirs.clear()
+ m.pseudo_dirs.append("")
+
+ @pytest.fixture
+ def fs_join(self):
+ return lambda *args: "/".join(args)
+
+ @pytest.fixture
+ def fs_path(self):
+ return ""
diff --git a/fsspec/implementations/tests/test_archive.py b/fsspec/implementations/tests/test_archive.py
index 0c8d230..457714b 100644
--- a/fsspec/implementations/tests/test_archive.py
+++ b/fsspec/implementations/tests/test_archive.py
@@ -8,9 +8,13 @@ import tempfile
import zipfile
from contextlib import contextmanager
from io import BytesIO
+
import pytest
+
import fsspec
-archive_data = {'a': b'', 'b': b'hello', 'deeply/nested/path': b'stuff'}
+
+# The blueprint to create synthesized archive files from.
+archive_data = {"a": b"", "b": b"hello", "deeply/nested/path": b"stuff"}
@contextmanager
@@ -18,7 +22,18 @@ def tempzip(data=None):
"""
Provide test cases with temporary synthesized Zip archives.
"""
- pass
+ data = data or {}
+ f = tempfile.mkstemp(suffix=".zip")[1]
+ with zipfile.ZipFile(f, mode="w") as z:
+ for k, v in data.items():
+ z.writestr(k, v)
+ try:
+ yield f
+ finally:
+ try:
+ os.remove(f)
+ except OSError:
+ pass
@contextmanager
@@ -26,39 +41,119 @@ def temparchive(data=None):
"""
Provide test cases with temporary synthesized 7-Zip archives.
"""
- pass
+ data = data or {}
+ libarchive = pytest.importorskip("libarchive")
+ f = tempfile.mkstemp(suffix=".7z")[1]
+ with libarchive.file_writer(f, "7zip") as archive:
+ for k, v in data.items():
+ archive.add_file_from_memory(entry_path=k, entry_size=len(v), entry_data=v)
+ try:
+ yield f
+ finally:
+ try:
+ os.remove(f)
+ except OSError:
+ pass
@contextmanager
-def temptar(data=None, mode='w', suffix='.tar'):
+def temptar(data=None, mode="w", suffix=".tar"):
"""
Provide test cases with temporary synthesized .tar archives.
"""
- pass
+ data = data or {}
+ fn = tempfile.mkstemp(suffix=suffix)[1]
+ with tarfile.TarFile.open(fn, mode=mode) as t:
+ touched = {}
+ for name, value in data.items():
+ # Create directory hierarchy.
+ # https://bugs.python.org/issue22208#msg225558
+ if "/" in name and name not in touched:
+ parts = os.path.dirname(name).split("/")
+ for index in range(1, len(parts) + 1):
+ info = tarfile.TarInfo("/".join(parts[:index]))
+ info.type = tarfile.DIRTYPE
+ t.addfile(info)
+ touched[name] = True
+
+ # Add file content.
+ info = tarfile.TarInfo(name=name)
+ info.size = len(value)
+ t.addfile(info, BytesIO(value))
+
+ try:
+ yield fn
+ finally:
+ try:
+ os.remove(fn)
+ except OSError:
+ pass
@contextmanager
-def temptargz(data=None, mode='w', suffix='.tar.gz'):
+def temptargz(data=None, mode="w", suffix=".tar.gz"):
"""
Provide test cases with temporary synthesized .tar.gz archives.
"""
- pass
+
+ with temptar(data=data, mode=mode) as tarname:
+ fn = tempfile.mkstemp(suffix=suffix)[1]
+ with open(tarname, "rb") as tar:
+ cf = gzip.GzipFile(filename=fn, mode=mode)
+ cf.write(tar.read())
+ cf.close()
+
+ try:
+ yield fn
+ finally:
+ try:
+ os.remove(fn)
+ except OSError:
+ pass
@contextmanager
-def temptarbz2(data=None, mode='w', suffix='.tar.bz2'):
+def temptarbz2(data=None, mode="w", suffix=".tar.bz2"):
"""
Provide test cases with temporary synthesized .tar.bz2 archives.
"""
- pass
+
+ with temptar(data=data, mode=mode) as tarname:
+ fn = tempfile.mkstemp(suffix=suffix)[1]
+ with open(tarname, "rb") as tar:
+ cf = bz2.BZ2File(filename=fn, mode=mode)
+ cf.write(tar.read())
+ cf.close()
+
+ try:
+ yield fn
+ finally:
+ try:
+ os.remove(fn)
+ except OSError:
+ pass
@contextmanager
-def temptarxz(data=None, mode='w', suffix='.tar.xz'):
+def temptarxz(data=None, mode="w", suffix=".tar.xz"):
"""
Provide test cases with temporary synthesized .tar.xz archives.
"""
- pass
+
+ with temptar(data=data, mode=mode) as tarname:
+ fn = tempfile.mkstemp(suffix=suffix)[1]
+ with open(tarname, "rb") as tar:
+ cf = lzma.open(filename=fn, mode=mode, format=lzma.FORMAT_XZ)
+ cf.write(tar.read())
+ cf.close()
+
+ try:
+ yield fn
+ finally:
+ try:
+ os.remove(fn)
+ except OSError:
+ pass
class ArchiveTestScenario:
@@ -67,8 +162,11 @@ class ArchiveTestScenario:
"""
def __init__(self, protocol=None, provider=None, variant=None):
+ # The filesystem protocol identifier. Any of "zip", "tar" or "libarchive".
self.protocol = protocol
+ # A contextmanager function to provide temporary synthesized archives.
self.provider = provider
+ # The filesystem protocol variant identifier. Any of "gz", "bz2" or "xz".
self.variant = variant
@@ -86,19 +184,28 @@ def pytest_generate_tests(metafunc):
https://docs.pytest.org/en/latest/example/parametrize.html#a-quick-port-of-testscenarios
"""
- pass
+ idlist = []
+ argnames = ["scenario"]
+ argvalues = []
+ for scenario in metafunc.cls.scenarios:
+ scenario: ArchiveTestScenario = scenario
+ label = scenario.protocol
+ if scenario.variant:
+ label += "-" + scenario.variant
+ idlist.append(label)
+ argvalues.append([scenario])
+ metafunc.parametrize(argnames, argvalues, ids=idlist, scope="class")
-scenario_zip = ArchiveTestScenario(protocol='zip', provider=tempzip)
-scenario_tar = ArchiveTestScenario(protocol='tar', provider=temptar)
-scenario_targz = ArchiveTestScenario(protocol='tar', provider=temptargz,
- variant='gz')
-scenario_tarbz2 = ArchiveTestScenario(protocol='tar', provider=temptarbz2,
- variant='bz2')
-scenario_tarxz = ArchiveTestScenario(protocol='tar', provider=temptarxz,
- variant='xz')
-scenario_libarchive = ArchiveTestScenario(protocol='libarchive', provider=
- temparchive)
+# Define test scenarios.
+scenario_zip = ArchiveTestScenario(protocol="zip", provider=tempzip)
+scenario_tar = ArchiveTestScenario(protocol="tar", provider=temptar)
+scenario_targz = ArchiveTestScenario(protocol="tar", provider=temptargz, variant="gz")
+scenario_tarbz2 = ArchiveTestScenario(
+ protocol="tar", provider=temptarbz2, variant="bz2"
+)
+scenario_tarxz = ArchiveTestScenario(protocol="tar", provider=temptarxz, variant="xz")
+scenario_libarchive = ArchiveTestScenario(protocol="libarchive", provider=temparchive)
class TestAnyArchive:
@@ -106,5 +213,170 @@ class TestAnyArchive:
Validate that all filesystem adapter implementations for archive files
will adhere to the same specification.
"""
- scenarios = [scenario_zip, scenario_tar, scenario_targz,
- scenario_tarbz2, scenario_tarxz, scenario_libarchive]
+
+ scenarios = [
+ scenario_zip,
+ scenario_tar,
+ scenario_targz,
+ scenario_tarbz2,
+ scenario_tarxz,
+ scenario_libarchive,
+ ]
+
+ def test_repr(self, scenario: ArchiveTestScenario):
+ with scenario.provider() as archive:
+ fs = fsspec.filesystem(scenario.protocol, fo=archive)
+ assert repr(fs).startswith("<Archive-like object")
+
+ def test_empty(self, scenario: ArchiveTestScenario):
+ with scenario.provider() as archive:
+ fs = fsspec.filesystem(scenario.protocol, fo=archive)
+ assert fs.find("") == []
+ assert fs.find("", withdirs=True) == []
+ with pytest.raises(FileNotFoundError):
+ fs.info("")
+ assert fs.ls("") == []
+
+ def test_glob(self, scenario: ArchiveTestScenario):
+ with scenario.provider(archive_data) as archive:
+ fs = fsspec.filesystem(scenario.protocol, fo=archive)
+ assert fs.glob("*/*/*th") == ["deeply/nested/path"]
+
+ def test_mapping(self, scenario: ArchiveTestScenario):
+ with scenario.provider(archive_data) as archive:
+ fs = fsspec.filesystem(scenario.protocol, fo=archive)
+ m = fs.get_mapper()
+ assert list(m) == ["a", "b", "deeply/nested/path"]
+ assert m["b"] == archive_data["b"]
+
+ def test_pickle(self, scenario: ArchiveTestScenario):
+ with scenario.provider(archive_data) as archive:
+ fs = fsspec.filesystem(scenario.protocol, fo=archive)
+ fs2 = pickle.loads(pickle.dumps(fs))
+ assert fs2.cat("b") == b"hello"
+
+ def test_all_dirnames(self, scenario: ArchiveTestScenario):
+ with scenario.provider(archive_data) as archive:
+ fs = fsspec.filesystem(scenario.protocol, fo=archive)
+
+            # fx are files, dx are directories
+ assert fs._all_dirnames([]) == set()
+ assert fs._all_dirnames(["f1"]) == set()
+ assert fs._all_dirnames(["f1", "f2"]) == set()
+ assert fs._all_dirnames(["f1", "f2", "d1/f1"]) == {"d1"}
+ assert fs._all_dirnames(["f1", "d1/f1", "d1/f2"]) == {"d1"}
+ assert fs._all_dirnames(["f1", "d1/f1", "d2/f1"]) == {"d1", "d2"}
+ assert fs._all_dirnames(["d1/d1/d1/f1"]) == {"d1", "d1/d1", "d1/d1/d1"}
+
+ def test_ls(self, scenario: ArchiveTestScenario):
+ with scenario.provider(archive_data) as archive:
+ fs = fsspec.filesystem(scenario.protocol, fo=archive)
+
+ assert fs.ls("", detail=False) == ["a", "b", "deeply"]
+ assert fs.ls("/") == fs.ls("")
+
+ assert fs.ls("deeply", detail=False) == ["deeply/nested"]
+ assert fs.ls("deeply/") == fs.ls("deeply")
+
+ assert fs.ls("deeply/nested", detail=False) == ["deeply/nested/path"]
+ assert fs.ls("deeply/nested/") == fs.ls("deeply/nested")
+
+ def test_find(self, scenario: ArchiveTestScenario):
+ with scenario.provider(archive_data) as archive:
+ fs = fsspec.filesystem(scenario.protocol, fo=archive)
+
+ assert fs.find("") == ["a", "b", "deeply/nested/path"]
+ assert fs.find("", withdirs=True) == [
+ "a",
+ "b",
+ "deeply",
+ "deeply/nested",
+ "deeply/nested/path",
+ ]
+
+ assert fs.find("deeply") == ["deeply/nested/path"]
+ assert fs.find("deeply/") == fs.find("deeply")
+
+ @pytest.mark.parametrize("topdown", [True, False])
+ @pytest.mark.parametrize("prune_nested", [True, False])
+ def test_walk(self, scenario: ArchiveTestScenario, topdown, prune_nested):
+ with scenario.provider(archive_data) as archive:
+ fs = fsspec.filesystem(scenario.protocol, fo=archive)
+ expected = [
+ # (dirname, list of subdirs, list of files)
+ ("", ["deeply"], ["a", "b"]),
+ ("deeply", ["nested"], []),
+ ]
+ if not topdown or not prune_nested:
+ expected.append(("deeply/nested", [], ["path"]))
+ if not topdown:
+ expected.reverse()
+
+ result = []
+ for path, dirs, files in fs.walk("", topdown=topdown):
+ result.append((path, dirs.copy(), files))
+ # Bypass the "nested" dir
+ if prune_nested and "nested" in dirs:
+ dirs.remove("nested")
+
+            # prior to py3.10, zip() does not support strict=True, so we need
+            # a manual len check here
+ assert len(result) == len(expected)
+ for lhs, rhs in zip(result, expected):
+ assert lhs[0] == rhs[0]
+ assert sorted(lhs[1]) == sorted(rhs[1])
+ assert sorted(lhs[2]) == sorted(rhs[2])
+
+ def test_info(self, scenario: ArchiveTestScenario):
+ # https://github.com/Suor/funcy/blob/1.15/funcy/colls.py#L243-L245
+ def project(mapping, keys):
+ """Leaves only given keys in mapping."""
+ return {k: mapping[k] for k in keys if k in mapping}
+
+ with scenario.provider(archive_data) as archive:
+ fs = fsspec.filesystem(scenario.protocol, fo=archive)
+
+ with pytest.raises(FileNotFoundError):
+ fs.info("i-do-not-exist")
+
+ # Iterate over all directories.
+ for d in fs._all_dirnames(archive_data.keys()):
+ lhs = project(fs.info(d), ["name", "size", "type"])
+ expected = {"name": f"{d}", "size": 0, "type": "directory"}
+ assert lhs == expected
+
+ # Iterate over all files.
+ for f, v in archive_data.items():
+ lhs = fs.info(f)
+ assert lhs["name"] == f
+ assert lhs["size"] == len(v)
+ assert lhs["type"] == "file"
+
+ @pytest.mark.parametrize("scale", [128, 512, 4096])
+ def test_isdir_isfile(self, scenario: ArchiveTestScenario, scale: int):
+ def make_nested_dir(i):
+ x = f"{i}"
+ table = x.maketrans("0123456789", "ABCDEFGHIJ")
+ return "/".join(x.translate(table))
+
+ scaled_data = {f"{make_nested_dir(i)}/{i}": b"" for i in range(1, scale + 1)}
+ with scenario.provider(scaled_data) as archive:
+ fs = fsspec.filesystem(scenario.protocol, fo=archive)
+
+ lhs_dirs, lhs_files = (
+ fs._all_dirnames(scaled_data.keys()),
+ scaled_data.keys(),
+ )
+
+            # Warm up the cache; this is done in both cases anyway.
+ fs._get_dirs()
+
+ entries = lhs_files | lhs_dirs
+
+ assert lhs_dirs == {e for e in entries if fs.isdir(e)}
+ assert lhs_files == {e for e in entries if fs.isfile(e)}
+
+ def test_read_empty_file(self, scenario: ArchiveTestScenario):
+ with scenario.provider(archive_data) as archive:
+ fs = fsspec.filesystem(scenario.protocol, fo=archive)
+ assert fs.open("a").read() == b""
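
Outside of pytest, the behaviour these scenarios assert can be reproduced directly; the sketch below rebuilds the `archive_data` blueprint as a zip file (the paths are the same synthetic ones used by the tests):

```python
import tempfile
import zipfile

import fsspec

data = {"a": b"", "b": b"hello", "deeply/nested/path": b"stuff"}
path = tempfile.mkstemp(suffix=".zip")[1]
with zipfile.ZipFile(path, mode="w") as z:
    for name, content in data.items():
        z.writestr(name, content)

fs = fsspec.filesystem("zip", fo=path)
assert fs.find("") == ["a", "b", "deeply/nested/path"]
assert fs.ls("", detail=False) == ["a", "b", "deeply"]
assert fs.cat("b") == b"hello"
```
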
diff --git a/fsspec/implementations/tests/test_arrow.py b/fsspec/implementations/tests/test_arrow.py
index 77ace24..af706c5 100644
--- a/fsspec/implementations/tests/test_arrow.py
+++ b/fsspec/implementations/tests/test_arrow.py
@@ -1,5 +1,259 @@
import secrets
+
import pytest
-pyarrow_fs = pytest.importorskip('pyarrow.fs')
+
+pyarrow_fs = pytest.importorskip("pyarrow.fs")
FileSystem = pyarrow_fs.FileSystem
-from fsspec.implementations.arrow import ArrowFSWrapper, HadoopFileSystem
+
+from fsspec.implementations.arrow import ArrowFSWrapper, HadoopFileSystem # noqa
+
+
+@pytest.fixture(scope="function")
+def fs():
+ fs, _ = FileSystem.from_uri("mock://")
+ return ArrowFSWrapper(fs)
+
+
+@pytest.fixture(scope="function", params=[False, True])
+def remote_dir(fs, request):
+ directory = secrets.token_hex(16)
+ fs.makedirs(directory)
+ yield ("hdfs://" if request.param else "/") + directory
+ fs.rm(directory, recursive=True)
+
+
+def test_protocol():
+ fs, _ = FileSystem.from_uri("mock://")
+ fss = ArrowFSWrapper(fs)
+ assert fss.protocol == "mock"
+
+
+def strip_keys(original_entry):
+ entry = original_entry.copy()
+ entry.pop("mtime")
+ return entry
+
+
+def test_strip(fs):
+ assert fs._strip_protocol("/a/file") == "/a/file"
+ assert fs._strip_protocol("hdfs:///a/file") == "/a/file"
+ assert fs._strip_protocol("hdfs://1.1.1.1/a/file") == "/a/file"
+ assert fs._strip_protocol("hdfs://1.1.1.1:8888/a/file") == "/a/file"
+
+
+def test_info(fs, remote_dir):
+ fs.touch(remote_dir + "/a.txt")
+ remote_dir_strip_protocol = fs._strip_protocol(remote_dir)
+ details = fs.info(remote_dir + "/a.txt")
+ assert details["type"] == "file"
+ assert details["name"] == remote_dir_strip_protocol + "/a.txt"
+ assert details["size"] == 0
+
+ fs.mkdir(remote_dir + "/dir")
+ details = fs.info(remote_dir + "/dir")
+ assert details["type"] == "directory"
+ assert details["name"] == remote_dir_strip_protocol + "/dir"
+
+ details = fs.info(remote_dir + "/dir/")
+ assert details["name"] == remote_dir_strip_protocol + "/dir/"
+
+
+def test_move(fs, remote_dir):
+ fs.touch(remote_dir + "/a.txt")
+ initial_info = fs.info(remote_dir + "/a.txt")
+
+ fs.move(remote_dir + "/a.txt", remote_dir + "/b.txt")
+ secondary_info = fs.info(remote_dir + "/b.txt")
+
+ assert not fs.exists(remote_dir + "/a.txt")
+ assert fs.exists(remote_dir + "/b.txt")
+
+ initial_info.pop("name")
+ secondary_info.pop("name")
+ assert initial_info == secondary_info
+
+
+def test_move_recursive(fs, remote_dir):
+ src = remote_dir + "/src"
+ dest = remote_dir + "/dest"
+
+ assert fs.isdir(src) is False
+ fs.mkdir(src)
+ assert fs.isdir(src)
+
+ fs.touch(src + "/a.txt")
+ fs.mkdir(src + "/b")
+ fs.touch(src + "/b/c.txt")
+ fs.move(src, dest, recursive=True)
+
+ assert fs.isdir(src) is False
+ assert not fs.exists(src)
+
+ assert fs.isdir(dest)
+ assert fs.exists(dest)
+ assert fs.cat(dest + "/b/c.txt") == fs.cat(dest + "/a.txt") == b""
+
+
+def test_copy(fs, remote_dir):
+ fs.touch(remote_dir + "/a.txt")
+ initial_info = fs.info(remote_dir + "/a.txt")
+
+ fs.copy(remote_dir + "/a.txt", remote_dir + "/b.txt")
+ secondary_info = fs.info(remote_dir + "/b.txt")
+
+ assert fs.exists(remote_dir + "/a.txt")
+ assert fs.exists(remote_dir + "/b.txt")
+
+ initial_info.pop("name")
+ secondary_info.pop("name")
+ assert strip_keys(initial_info) == strip_keys(secondary_info)
+
+
+def test_rm(fs, remote_dir):
+ fs.touch(remote_dir + "/a.txt")
+ fs.rm(remote_dir + "/a.txt", recursive=True)
+ assert not fs.exists(remote_dir + "/a.txt")
+
+ fs.mkdir(remote_dir + "/dir")
+ fs.rm(remote_dir + "/dir", recursive=True)
+ assert not fs.exists(remote_dir + "/dir")
+
+ fs.mkdir(remote_dir + "/dir")
+ fs.touch(remote_dir + "/dir/a")
+ fs.touch(remote_dir + "/dir/b")
+ fs.mkdir(remote_dir + "/dir/c/")
+ fs.touch(remote_dir + "/dir/c/a")
+ fs.rm(remote_dir + "/dir", recursive=True)
+ assert not fs.exists(remote_dir + "/dir")
+
+
+def test_ls(fs, remote_dir):
+ if remote_dir != "/":
+ remote_dir = remote_dir + "/"
+ remote_dir_strip_protocol = fs._strip_protocol(remote_dir)
+ fs.mkdir(remote_dir + "dir/")
+ files = set()
+ for no in range(8):
+ file = remote_dir + f"dir/test_{no}"
+ # we also want to make sure `fs.touch` works with protocol
+ fs.touch(file)
+ files.add(remote_dir_strip_protocol + f"dir/test_{no}")
+
+ assert set(fs.ls(remote_dir + "dir/")) == files
+
+ dirs = fs.ls(remote_dir + "dir/", detail=True)
+ expected = [fs.info(file) for file in files]
+
+ by_name = lambda details: details["name"]
+ dirs.sort(key=by_name)
+ expected.sort(key=by_name)
+
+ assert dirs == expected
+
+
+def test_mkdir(fs, remote_dir):
+ if remote_dir != "/":
+ remote_dir = remote_dir + "/"
+ fs.mkdir(remote_dir + "dir/")
+ assert fs.isdir(remote_dir + "dir/")
+ assert len(fs.ls(remote_dir + "dir/")) == 0
+
+ fs.mkdir(remote_dir + "dir/sub", create_parents=False)
+ assert fs.isdir(remote_dir + "dir/sub")
+
+
+def test_makedirs(fs, remote_dir):
+ fs.makedirs(remote_dir + "dir/a/b/c/")
+ assert fs.isdir(remote_dir + "dir/a/b/c/")
+ assert fs.isdir(remote_dir + "dir/a/b/")
+ assert fs.isdir(remote_dir + "dir/a/")
+
+ fs.makedirs(remote_dir + "dir/a/b/c/", exist_ok=True)
+
+
+def test_exceptions(fs, remote_dir):
+ with pytest.raises(FileNotFoundError):
+ with fs.open(remote_dir + "/a.txt"):
+ ...
+
+ with pytest.raises(FileNotFoundError):
+ fs.copy(remote_dir + "/u.txt", remote_dir + "/y.txt")
+
+
+def test_open_rw(fs, remote_dir):
+ data = b"dvc.org"
+
+ with fs.open(remote_dir + "/a.txt", "wb") as stream:
+ stream.write(data)
+
+ with fs.open(remote_dir + "/a.txt") as stream:
+ assert stream.read() == data
+
+
+def test_open_rw_flush(fs, remote_dir):
+ data = b"dvc.org"
+
+ with fs.open(remote_dir + "/b.txt", "wb") as stream:
+ for _ in range(200):
+ stream.write(data)
+ stream.write(data)
+ stream.flush()
+
+ with fs.open(remote_dir + "/b.txt", "rb") as stream:
+ assert stream.read() == data * 400
+
+
+def test_open_append(fs, remote_dir):
+ data = b"dvc.org"
+
+ with fs.open(remote_dir + "/a.txt", "wb") as stream:
+ stream.write(data)
+
+ with fs.open(remote_dir + "/a.txt", "ab") as stream:
+ stream.write(data)
+
+ with fs.open(remote_dir + "/a.txt") as stream:
+ assert stream.read() == 2 * data
+
+
+def test_open_seekable(fs, remote_dir):
+ data = b"dvc.org"
+
+ with fs.open(remote_dir + "/a.txt", "wb") as stream:
+ stream.write(data)
+
+ with fs.open(remote_dir + "/a.txt", "rb", seekable=True) as file:
+ file.seek(2)
+ assert file.read() == data[2:]
+
+
+def test_seekable(fs, remote_dir):
+ data = b"dvc.org"
+
+ with fs.open(remote_dir + "/a.txt", "wb") as stream:
+ stream.write(data)
+
+ for seekable in [True, False]:
+ with fs.open(remote_dir + "/a.txt", "rb", seekable=seekable) as file:
+ assert file.seekable() == seekable
+ assert file.read() == data
+
+ with fs.open(remote_dir + "/a.txt", "rb", seekable=False) as file:
+ with pytest.raises(OSError):
+ file.seek(5)
+
+
+def test_get_kwargs_from_urls_hadoop_fs():
+ kwargs = HadoopFileSystem._get_kwargs_from_urls(
+ "hdfs://user@localhost:8020/?replication=2"
+ )
+ assert kwargs["user"] == "user"
+ assert kwargs["host"] == "localhost"
+ assert kwargs["port"] == 8020
+ assert kwargs["replication"] == 2
+
+ kwargs = HadoopFileSystem._get_kwargs_from_urls("hdfs://user@localhost:8020/")
+ assert kwargs["user"] == "user"
+ assert kwargs["host"] == "localhost"
+ assert kwargs["port"] == 8020
+ assert "replication" not in kwargs
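
The same wrapper can also be pointed at pyarrow's local filesystem for a quick self-contained check (the fixtures above use the `mock://` filesystem instead); a hedged sketch:

```python
import tempfile

import pyarrow.fs
from fsspec.implementations.arrow import ArrowFSWrapper

fs = ArrowFSWrapper(pyarrow.fs.LocalFileSystem())
d = tempfile.mkdtemp()

with fs.open(d + "/a.txt", "wb") as stream:
    stream.write(b"dvc.org")

assert fs.cat(d + "/a.txt") == b"dvc.org"
assert fs.info(d + "/a.txt")["type"] == "file"
```
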
diff --git a/fsspec/implementations/tests/test_cached.py b/fsspec/implementations/tests/test_cached.py
index 3baa269..aa1ac22 100644
--- a/fsspec/implementations/tests/test_cached.py
+++ b/fsspec/implementations/tests/test_cached.py
@@ -3,18 +3,1130 @@ import os
import pickle
import shutil
import tempfile
+
import pytest
+
import fsspec
from fsspec.compression import compr
from fsspec.exceptions import BlocksizeMismatchError
-from fsspec.implementations.cache_mapper import BasenameCacheMapper, HashCacheMapper, create_cache_mapper
-from fsspec.implementations.cached import CachingFileSystem, LocalTempFile, WholeFileCacheFileSystem
+from fsspec.implementations.cache_mapper import (
+ BasenameCacheMapper,
+ HashCacheMapper,
+ create_cache_mapper,
+)
+from fsspec.implementations.cached import (
+ CachingFileSystem,
+ LocalTempFile,
+ WholeFileCacheFileSystem,
+)
from fsspec.implementations.local import make_path_posix
from fsspec.implementations.zip import ZipFileSystem
from fsspec.tests.conftest import win
+
from .test_ftp import FTPFileSystem
+@pytest.fixture
+def local_filecache():
+ import tempfile
+
+ original_location = tempfile.mkdtemp()
+ cache_location = tempfile.mkdtemp()
+ original_file = os.path.join(original_location, "afile")
+ data = b"test data"
+ with open(original_file, "wb") as f:
+ f.write(data)
+
+ # we can access the file and read it
+ fs = fsspec.filesystem(
+ "filecache", target_protocol="file", cache_storage=cache_location
+ )
+
+ return data, original_file, cache_location, fs
+
+
+def test_mapper():
+ mapper0 = create_cache_mapper(True)
+ assert mapper0("somefile") == "somefile"
+ assert mapper0("/somefile") == "somefile"
+ assert mapper0("/somedir/somefile") == "somefile"
+ assert mapper0("/otherdir/somefile") == "somefile"
+
+ mapper1 = create_cache_mapper(False)
+ assert (
+ mapper1("somefile")
+ == "dd00b9487898b02555b6a2d90a070586d63f93e80c70aaa60c992fa9e81a72fe"
+ )
+ assert (
+ mapper1("/somefile")
+ == "884c07bc2efe65c60fb9d280a620e7f180488718fb5d97736521b7f9cf5c8b37"
+ )
+ assert (
+ mapper1("/somedir/somefile")
+ == "67a6956e5a5f95231263f03758c1fd9254fdb1c564d311674cec56b0372d2056"
+ )
+ assert (
+ mapper1("/otherdir/somefile")
+ == "f043dee01ab9b752c7f2ecaeb1a5e1b2d872018e2d0a1a26c43835ebf34e7d3e"
+ )
+
+ assert mapper0 != mapper1
+ assert create_cache_mapper(True) == mapper0
+ assert create_cache_mapper(False) == mapper1
+
+ assert hash(mapper0) != hash(mapper1)
+ assert hash(create_cache_mapper(True)) == hash(mapper0)
+ assert hash(create_cache_mapper(False)) == hash(mapper1)
+
+ with pytest.raises(
+ ValueError,
+ match="BasenameCacheMapper requires zero or positive directory_levels",
+ ):
+ BasenameCacheMapper(-1)
+
+ mapper2 = BasenameCacheMapper(1)
+ assert mapper2("/somefile") == "somefile"
+ assert mapper2("/somedir/somefile") == "somedir_@_somefile"
+ assert mapper2("/otherdir/somefile") == "otherdir_@_somefile"
+ assert mapper2("/dir1/dir2/dir3/somefile") == "dir3_@_somefile"
+
+ assert mapper2 != mapper0
+ assert mapper2 != mapper1
+ assert BasenameCacheMapper(1) == mapper2
+
+ assert hash(mapper2) != hash(mapper0)
+ assert hash(mapper2) != hash(mapper1)
+ assert hash(BasenameCacheMapper(1)) == hash(mapper2)
+
+ mapper3 = BasenameCacheMapper(2)
+ assert mapper3("/somefile") == "somefile"
+ assert mapper3("/somedir/somefile") == "somedir_@_somefile"
+ assert mapper3("/otherdir/somefile") == "otherdir_@_somefile"
+ assert mapper3("/dir1/dir2/dir3/somefile") == "dir2_@_dir3_@_somefile"
+
+ assert mapper3 != mapper0
+ assert mapper3 != mapper1
+ assert mapper3 != mapper2
+ assert BasenameCacheMapper(2) == mapper3
+
+ assert hash(mapper3) != hash(mapper0)
+ assert hash(mapper3) != hash(mapper1)
+ assert hash(mapper3) != hash(mapper2)
+ assert hash(BasenameCacheMapper(2)) == hash(mapper3)
+
+
+@pytest.mark.parametrize(
+ "cache_mapper", [BasenameCacheMapper(), BasenameCacheMapper(1), HashCacheMapper()]
+)
+@pytest.mark.parametrize("force_save_pickle", [True, False])
+def test_metadata(tmpdir, cache_mapper, force_save_pickle):
+ source = os.path.join(tmpdir, "source")
+ afile = os.path.join(source, "afile")
+ os.mkdir(source)
+ open(afile, "w").write("test")
+
+ fs = fsspec.filesystem(
+ "filecache",
+ target_protocol="file",
+ cache_storage=os.path.join(tmpdir, "cache"),
+ cache_mapper=cache_mapper,
+ )
+ fs._metadata._force_save_pickle = force_save_pickle
+
+ with fs.open(afile, "rb") as f:
+ assert f.read(5) == b"test"
+
+ afile_posix = make_path_posix(afile)
+ detail = fs._metadata.cached_files[0][afile_posix]
+ assert sorted(detail.keys()) == ["blocks", "fn", "original", "time", "uid"]
+ assert isinstance(detail["blocks"], bool)
+ assert isinstance(detail["fn"], str)
+ assert isinstance(detail["time"], float)
+ assert isinstance(detail["uid"], str)
+
+ assert detail["original"] == afile_posix
+ assert detail["fn"] == fs._mapper(afile_posix)
+
+ if isinstance(cache_mapper, BasenameCacheMapper):
+ if cache_mapper.directory_levels == 0:
+ assert detail["fn"] == "afile"
+ else:
+ assert detail["fn"] == "source_@_afile"
+
+
+def test_metadata_replace_pickle_with_json(tmpdir):
+ # For backward compatibility will allow reading of old pickled metadata.
+ # When the metadata is next saved, it is in json format.
+ source = os.path.join(tmpdir, "source")
+ afile = os.path.join(source, "afile")
+ os.mkdir(source)
+ open(afile, "w").write("test")
+
+ # Save metadata in pickle format, to simulate old metadata
+ fs = fsspec.filesystem(
+ "filecache",
+ target_protocol="file",
+ cache_storage=os.path.join(tmpdir, "cache"),
+ )
+ fs._metadata._force_save_pickle = True
+ with fs.open(afile, "rb") as f:
+ assert f.read(5) == b"test"
+
+ # Confirm metadata is in pickle format
+ cache_fn = os.path.join(fs.storage[-1], "cache")
+ with open(cache_fn, "rb") as f:
+ metadata = pickle.load(f)
+ assert list(metadata.keys()) == [make_path_posix(afile)]
+
+ # Force rewrite of metadata, now in json format
+ fs._metadata._force_save_pickle = False
+ fs.pop_from_cache(afile)
+ with fs.open(afile, "rb") as f:
+ assert f.read(5) == b"test"
+
+ # Confirm metadata is in json format
+ with open(cache_fn, "r") as f:
+ metadata = json.load(f)
+ assert list(metadata.keys()) == [make_path_posix(afile)]
+
+
+def test_constructor_kwargs(tmpdir):
+ fs = fsspec.filesystem("filecache", target_protocol="file", same_names=True)
+ assert isinstance(fs._mapper, BasenameCacheMapper)
+
+ fs = fsspec.filesystem("filecache", target_protocol="file", same_names=False)
+ assert isinstance(fs._mapper, HashCacheMapper)
+
+ fs = fsspec.filesystem("filecache", target_protocol="file")
+ assert isinstance(fs._mapper, HashCacheMapper)
+
+ with pytest.raises(
+ ValueError, match="Cannot specify both same_names and cache_mapper"
+ ):
+ fs = fsspec.filesystem(
+ "filecache",
+ target_protocol="file",
+ cache_mapper=HashCacheMapper(),
+ same_names=True,
+ )
+
+
+def test_idempotent():
+ fs = CachingFileSystem("file")
+ fs2 = CachingFileSystem("file")
+ assert fs2 is fs
+ fs3 = pickle.loads(pickle.dumps(fs))
+ assert fs3.storage == fs.storage
+
+
+@pytest.mark.parametrize("force_save_pickle", [True, False])
+def test_blockcache_workflow(ftp_writable, tmp_path, force_save_pickle):
+ host, port, user, pw = ftp_writable
+ fs = FTPFileSystem(host, port, user, pw)
+ with fs.open("/out", "wb") as f:
+ f.write(b"test\n" * 4096)
+
+ fs_kwargs = {
+ "skip_instance_cache": True,
+ "cache_storage": str(tmp_path),
+ "target_protocol": "ftp",
+ "target_options": {
+ "host": host,
+ "port": port,
+ "username": user,
+ "password": pw,
+ },
+ }
+
+ # Open the blockcache and read a little bit of the data
+ fs = fsspec.filesystem("blockcache", **fs_kwargs)
+ fs._metadata._force_save_pickle = force_save_pickle
+ with fs.open("/out", "rb", block_size=5) as f:
+ assert f.read(5) == b"test\n"
+
+ # Save the cache/close it
+ fs.save_cache()
+ del fs
+
+ # Check that cache file only has the first two blocks
+ if force_save_pickle:
+ with open(tmp_path / "cache", "rb") as f:
+ cache = pickle.load(f)
+ else:
+ with open(tmp_path / "cache", "r") as f:
+ cache = json.load(f)
+ assert "/out" in cache
+ assert cache["/out"]["blocks"] == [0, 1]
+
+ # Reopen the same cache and read some more...
+ fs = fsspec.filesystem("blockcache", **fs_kwargs)
+ fs._metadata._force_save_pickle = force_save_pickle
+ with fs.open("/out", block_size=5) as f:
+ assert f.read(5) == b"test\n"
+ f.seek(30)
+ assert f.read(5) == b"test\n"
+
+
+@pytest.mark.parametrize("impl", ["filecache", "blockcache"])
+def test_workflow(ftp_writable, impl):
+ host, port, user, pw = ftp_writable
+ fs = FTPFileSystem(host, port, user, pw)
+ with fs.open("/out", "wb") as f:
+ f.write(b"test")
+ fs = fsspec.filesystem(
+ impl,
+ target_protocol="ftp",
+ target_options={"host": host, "port": port, "username": user, "password": pw},
+ )
+ assert os.listdir(fs.storage[-1]) == []
+ with fs.open("/out") as f:
+ assert os.listdir(fs.storage[-1])
+ assert f.read() == b"test"
+ assert fs._metadata.cached_files[-1]["/out"]["blocks"]
+ assert fs.cat("/out") == b"test"
+ assert fs._metadata.cached_files[-1]["/out"]["blocks"] is True
+
+ with fs.open("/out", "wb") as f:
+ f.write(b"changed")
+
+ if impl == "filecache":
+ assert (
+ fs.cat("/out") == b"changed"
+ ) # new value, because we overwrote the cached location
+
+
+@pytest.mark.parametrize("impl", ["simplecache", "blockcache"])
+def test_glob(ftp_writable, impl):
+ host, port, user, pw = ftp_writable
+ fs = FTPFileSystem(host, port, user, pw)
+ with fs.open("/out", "wb") as f:
+ f.write(b"test")
+ with fs.open("/out2", "wb") as f:
+ f.write(b"test2")
+ fs = fsspec.filesystem(
+ impl,
+ target_protocol="ftp",
+ target_options={"host": host, "port": port, "username": user, "password": pw},
+ )
+ assert fs.glob("/wrong*") == []
+ assert fs.glob("/ou*") == ["/out", "/out2"]
+
+
+def test_write():
+ tmp = str(tempfile.mkdtemp())
+ fn = tmp + "afile"
+ url = f"simplecache::file://{fn}"
+ with fsspec.open(url, "wb") as f:
+ f.write(b"hello")
+ assert fn not in f.name
+ assert not os.listdir(tmp)
+
+ assert open(fn, "rb").read() == b"hello"
+
+
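
test_write above relies on the write-through behaviour of the `simplecache::` chain; a hedged sketch of the same round trip outside pytest (the target path is a throwaway temp file):

```python
import os
import tempfile

import fsspec

target = os.path.join(tempfile.mkdtemp(), "afile")

# Writing through simplecache buffers locally and uploads on close.
with fsspec.open(f"simplecache::file://{target}", "wb") as f:
    f.write(b"hello")

assert open(target, "rb").read() == b"hello"
```
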
+def test_clear():
+ import tempfile
+
+ origin = tempfile.mkdtemp()
+ cache1 = tempfile.mkdtemp()
+ data = b"test data"
+ f1 = os.path.join(origin, "afile")
+ with open(f1, "wb") as f:
+ f.write(data)
+
+ # populates first cache
+ fs = fsspec.filesystem("filecache", target_protocol="file", cache_storage=cache1)
+ assert fs.cat(f1) == data
+
+ assert "cache" in os.listdir(cache1)
+ assert len(os.listdir(cache1)) == 2
+ assert fs._check_file(f1)
+
+ fs.clear_cache()
+ assert not fs._check_file(f1)
+ assert len(os.listdir(cache1)) < 2
+
+
+@pytest.mark.parametrize("force_save_pickle", [True, False])
+def test_clear_expired(tmp_path, force_save_pickle):
+ def __ager(cache_fn, fn, del_fn=False):
+ """
+ Modify the cache file to virtually add time lag to selected files.
+
+ Parameters
+        ----------
+ cache_fn: str
+ cache path
+ fn: str
+ file name to be modified
+ del_fn: bool
+ whether or not to delete 'fn' from cache details
+ """
+ import pathlib
+ import time
+
+ if os.path.exists(cache_fn):
+ if force_save_pickle:
+ with open(cache_fn, "rb") as f:
+ cached_files = pickle.load(f)
+ else:
+ with open(cache_fn, "r") as f:
+ cached_files = json.load(f)
+ fn_posix = pathlib.Path(fn).as_posix()
+ cached_files[fn_posix]["time"] = cached_files[fn_posix]["time"] - 691200
+ assert os.access(cache_fn, os.W_OK), "Cache is not writable"
+ if del_fn:
+ del cached_files[fn_posix]["fn"]
+ if force_save_pickle:
+ with open(cache_fn, "wb") as f:
+ pickle.dump(cached_files, f)
+ else:
+ with open(cache_fn, "w") as f:
+ json.dump(cached_files, f)
+ time.sleep(1)
+
+ origin = tmp_path.joinpath("origin")
+ cache1 = tmp_path.joinpath("cache1")
+ cache2 = tmp_path.joinpath("cache2")
+ cache3 = tmp_path.joinpath("cache3")
+
+ origin.mkdir()
+ cache1.mkdir()
+ cache2.mkdir()
+ cache3.mkdir()
+
+ data = b"test data"
+ f1 = origin.joinpath("afile")
+ f2 = origin.joinpath("bfile")
+ f3 = origin.joinpath("cfile")
+ f4 = origin.joinpath("dfile")
+
+ with open(f1, "wb") as f:
+ f.write(data)
+ with open(f2, "wb") as f:
+ f.write(data)
+ with open(f3, "wb") as f:
+ f.write(data)
+ with open(f4, "wb") as f:
+ f.write(data)
+
+ # populates first cache
+ fs = fsspec.filesystem(
+ "filecache", target_protocol="file", cache_storage=str(cache1), cache_check=1
+ )
+ fs._metadata._force_save_pickle = force_save_pickle
+ assert fs.cat(str(f1)) == data
+
+ # populates "last" cache if file not found in first one
+ fs = fsspec.filesystem(
+ "filecache",
+ target_protocol="file",
+ cache_storage=[str(cache1), str(cache2)],
+ cache_check=1,
+ )
+ fs._metadata._force_save_pickle = force_save_pickle
+ assert fs.cat(str(f2)) == data
+ assert fs.cat(str(f3)) == data
+ assert len(os.listdir(cache2)) == 3
+
+ # force the expiration
+ cache_fn = os.path.join(fs.storage[-1], "cache")
+ __ager(cache_fn, f2)
+
+ # remove from cache2 the expired files
+ fs.clear_expired_cache()
+ assert len(os.listdir(cache2)) == 2
+
+ # check complete cleanup
+ __ager(cache_fn, f3)
+
+ fs.clear_expired_cache()
+ assert not fs._check_file(f2)
+ assert not fs._check_file(f3)
+ assert len(os.listdir(cache2)) < 2
+
+ # check cache1 to be untouched after cleaning
+ assert len(os.listdir(cache1)) == 2
+
+ # check cleaning with 'same_name' option enabled
+ fs = fsspec.filesystem(
+ "filecache",
+ target_protocol="file",
+ cache_storage=[str(cache1), str(cache2), str(cache3)],
+ same_names=True,
+ cache_check=1,
+ )
+ fs._metadata._force_save_pickle = force_save_pickle
+ assert fs.cat(str(f4)) == data
+
+ cache_fn = os.path.join(fs.storage[-1], "cache")
+ __ager(cache_fn, f4)
+
+ fs.clear_expired_cache()
+ assert not fs._check_file(str(f4))
+
+ # check cache metadata lacking 'fn' raises RuntimeError.
+ fs = fsspec.filesystem(
+ "filecache",
+ target_protocol="file",
+ cache_storage=str(cache1),
+ same_names=True,
+ cache_check=1,
+ )
+ fs._metadata._force_save_pickle = force_save_pickle
+ assert fs.cat(str(f1)) == data
+
+ cache_fn = os.path.join(fs.storage[-1], "cache")
+ __ager(cache_fn, f1, del_fn=True)
+
+ with pytest.raises(RuntimeError, match="Cache metadata does not contain 'fn' for"):
+ fs.clear_expired_cache()
+
+
+def test_pop():
+ import tempfile
+
+ origin = tempfile.mkdtemp()
+ cache1 = tempfile.mkdtemp()
+ cache2 = tempfile.mkdtemp()
+ data = b"test data"
+ f1 = os.path.join(origin, "afile")
+ f2 = os.path.join(origin, "bfile")
+ with open(f1, "wb") as f:
+ f.write(data)
+ with open(f2, "wb") as f:
+ f.write(data)
+
+ # populates first cache
+ fs = fsspec.filesystem("filecache", target_protocol="file", cache_storage=cache1)
+ fs.cat(f1)
+
+ # populates last cache if file not found in first cache
+ fs = fsspec.filesystem(
+ "filecache", target_protocol="file", cache_storage=[cache1, cache2]
+ )
+ assert fs.cat(f2) == data
+ assert len(os.listdir(cache2)) == 2
+ assert fs._check_file(f1)
+ with pytest.raises(PermissionError):
+ fs.pop_from_cache(f1)
+ fs.pop_from_cache(f2)
+ fs.pop_from_cache(os.path.join(origin, "uncached-file"))
+ assert len(os.listdir(cache2)) == 1
+ assert not fs._check_file(f2)
+ assert fs._check_file(f1)
+
+
+def test_write_pickle_context():
+ tmp = str(tempfile.mkdtemp())
+ fn = tmp + "afile"
+ url = f"simplecache::file://{fn}"
+ with fsspec.open(url, "wb") as f:
+ pickle.loads(pickle.dumps(f))
+ f.write(b"hello ")
+ pickle.dumps(f)
+
+ with pytest.raises(ValueError):
+ pickle.dumps(f)
+
+ assert open(fn, "rb").read() == b"hello "
+
+
+def test_blocksize(ftp_writable):
+ host, port, user, pw = ftp_writable
+ fs = FTPFileSystem(host, port, user, pw)
+ with fs.open("/out_block", "wb") as f:
+ f.write(b"test" * 4000)
+
+ fs = fsspec.filesystem(
+ "blockcache",
+ target_protocol="ftp",
+ target_options={"host": host, "port": port, "username": user, "password": pw},
+ )
+
+ with fs.open("/out_block", block_size=20) as f:
+ assert f.read(1) == b"t"
+ with pytest.raises(BlocksizeMismatchError):
+ fs.open("/out_block", block_size=30)
+
+
+def test_blockcache_multiinstance(ftp_writable):
+ host, port, user, pw = ftp_writable
+ fs = FTPFileSystem(host, port, user, pw)
+ with fs.open("/one", "wb") as f:
+ f.write(b"test" * 40)
+ with fs.open("/two", "wb") as f:
+ f.write(b"test" * 40)
+ fs = fsspec.filesystem(
+ "blockcache",
+ target_protocol="ftp",
+ target_options={"host": host, "port": port, "username": user, "password": pw},
+ )
+
+ with fs.open("/one", block_size=20) as f:
+ assert f.read(1) == b"t"
+ fs2 = fsspec.filesystem(
+ "blockcache",
+ target_protocol="ftp",
+ target_options={"host": host, "port": port, "username": user, "password": pw},
+ skip_instance_cache=True,
+ cache_storage=fs.storage,
+ )
+ assert fs2._metadata.cached_files # loaded from metadata for "one"
+ with fs2.open("/two", block_size=20) as f:
+ assert f.read(1) == b"t"
+ assert "/two" in fs2._metadata.cached_files[-1]
+ fs.save_cache()
+ assert list(fs._metadata.cached_files[-1]) == ["/one", "/two"]
+ assert list(fs2._metadata.cached_files[-1]) == ["/one", "/two"]
+
+
+def test_metadata_save_blocked(ftp_writable, caplog):
+ import logging
+
+ host, port, user, pw = ftp_writable
+ fs = FTPFileSystem(host, port, user, pw)
+ with fs.open("/one", "wb") as f:
+ f.write(b"test" * 40)
+ fs = fsspec.filesystem(
+ "blockcache",
+ target_protocol="ftp",
+ target_options={"host": host, "port": port, "username": user, "password": pw},
+ )
+
+ with fs.open("/one", block_size=20) as f:
+ assert f.read(1) == b"t"
+ fn = os.path.join(fs.storage[-1], "cache")
+ with caplog.at_level(logging.DEBUG):
+ with fs.open("/one", block_size=20) as f:
+ f.seek(21)
+ assert f.read(1)
+ os.remove(fn)
+ os.mkdir(fn)
+ assert "Cache saving failed while closing file" in caplog.text
+ os.rmdir(fn)
+
+ def open_raise(*_, **__):
+ raise NameError
+
+ try:
+ # To simulate an interpreter shutdown we temporarily set an open function in the
+ # cache_metadata module which is used on the next attempt to save metadata.
+ with caplog.at_level(logging.DEBUG):
+ with fs.open("/one", block_size=20) as f:
+ fsspec.implementations.cache_metadata.open = open_raise
+ f.seek(21)
+ assert f.read(1)
+ finally:
+ fsspec.implementations.cache_metadata.__dict__.pop("open", None)
+ assert "Cache save failed due to interpreter shutdown" in caplog.text
+
+
+@pytest.mark.parametrize("impl", ["filecache", "simplecache", "blockcache"])
+def test_local_filecache_creates_dir_if_needed(impl):
+ import tempfile
+
+ original_location = tempfile.mkdtemp()
+ cache_location = tempfile.mkdtemp()
+ os.rmdir(cache_location)
+ assert not os.path.exists(cache_location)
+
+ original_file = os.path.join(original_location, "afile")
+ data = b"test data"
+ with open(original_file, "wb") as f:
+ f.write(data)
+
+ # we can access the file and read it
+ fs = fsspec.filesystem(impl, target_protocol="file", cache_storage=cache_location)
+
+ with fs.open(original_file, "rb") as f:
+ data_in_cache = f.read()
+
+ assert os.path.exists(cache_location)
+
+ assert data_in_cache == data
+
+
+@pytest.mark.parametrize("toplevel", [True, False])
+@pytest.mark.parametrize("impl", ["filecache", "simplecache", "blockcache"])
+def test_get_mapper(impl, toplevel):
+ import tempfile
+
+ original_location = tempfile.mkdtemp()
+ cache_location = tempfile.mkdtemp()
+ os.rmdir(cache_location)
+ original_file = os.path.join(original_location, "afile")
+ data = b"test data"
+ with open(original_file, "wb") as f:
+ f.write(data)
+
+ if toplevel:
+ m = fsspec.get_mapper(
+ f"{impl}::file://{original_location}",
+ **{impl: {"cache_storage": cache_location}},
+ )
+ else:
+ fs = fsspec.filesystem(
+ impl, target_protocol="file", cache_storage=cache_location
+ )
+ m = fs.get_mapper(original_location)
+
+ assert m["afile"] == data
+ assert os.listdir(cache_location)
+ assert m["afile"] == data
+
+
+def test_local_filecache_basic(local_filecache):
+ data, original_file, cache_location, fs = local_filecache
+
+ # reading from the file contains the right data
+ with fs.open(original_file, "rb") as f:
+ assert f.read() == data
+ assert "cache" in os.listdir(cache_location)
+
+ # the file in the location contains the right data
+ fn = list(fs._metadata.cached_files[-1].values())[0]["fn"] # this is a hash value
+ assert fn in os.listdir(cache_location)
+ with open(os.path.join(cache_location, fn), "rb") as f:
+ assert f.read() == data
+
+ # still there when original file is removed (check=False)
+ os.remove(original_file)
+ with fs.open(original_file, "rb") as f:
+ assert f.read() == data
+
+
+def test_local_filecache_does_not_change_when_original_data_changed(local_filecache):
+ old_data, original_file, cache_location, fs = local_filecache
+ new_data = b"abc"
+
+ with fs.open(original_file, "rb") as f:
+ assert f.read() == old_data
+
+ with open(original_file, "wb") as f:
+ f.write(new_data)
+
+ with fs.open(original_file, "rb") as f:
+ assert f.read() == old_data
+
+
+def test_local_filecache_gets_from_original_if_cache_deleted(local_filecache):
+ old_data, original_file, cache_location, fs = local_filecache
+ new_data = b"abc"
+
+ with fs.open(original_file, "rb") as f:
+ assert f.read() == old_data
+
+ with open(original_file, "wb") as f:
+ f.write(new_data)
+
+ shutil.rmtree(cache_location)
+ assert os.path.exists(original_file)
+
+ with open(original_file, "rb") as f:
+ assert f.read() == new_data
+
+ with fs.open(original_file, "rb") as f:
+ assert f.read() == new_data
+
+ # the file in the location contains the right data
+ fn = list(fs._metadata.cached_files[-1].values())[0]["fn"] # this is a hash value
+ assert fn in os.listdir(cache_location)
+ with open(os.path.join(cache_location, fn), "rb") as f:
+ assert f.read() == new_data
+
+
+def test_local_filecache_with_new_cache_location_makes_a_new_copy(local_filecache):
+ import tempfile
+
+ data, original_file, old_cache_location, old_fs = local_filecache
+ new_cache_location = tempfile.mkdtemp()
+
+ with old_fs.open(original_file, "rb") as f:
+ assert f.read() == data
+
+ new_fs = fsspec.filesystem(
+ "filecache", target_protocol="file", cache_storage=new_cache_location
+ )
+
+ with new_fs.open(original_file, "rb") as f:
+ assert f.read() == data
+
+ # the file in the location contains the right data
+ fn = list(new_fs._metadata.cached_files[-1].values())[0][
+ "fn"
+ ] # this is a hash value
+ assert fn in os.listdir(old_cache_location)
+ assert fn in os.listdir(new_cache_location)
+
+ with open(os.path.join(new_cache_location, fn), "rb") as f:
+ assert f.read() == data
+
+
+def test_filecache_multicache():
+ import tempfile
+
+ origin = tempfile.mkdtemp()
+ cache1 = tempfile.mkdtemp()
+ cache2 = tempfile.mkdtemp()
+ data = b"test data"
+ f1 = os.path.join(origin, "afile")
+ f2 = os.path.join(origin, "bfile")
+ with open(f1, "wb") as f:
+ f.write(data)
+ with open(f2, "wb") as f:
+ f.write(data * 2)
+
+ # populates first cache
+ fs = fsspec.filesystem("filecache", target_protocol="file", cache_storage=cache1)
+ assert fs.cat(f1) == data
+
+ assert len(os.listdir(cache1)) == 2 # cache and hashed afile
+ assert len(os.listdir(cache2)) == 0 # hasn't been initialized yet
+
+ # populates last cache if file not found in first cache
+ fs = fsspec.filesystem(
+ "filecache", target_protocol="file", cache_storage=[cache1, cache2]
+ )
+
+ assert fs.cat(f1) == data
+ assert fs.cat(f2) == data * 2
+
+ assert "cache" in os.listdir(cache1)
+ assert "cache" in os.listdir(cache2)
+
+ cache1_contents = [f for f in os.listdir(cache1) if f != "cache"]
+ assert len(cache1_contents) == 1
+
+ with open(os.path.join(cache1, cache1_contents[0]), "rb") as f:
+ assert f.read() == data
+
+ cache2_contents = [f for f in os.listdir(cache2) if f != "cache"]
+ assert len(cache2_contents) == 1
+
+ with open(os.path.join(cache2, cache2_contents[0]), "rb") as f:
+ assert f.read() == data * 2
+
+
+@pytest.mark.parametrize("impl", ["filecache", "simplecache"])
+def test_filecache_multicache_with_same_file_different_data_reads_from_first(impl):
+ import tempfile
+
+ origin = tempfile.mkdtemp()
+ cache1 = tempfile.mkdtemp()
+ cache2 = tempfile.mkdtemp()
+ data = b"test data"
+ f1 = os.path.join(origin, "afile")
+ with open(f1, "wb") as f:
+ f.write(data)
+
+ # populate first cache
+ fs1 = fsspec.filesystem(impl, target_protocol="file", cache_storage=cache1)
+ assert fs1.cat(f1) == data
+
+ with open(f1, "wb") as f:
+ f.write(data * 2)
+
+ # populate second cache
+ fs2 = fsspec.filesystem(impl, target_protocol="file", cache_storage=cache2)
+
+ assert fs2.cat(f1) == data * 2
+
+ # the filenames in each cache are the same, but the data is different
+ assert sorted(os.listdir(cache1)) == sorted(os.listdir(cache2))
+
+ fs = fsspec.filesystem(impl, target_protocol="file", cache_storage=[cache1, cache2])
+
+ assert fs.cat(f1) == data
+
+
+def test_filecache_with_checks():
+ import time
+
+ origin = tempfile.mkdtemp()
+ cache1 = tempfile.mkdtemp()
+ data = b"test data"
+ f1 = os.path.join(origin, "afile")
+ with open(f1, "wb") as f:
+ f.write(data)
+
+ # populate first cache
+ fs = fsspec.filesystem(
+ "filecache", target_protocol="file", cache_storage=cache1, expiry_time=0.1
+ )
+ fs2 = fsspec.filesystem(
+ "filecache", target_protocol="file", cache_storage=cache1, check_files=True
+ )
+ assert fs.cat(f1) == data
+ assert fs2.cat(f1) == data
+
+ with open(f1, "wb") as f:
+ f.write(data * 2)
+
+ assert fs.cat(f1) == data # does not change
+ assert fs2.cat(f1) == data * 2 # changed, since origin changed
+ with fs2.open(f1) as f:
+ assert f.read() == data * 2 # read also sees new data
+ time.sleep(0.11) # allow cache details to expire
+ assert fs.cat(f1) == data * 2 # changed, since origin changed
+
+
+@pytest.mark.parametrize("impl", ["filecache", "simplecache", "blockcache"])
+@pytest.mark.parametrize("fs", ["local", "multi"], indirect=["fs"])
+def test_filecache_takes_fs_instance(impl, fs):
+ origin = tempfile.mkdtemp()
+ data = b"test data"
+ f1 = os.path.join(origin, "afile")
+ with open(f1, "wb") as f:
+ f.write(data)
+
+ fs2 = fsspec.filesystem(impl, fs=fs)
+
+ assert fs2.cat(f1) == data
+
+
+@pytest.mark.parametrize("impl", ["filecache", "simplecache", "blockcache"])
+@pytest.mark.parametrize("fs", ["local", "multi"], indirect=["fs"])
+def test_filecache_serialization(impl, fs):
+ fs1 = fsspec.filesystem(impl, fs=fs)
+ json1 = fs1.to_json()
+
+ assert fs1 is fsspec.AbstractFileSystem.from_json(json1)
+
+
+def test_add_file_to_cache_after_save(local_filecache):
+ (data, original_file, cache_location, fs) = local_filecache
+
+ fs.save_cache()
+
+ fs.cat(original_file)
+ assert len(fs._metadata.cached_files[-1]) == 1
+
+ fs.save_cache()
+
+ fs2 = fsspec.filesystem(
+ "filecache",
+ target_protocol="file",
+ cache_storage=cache_location,
+        # arbitrary kwarg to force a fresh instance; instance caching would mask the issue
+        do_not_use_cache_for_this_instance=True,
+ )
+ assert len(fs2._metadata.cached_files[-1]) == 1
+
+
+def test_cached_open_close_read(ftp_writable):
+ # Regression test for <https://github.com/fsspec/filesystem_spec/issues/799>
+ host, port, user, pw = ftp_writable
+ fs = FTPFileSystem(host, port, user, pw)
+ with fs.open("/out_block", "wb") as f:
+ f.write(b"test" * 4000)
+ fs = fsspec.filesystem(
+ "cached",
+ target_protocol="ftp",
+ target_options={"host": host, "port": port, "username": user, "password": pw},
+ )
+ with fs.open("/out_block", block_size=1024) as f:
+ pass
+ with fs.open("/out_block", block_size=1024) as f:
+ assert f.read(1) == b"t"
+ # Regression test for <https://github.com/fsspec/filesystem_spec/issues/845>
+ assert fs._metadata.cached_files[-1]["/out_block"]["blocks"] == {0}
+
+
+@pytest.mark.parametrize("impl", ["filecache", "simplecache"])
+@pytest.mark.parametrize("compression", ["gzip", "bz2"])
+def test_with_compression(impl, compression):
+ data = b"123456789"
+ tempdir = tempfile.mkdtemp()
+ cachedir = tempfile.mkdtemp()
+ fn = os.path.join(tempdir, "data")
+ f = compr[compression](open(fn, mode="wb"), mode="w")
+ f.write(data)
+ f.close()
+
+ with fsspec.open(
+ f"{impl}::{fn}",
+ "rb",
+ compression=compression,
+ **{impl: {"same_names": True, "cache_storage": cachedir}},
+ ) as f:
+ # stores original compressed file, uncompress on read
+ assert f.read() == data
+ assert "data" in os.listdir(cachedir)
+ assert open(os.path.join(cachedir, "data"), "rb").read() != data
+
+ cachedir = tempfile.mkdtemp()
+
+ with fsspec.open(
+ f"{impl}::{fn}",
+ "rb",
+ **{
+ impl: {
+ "same_names": True,
+ "compression": compression,
+ "cache_storage": cachedir,
+ }
+ },
+ ) as f:
+ # stores uncompressed data
+ assert f.read() == data
+ assert "data" in os.listdir(cachedir)
+ assert open(os.path.join(cachedir, "data"), "rb").read() == data
+
+
+@pytest.mark.parametrize("protocol", ["simplecache", "filecache"])
+def test_again(protocol):
+ fn = "memory://afile"
+ with fsspec.open(fn, "wb") as f:
+ f.write(b"hello")
+ d2 = tempfile.mkdtemp()
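+    # open_local should materialise the remote file inside the cache directory
+    # and hand back a plain local path pointing at it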
+ lurl = fsspec.open_local(f"{protocol}::{fn}", **{protocol: {"cache_storage": d2}})
+ assert os.path.exists(lurl)
+ assert d2 in lurl
+ assert open(lurl, "rb").read() == b"hello"
+
+ # remove cache dir
+ shutil.rmtree(d2)
+ assert not os.path.exists(lurl)
+
+ # gets recreated
+ lurl = fsspec.open_local(f"{protocol}::{fn}", **{protocol: {"cache_storage": d2}})
+ assert open(lurl, "rb").read() == b"hello"
+
+
+@pytest.mark.parametrize("protocol", ["simplecache", "filecache"])
+def test_multi_cache(protocol):
+ with fsspec.open_files("memory://file*", "wb", num=2) as files:
+ for f in files:
+ f.write(b"hello")
+
+ d2 = tempfile.mkdtemp()
+ lurl = fsspec.open_local(
+ f"{protocol}::memory://file*",
+ mode="rb",
+ **{protocol: {"cache_storage": d2, "same_names": True}},
+ )
+ assert all(d2 in u for u in lurl)
+ assert all(os.path.basename(f) in ["file0", "file1"] for f in lurl)
+ assert all(open(u, "rb").read() == b"hello" for u in lurl)
+
+ d2 = tempfile.mkdtemp()
+ lurl = fsspec.open_files(
+ f"{protocol}::memory://file*",
+ mode="rb",
+ **{protocol: {"cache_storage": d2, "same_names": True}},
+ )
+ with lurl as files:
+ for f in files:
+ assert os.path.basename(f.name) in ["file0", "file1"]
+ assert f.read() == b"hello"
+ fs = fsspec.filesystem("memory")
+ fs.store.clear()
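+    # with the source files gone from the memory store, the locally cached
+    # copies should still serve the same data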
+ with lurl as files:
+ for f in files:
+ assert os.path.basename(f.name) in ["file0", "file1"]
+ assert f.read() == b"hello"
+
+
+@pytest.mark.parametrize("protocol", ["simplecache", "filecache", "blockcache"])
+def test_multi_cat(protocol, ftp_writable):
+ host, port, user, pw = ftp_writable
+ fs = FTPFileSystem(host, port, user, pw)
+ for fn in ("/file0", "/file1"):
+ with fs.open(fn, "wb") as f:
+ f.write(b"hello")
+
+ d2 = tempfile.mkdtemp()
+ fs = fsspec.filesystem(protocol, storage=d2, fs=fs)
+ assert fs.cat("file*") == {"/file0": b"hello", "/file1": b"hello"}
+
+
+@pytest.mark.parametrize("protocol", ["simplecache", "filecache"])
+def test_multi_cache_chain(protocol):
+ import zipfile
+
+ d = tempfile.mkdtemp()
+ fn = os.path.join(d, "test.zip")
+ zipfile.ZipFile(fn, mode="w").open("test", "w").write(b"hello")
+
+ with fsspec.open_files(f"zip://test::{protocol}::file://{fn}") as files:
+ assert d not in files[0]._fileobj._file.name
+ assert files[0].read() == b"hello"
+
+    # edge case: the URL chain contains the literal string "file:" in several places
+ fn = os.path.join(d, "file.zip")
+ zipfile.ZipFile(fn, mode="w").open("file", "w").write(b"hello")
+ with fsspec.open_files(f"zip://file::{protocol}::file://{fn}") as files:
+ assert d not in files[0]._fileobj._file.name
+ assert files[0].read() == b"hello"
+
+
+@pytest.mark.parametrize("protocol", ["blockcache", "simplecache", "filecache"])
+def test_strip(protocol):
+ fs = fsspec.filesystem(protocol, target_protocol="memory")
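+    # every URL form below, with or without the cache protocol prefix, should
+    # strip down to the same path on the target filesystem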
+ url1 = "memory://afile"
+ assert fs._strip_protocol(url1) == "/afile"
+ assert fs._strip_protocol(protocol + "://afile") == "/afile"
+ assert fs._strip_protocol(protocol + "::memory://afile") == "/afile"
+
+
+@pytest.mark.parametrize("protocol", ["simplecache", "filecache"])
+def test_cached_write(protocol):
+ d = tempfile.mkdtemp()
+ ofs = fsspec.open_files(f"{protocol}::file://{d}/*.out", mode="wb", num=2)
+ with ofs as files:
+ for f in files:
+ assert isinstance(f, LocalTempFile)
+ f.write(b"data")
+ fn = f.name
+
+ assert sorted(os.listdir(d)) == ["0.out", "1.out"]
+ assert not os.path.exists(fn)
+
+
+def test_expiry():
+ import time
+
+ d = tempfile.mkdtemp()
+ fs = fsspec.filesystem("memory")
+ fn = "/afile"
+ fn0 = "memory://afile"
+ data = b"hello"
+ with fs.open(fn0, "wb") as f:
+ f.write(data)
+
+ fs = fsspec.filesystem(
+ "filecache",
+ fs=fs,
+ cache_storage=d,
+ check_files=False,
+ expiry_time=0.1,
+ same_names=True,
+ )
+
+ # get file
+ assert fs._check_file(fn0) is False
+ assert fs.open(fn0, mode="rb").read() == data
+ start_time = fs._metadata.cached_files[-1][fn]["time"]
+
+    # the cache's last-save time should be close to when the file was cached
+    assert fs.last_cache - start_time < 0.19
+
+    # wait briefly, staying well within the expiry time
+    time.sleep(0.01)
+
+ # file should still be valid... re-read
+ assert fs.open(fn0, mode="rb").read() == data
+ detail, _ = fs._check_file(fn0)
+ assert detail["time"] == start_time
+
+ time.sleep(0.11)
+    # past the expiry time, the cache entry should now be invalid... re-read
+ assert fs._check_file(fn0) is False
+ assert fs.open(fn0, mode="rb").read() == data
+ detail, _ = fs._check_file(fn0)
+ assert detail["time"] - start_time > 0.09
+
+
def test_equality(tmpdir):
"""Test sane behaviour for equality and hashing.
@@ -25,9 +1137,193 @@ def test_equality(tmpdir):
Related: GitHub#577, GitHub#578
"""
- pass
+ from fsspec.implementations.local import LocalFileSystem
+
+ lfs = LocalFileSystem()
+ dir1 = f"{tmpdir}/raspberry"
+ dir2 = f"{tmpdir}/banana"
+ cfs1 = CachingFileSystem(fs=lfs, cache_storage=dir1)
+ cfs2 = CachingFileSystem(fs=lfs, cache_storage=dir2)
+ cfs3 = CachingFileSystem(fs=lfs, cache_storage=dir2)
+ assert cfs1 == cfs1
+ assert cfs1 != cfs2
+ assert cfs1 != cfs3
+ assert cfs2 == cfs3
+ assert cfs1 != lfs
+ assert cfs2 != lfs
+ assert cfs3 != lfs
+ assert hash(lfs) != hash(cfs1)
+ assert hash(lfs) != hash(cfs2)
+ assert hash(lfs) != hash(cfs3)
+    assert hash(cfs1) != hash(cfs2)
+    assert hash(cfs1) != hash(cfs3)
+ assert hash(cfs2) == hash(cfs3)


def test_str():
"""Test that the str representation refers to correct class."""
- pass
+ from fsspec.implementations.local import LocalFileSystem
+
+ lfs = LocalFileSystem()
+ cfs = CachingFileSystem(fs=lfs)
+ assert "CachingFileSystem" in str(cfs)
+
+
+def test_getitems_errors(tmpdir):
+ tmpdir = str(tmpdir)
+ os.makedirs(os.path.join(tmpdir, "afolder"))
+ open(os.path.join(tmpdir, "afile"), "w").write("test")
+ open(os.path.join(tmpdir, "afolder", "anotherfile"), "w").write("test2")
+ m = fsspec.get_mapper(f"file://{tmpdir}")
+ assert m.getitems(["afile", "bfile"], on_error="omit") == {"afile": b"test"}
+
+    # the same behaviour should hold when the mapper goes through a cache layer
+    m2 = fsspec.get_mapper(f"simplecache::file://{tmpdir}")
+    assert m2.getitems(["afile"], on_error="omit") == {"afile": b"test"}
+    assert m2.getitems(["afile", "bfile"], on_error="omit") == {
+        "afile": b"test"
+    }  # the missing key is omitted rather than raising KeyError
+
+ with pytest.raises(KeyError):
+ m.getitems(["afile", "bfile"])
+ out = m.getitems(["afile", "bfile"], on_error="return")
+ assert isinstance(out["bfile"], KeyError)
+ m = fsspec.get_mapper(f"file://{tmpdir}", missing_exceptions=())
+ assert m.getitems(["afile", "bfile"], on_error="omit") == {"afile": b"test"}
+ with pytest.raises(FileNotFoundError):
+ m.getitems(["afile", "bfile"])
+
+
+@pytest.mark.parametrize("temp_cache", [False, True])
+def test_cache_dir_auto_deleted(temp_cache, tmpdir):
+ import gc
+
+ source = os.path.join(tmpdir, "source")
+ afile = os.path.join(source, "afile")
+ os.mkdir(source)
+ open(afile, "w").write("test")
+
+ fs = fsspec.filesystem(
+ "filecache",
+ target_protocol="file",
+ cache_storage="TMP" if temp_cache else os.path.join(tmpdir, "cache"),
+ skip_instance_cache=True, # Important to avoid fs itself being cached
+ )
+
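+    # cache_storage="TMP" requests a temporary cache directory tied to this
+    # instance, while a named directory is expected to outlive it (checked below)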
+ cache_dir = fs.storage[-1]
+
+ # Force cache to be created
+ with fs.open(afile, "rb") as f:
+ assert f.read(5) == b"test"
+
+ # Confirm cache exists
+ local = fsspec.filesystem("file")
+ assert local.exists(cache_dir)
+
+ # Delete file system
+ del fs
+ gc.collect()
+
+ # Ensure cache has been deleted, if it is temporary
+ if temp_cache:
+ assert not local.exists(cache_dir)
+ else:
+ assert local.exists(cache_dir)
+
+
+@pytest.mark.parametrize("protocol", ["filecache", "blockcache", "simplecache"])
+def test_cache_size(tmpdir, protocol):
+ if win and protocol == "blockcache":
+ pytest.skip("Windows file locking affects blockcache size tests")
+
+ source = os.path.join(tmpdir, "source")
+ afile = os.path.join(source, "afile")
+ os.mkdir(source)
+ open(afile, "w").write("test")
+
+ fs = fsspec.filesystem(protocol, target_protocol="file")
+ empty_cache_size = fs.cache_size()
+
+ # Create cache
+ with fs.open(afile, "rb") as f:
+ assert f.read(5) == b"test"
+ single_file_cache_size = fs.cache_size()
+ assert single_file_cache_size > empty_cache_size
+
+ # Remove cached file but leave cache metadata file
+ fs.pop_from_cache(afile)
+ if win and protocol == "filecache":
+ assert empty_cache_size < fs.cache_size()
+ elif protocol != "simplecache":
+ assert empty_cache_size < fs.cache_size() < single_file_cache_size
+ else:
+ # simplecache never stores metadata
+ assert fs.cache_size() == single_file_cache_size
+
+ # Completely remove cache
+ fs.clear_cache()
+ if protocol != "simplecache":
+ assert fs.cache_size() == empty_cache_size
+ else:
+ # Whole cache directory has been deleted
+ assert fs.cache_size() == 0
+
+
+def test_spurious_directory_issue1410(tmpdir):
+ import zipfile
+
+ os.chdir(tmpdir)
+ zipfile.ZipFile("dir.zip", mode="w").open("file.txt", "w").write(b"hello")
+ fs = WholeFileCacheFileSystem(fs=ZipFileSystem("dir.zip"))
+
+ assert len(os.listdir()) == 1
+ with fs.open("/file.txt", "rb"):
+ pass
+
+ # There was a bug reported in issue #1410 in which a directory
+ # would be created and the next assertion would fail.
+ assert len(os.listdir()) == 1
+ assert fs._parent("/any/path") == "any" # correct for ZIP, which has no leading /
+
+
+def test_write_transaction(tmpdir, m, monkeypatch):
+ called = [0]
+ orig = m.put
+
+ def patched_put(*args, **kwargs):
+ called[0] += 1
+ orig(*args, **kwargs)
+
+ monkeypatch.setattr(m, "put", patched_put)
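+    # count uploads to the target filesystem: the whole transaction below is
+    # expected to be committed with a single put call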
+ tmpdir = str(tmpdir)
+ fs, _ = fsspec.core.url_to_fs("simplecache::memory://", cache_storage=tmpdir)
+ with fs.transaction:
+ fs.pipe("myfile", b"1")
+ fs.pipe("otherfile", b"2")
+ fs.pipe("deep/dir/otherfile", b"3")
+ with fs.open("blarh", "wb") as f:
+ f.write(b"ff")
+ assert not m.find("")
+
+ assert fs.info("otherfile")["size"] == 1
+ assert fs.info("deep")["type"] == "directory"
+ assert fs.isdir("deep")
+ assert fs.ls("deep", detail=False) == ["/deep/dir"]
+
+ assert m.cat("myfile") == b"1"
+ assert m.cat("otherfile") == b"2"
+ assert called[0] == 1 # copy was done in one go
+
+
+def test_filecache_write(tmpdir, m):
+ fs = fsspec.filesystem(
+ "filecache", target_protocol="memory", cache_storage=str(tmpdir)
+ )
+ fn = "sample_file_in_mem.txt"
+ data = "hello world from memory"
+ with fs.open(fn, "w") as f:
+ assert not m.exists(fn)
+ f.write(data)
+
+ assert m.cat(fn) == data.encode()
+ assert fs.cat(fn) == data.encode()
diff --git a/fsspec/implementations/tests/test_common.py b/fsspec/implementations/tests/test_common.py
index 3f44554..f09f13c 100644
--- a/fsspec/implementations/tests/test_common.py
+++ b/fsspec/implementations/tests/test_common.py
@@ -1,5 +1,35 @@
import datetime
import time
+
import pytest
+
from fsspec import AbstractFileSystem
from fsspec.implementations.tests.conftest import READ_ONLY_FILESYSTEMS
+
+
+@pytest.mark.parametrize("fs", ["local"], indirect=["fs"])
+def test_created(fs: AbstractFileSystem, temp_file):
+ try:
+ fs.touch(temp_file)
+ created = fs.created(path=temp_file)
+ assert isinstance(created, datetime.datetime)
+ finally:
+ if not isinstance(fs, tuple(READ_ONLY_FILESYSTEMS)):
+ fs.rm(temp_file)
+
+
+@pytest.mark.parametrize("fs", ["local", "memory", "arrow"], indirect=["fs"])
+def test_modified(fs: AbstractFileSystem, temp_file):
+ try:
+ fs.touch(temp_file)
+        # pyarrow only exposes a modification time, so use the current UTC time
+        # in place of fs.created(path=temp_file)
+        created = datetime.datetime.now(tz=datetime.timezone.utc)
+ time.sleep(0.05)
+ fs.touch(temp_file)
+ modified = fs.modified(path=temp_file)
+ assert isinstance(modified, datetime.datetime)
+ assert modified > created
+ finally:
+ fs.rm(temp_file)
diff --git a/fsspec/implementations/tests/test_dask.py b/fsspec/implementations/tests/test_dask.py
index cb460c2..13756d9 100644
--- a/fsspec/implementations/tests/test_dask.py
+++ b/fsspec/implementations/tests/test_dask.py
@@ -1,3 +1,29 @@
import pytest
+
import fsspec
-pytest.importorskip('distributed')
+
+pytest.importorskip("distributed")
+
+
+@pytest.fixture()
+def cli(tmpdir):
+ import dask.distributed
+
+ client = dask.distributed.Client(n_workers=1)
+
+ def setup():
+ m = fsspec.filesystem("memory")
+ with m.open("afile", "wb") as f:
+ f.write(b"data")
+
+ client.run(setup)
+ try:
+ yield client
+ finally:
+ client.shutdown()
+
+
+def test_basic(cli):
+ fs = fsspec.filesystem("dask", target_protocol="memory")
+ assert fs.ls("", detail=False) == ["/afile"]
+ assert fs.cat("/afile") == b"data"
diff --git a/fsspec/implementations/tests/test_data.py b/fsspec/implementations/tests/test_data.py
index 29bbf8e..ea99dc9 100644
--- a/fsspec/implementations/tests/test_data.py
+++ b/fsspec/implementations/tests/test_data.py
@@ -1 +1,20 @@
import fsspec
+
+
+def test_1():
+ with fsspec.open("data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==") as f:
+ assert f.read() == b"Hello, World!"
+
+ with fsspec.open("data:,Hello%2C%20World%21") as f:
+ assert f.read() == b"Hello, World!"
+
+
+def test_info():
+ fs = fsspec.filesystem("data")
+ info = fs.info("data:text/html,%3Ch1%3EHello%2C%20World%21%3C%2Fh1%3E")
+ assert info == {
+ "name": "%3Ch1%3EHello%2C%20World%21%3C%2Fh1%3E",
+ "size": 22,
+ "type": "file",
+ "mimetype": "text/html",
+ }
diff --git a/fsspec/implementations/tests/test_dbfs.py b/fsspec/implementations/tests/test_dbfs.py
index 18a8bb2..66475b2 100644
--- a/fsspec/implementations/tests/test_dbfs.py
+++ b/fsspec/implementations/tests/test_dbfs.py
@@ -21,20 +21,25 @@ you need to re-record the answers. This can be done as follows:
5. Now execute the tests as normal. The results of the API calls will be recorded.
6. Unset the environment variables and replay the tests.
"""
+
import os
import sys
from urllib.parse import urlparse
+
import numpy
import pytest
+
import fsspec
+
if sys.version_info >= (3, 10):
- pytest.skip('These tests need to be re-recorded.', allow_module_level=True)
-DUMMY_INSTANCE = 'my_instance.com'
-INSTANCE = os.getenv('DBFS_INSTANCE', DUMMY_INSTANCE)
-TOKEN = os.getenv('DBFS_TOKEN', '')
+ pytest.skip("These tests need to be re-recorded.", allow_module_level=True)
+DUMMY_INSTANCE = "my_instance.com"
+INSTANCE = os.getenv("DBFS_INSTANCE", DUMMY_INSTANCE)
+TOKEN = os.getenv("DBFS_TOKEN", "")
-@pytest.fixture(scope='module')
+
+@pytest.fixture(scope="module")
def vcr_config():
"""
To not record information in the instance and token details
@@ -45,4 +50,219 @@ def vcr_config():
If the DBFS_TOKEN env variable is set, we record with VCR.
If not, we only replay (to not accidentally record with a wrong URL).
"""
- pass
+
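+    # the hooks below scrub identifying details from the cassettes: response
+    # headers that reveal the workspace are dropped and the request host is
+    # rewritten to the dummy instance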
+ def before_record_response(response):
+ try:
+ del response["headers"]["x-databricks-org-id"]
+ del response["headers"]["date"]
+ except KeyError:
+ pass
+ return response
+
+ def before_record_request(request):
+ # Replace the instance URL
+ uri = urlparse(request.uri)
+ uri = uri._replace(netloc=DUMMY_INSTANCE)
+ request.uri = uri.geturl()
+
+ return request
+
+ if TOKEN:
+ return {
+ "record_mode": "once",
+ "filter_headers": [("authorization", "DUMMY")],
+ "before_record_response": before_record_response,
+ "before_record_request": before_record_request,
+ }
+ else:
+ return {
+ "record_mode": "none",
+ }
+
+
+@pytest.fixture
+def dbfsFS():
+ fs = fsspec.filesystem("dbfs", instance=INSTANCE, token=TOKEN)
+
+ return fs
+
+
+@pytest.fixture
+def make_mock_diabetes_ds():
+ pa = pytest.importorskip("pyarrow")
+
+ names = [
+ "Pregnancies",
+ "Glucose",
+ "BloodPressure",
+ "SkinThickness",
+ "Insulin",
+ "BMI",
+ "DiabetesPedigreeFunction",
+ "Age",
+ "Outcome",
+ ]
+ pregnancies = pa.array(numpy.random.randint(low=0, high=17, size=25))
+ glucose = pa.array(numpy.random.randint(low=0, high=199, size=25))
+ blood_pressure = pa.array(numpy.random.randint(low=0, high=122, size=25))
+ skin_thickness = pa.array(numpy.random.randint(low=0, high=99, size=25))
+ insulin = pa.array(numpy.random.randint(low=0, high=846, size=25))
+ bmi = pa.array(numpy.random.uniform(0.0, 67.1, size=25))
+ diabetes_pedigree_function = pa.array(numpy.random.uniform(0.08, 2.42, size=25))
+ age = pa.array(numpy.random.randint(low=21, high=81, size=25))
+ outcome = pa.array(numpy.random.randint(low=0, high=1, size=25))
+
+ return pa.Table.from_arrays(
+ arrays=[
+ pregnancies,
+ glucose,
+ blood_pressure,
+ skin_thickness,
+ insulin,
+ bmi,
+ diabetes_pedigree_function,
+ age,
+ outcome,
+ ],
+ names=names,
+ )
+
+
+@pytest.mark.vcr()
+def test_dbfs_file_listing(dbfsFS):
+ assert "/FileStore" in dbfsFS.ls("/", detail=False)
+ assert {"name": "/FileStore", "size": 0, "type": "directory"} in dbfsFS.ls(
+ "/", detail=True
+ )
+
+
+@pytest.mark.vcr()
+def test_dbfs_mkdir(dbfsFS):
+ dbfsFS.rm("/FileStore/my", recursive=True)
+ assert "/FileStore/my" not in dbfsFS.ls("/FileStore/", detail=False)
+
+ dbfsFS.mkdir("/FileStore/my/dir", create_parents=True)
+
+ assert "/FileStore/my" in dbfsFS.ls("/FileStore/", detail=False)
+ assert "/FileStore/my/dir" in dbfsFS.ls("/FileStore/my/", detail=False)
+
+ with pytest.raises(FileExistsError):
+ dbfsFS.mkdir("/FileStore/my/dir", create_parents=True, exist_ok=False)
+
+ with pytest.raises(OSError):
+ dbfsFS.rm("/FileStore/my", recursive=False)
+
+ assert "/FileStore/my" in dbfsFS.ls("/FileStore/", detail=False)
+
+ dbfsFS.rm("/FileStore/my", recursive=True)
+ assert "/FileStore/my" not in dbfsFS.ls("/FileStore/", detail=False)
+
+
+@pytest.mark.vcr()
+def test_dbfs_write_and_read(dbfsFS):
+ dbfsFS.rm("/FileStore/file.csv")
+ assert "/FileStore/file.csv" not in dbfsFS.ls("/FileStore/", detail=False)
+
+ content = b"This is a test\n" * 100000 + b"For this is the end\n"
+
+ with dbfsFS.open("/FileStore/file.csv", "wb") as f:
+ f.write(content)
+
+ assert "/FileStore/file.csv" in dbfsFS.ls("/FileStore", detail=False)
+
+ with dbfsFS.open("/FileStore/file.csv", "rb") as f:
+ data = f.read()
+ assert data == content
+ dbfsFS.rm("/FileStore/file.csv")
+ assert "/FileStore/file.csv" not in dbfsFS.ls("/FileStore/", detail=False)
+
+
+@pytest.mark.vcr()
+def test_dbfs_read_range(dbfsFS):
+ dbfsFS.rm("/FileStore/file.txt")
+ assert "/FileStore/file.txt" not in dbfsFS.ls("/FileStore/", detail=False)
+ content = b"This is a test\n"
+ with dbfsFS.open("/FileStore/file.txt", "wb") as f:
+ f.write(content)
+ assert "/FileStore/file.txt" in dbfsFS.ls("/FileStore", detail=False)
+ assert dbfsFS.cat_file("/FileStore/file.txt", start=8, end=14) == content[8:14]
+ dbfsFS.rm("/FileStore/file.txt")
+ assert "/FileStore/file.txt" not in dbfsFS.ls("/FileStore/", detail=False)
+
+
+@pytest.mark.vcr()
+def test_dbfs_read_range_chunked(dbfsFS):
+ dbfsFS.rm("/FileStore/large_file.txt")
+ assert "/FileStore/large_file.txt" not in dbfsFS.ls("/FileStore/", detail=False)
+ content = b"This is a test\n" * (1 * 2**18) + b"For this is the end\n"
+ with dbfsFS.open("/FileStore/large_file.txt", "wb") as f:
+ f.write(content)
+ assert "/FileStore/large_file.txt" in dbfsFS.ls("/FileStore", detail=False)
+ assert dbfsFS.cat_file("/FileStore/large_file.txt", start=8) == content[8:]
+ dbfsFS.rm("/FileStore/large_file.txt")
+ assert "/FileStore/large_file.txt" not in dbfsFS.ls("/FileStore/", detail=False)
+
+
+@pytest.mark.vcr()
+def test_dbfs_write_pyarrow_non_partitioned(dbfsFS, make_mock_diabetes_ds):
+ pytest.importorskip("pyarrow.dataset")
+ pq = pytest.importorskip("pyarrow.parquet")
+
+ dbfsFS.rm("/FileStore/pyarrow", recursive=True)
+ assert "/FileStore/pyarrow" not in dbfsFS.ls("/FileStore/", detail=False)
+
+ pq.write_to_dataset(
+ make_mock_diabetes_ds,
+ filesystem=dbfsFS,
+ compression="none",
+ existing_data_behavior="error",
+ root_path="/FileStore/pyarrow/diabetes",
+ use_threads=False,
+ )
+
+ assert len(dbfsFS.ls("/FileStore/pyarrow/diabetes", detail=False)) == 1
+ assert (
+ "/FileStore/pyarrow/diabetes"
+ in dbfsFS.ls("/FileStore/pyarrow/diabetes", detail=False)[0]
+ and ".parquet" in dbfsFS.ls("/FileStore/pyarrow/diabetes", detail=False)[0]
+ )
+
+ dbfsFS.rm("/FileStore/pyarrow", recursive=True)
+ assert "/FileStore/pyarrow" not in dbfsFS.ls("/FileStore/", detail=False)
+
+
+@pytest.mark.vcr()
+def test_dbfs_read_pyarrow_non_partitioned(dbfsFS, make_mock_diabetes_ds):
+ ds = pytest.importorskip("pyarrow.dataset")
+ pq = pytest.importorskip("pyarrow.parquet")
+
+ dbfsFS.rm("/FileStore/pyarrow", recursive=True)
+ assert "/FileStore/pyarrow" not in dbfsFS.ls("/FileStore/", detail=False)
+
+ pq.write_to_dataset(
+ make_mock_diabetes_ds,
+ filesystem=dbfsFS,
+ compression="none",
+ existing_data_behavior="error",
+ root_path="/FileStore/pyarrow/diabetes",
+ use_threads=False,
+ )
+
+ assert len(dbfsFS.ls("/FileStore/pyarrow/diabetes", detail=False)) == 1
+ assert (
+ "/FileStore/pyarrow/diabetes"
+ in dbfsFS.ls("/FileStore/pyarrow/diabetes", detail=False)[0]
+ and ".parquet" in dbfsFS.ls("/FileStore/pyarrow/diabetes", detail=False)[0]
+ )
+
+ arr_res = ds.dataset(
+ source="/FileStore/pyarrow/diabetes",
+ filesystem=dbfsFS,
+ ).to_table()
+
+ assert arr_res.num_rows == make_mock_diabetes_ds.num_rows
+ assert arr_res.num_columns == make_mock_diabetes_ds.num_columns
+ assert set(arr_res.schema).difference(set(make_mock_diabetes_ds.schema)) == set()
+
+ dbfsFS.rm("/FileStore/pyarrow", recursive=True)
+ assert "/FileStore/pyarrow" not in dbfsFS.ls("/FileStore/", detail=False)
diff --git a/fsspec/implementations/tests/test_dirfs.py b/fsspec/implementations/tests/test_dirfs.py
index 45da94d..c04ba66 100644
--- a/fsspec/implementations/tests/test_dirfs.py
+++ b/fsspec/implementations/tests/test_dirfs.py
@@ -1,7 +1,591 @@
import pytest
+
from fsspec.asyn import AsyncFileSystem
from fsspec.implementations.dirfs import DirFileSystem
from fsspec.spec import AbstractFileSystem
-PATH = 'path/to/dir'
-ARGS = ['foo', 'bar']
-KWARGS = {'baz': 'baz', 'qux': 'qux'}
+
+PATH = "path/to/dir"
+ARGS = ["foo", "bar"]
+KWARGS = {"baz": "baz", "qux": "qux"}
+
+
+@pytest.fixture
+def make_fs(mocker):
+ def _make_fs(async_impl=False, asynchronous=False):
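+        # build a MagicMock spec'd to the sync or async filesystem base class so
+        # the DirFileSystem tests below can assert pure delegation of each call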
+ attrs = {
+ "sep": "/",
+ "async_impl": async_impl,
+ "_strip_protocol": lambda path: path,
+ }
+
+ if async_impl:
+ attrs["asynchronous"] = asynchronous
+ cls = AsyncFileSystem
+ else:
+ cls = AbstractFileSystem
+
+ fs = mocker.MagicMock(spec=cls, **attrs)
+
+ return fs
+
+ return _make_fs
+
+
+@pytest.fixture(
+ params=[
+ pytest.param(False, id="sync"),
+ pytest.param(True, id="async"),
+ ]
+)
+def fs(make_fs, request):
+ return make_fs(async_impl=request.param)
+
+
+@pytest.fixture
+def asyncfs(make_fs):
+ return make_fs(async_impl=True, asynchronous=True)
+
+
+@pytest.fixture
+def make_dirfs():
+ def _make_dirfs(fs, asynchronous=False):
+ return DirFileSystem(PATH, fs, asynchronous=asynchronous)
+
+ return _make_dirfs
+
+
+@pytest.fixture
+def dirfs(make_dirfs, fs):
+ return make_dirfs(fs)
+
+
+@pytest.fixture
+def adirfs(make_dirfs, asyncfs):
+ return make_dirfs(asyncfs, asynchronous=True)
+
+
+def test_dirfs(fs, asyncfs):
+ DirFileSystem("path", fs)
+ DirFileSystem("path", asyncfs, asynchronous=True)
+
+ with pytest.raises(ValueError):
+ DirFileSystem("path", asyncfs)
+
+ with pytest.raises(ValueError):
+ DirFileSystem("path", fs, asynchronous=True)
+
+
+@pytest.mark.parametrize(
+ "root, rel, full",
+ [
+ ("", "", ""),
+ ("", "foo", "foo"),
+ ("root", "", "root"),
+ ("root", "foo", "root/foo"),
+ ],
+)
+def test_path(fs, root, rel, full):
+ dirfs = DirFileSystem(root, fs)
+ assert dirfs._join(rel) == full
+ assert dirfs._relpath(full) == rel
+
+
+def test_sep(mocker, dirfs):
+ sep = mocker.Mock()
+ dirfs.fs.sep = sep
+ assert dirfs.sep == sep
+
+
+@pytest.mark.asyncio
+async def test_set_session(mocker, adirfs):
+ adirfs.fs.set_session = mocker.AsyncMock()
+ assert (
+ await adirfs.set_session(*ARGS, **KWARGS) == adirfs.fs.set_session.return_value
+ )
+ adirfs.fs.set_session.assert_called_once_with(*ARGS, **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_rm_file(adirfs):
+ await adirfs._rm_file("file", **KWARGS)
+ adirfs.fs._rm_file.assert_called_once_with(f"{PATH}/file", **KWARGS)
+
+
+def test_rm_file(dirfs):
+ dirfs.rm_file("file", **KWARGS)
+ dirfs.fs.rm_file.assert_called_once_with("path/to/dir/file", **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_rm(adirfs):
+ await adirfs._rm("file", *ARGS, **KWARGS)
+ adirfs.fs._rm.assert_called_once_with("path/to/dir/file", *ARGS, **KWARGS)
+
+
+def test_rm(dirfs):
+ dirfs.rm("file", *ARGS, **KWARGS)
+ dirfs.fs.rm.assert_called_once_with("path/to/dir/file", *ARGS, **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_cp_file(adirfs):
+ await adirfs._cp_file("one", "two", **KWARGS)
+ adirfs.fs._cp_file.assert_called_once_with(f"{PATH}/one", f"{PATH}/two", **KWARGS)
+
+
+def test_cp_file(dirfs):
+ dirfs.cp_file("one", "two", **KWARGS)
+ dirfs.fs.cp_file.assert_called_once_with(f"{PATH}/one", f"{PATH}/two", **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_copy(adirfs):
+ await adirfs._copy("one", "two", *ARGS, **KWARGS)
+ adirfs.fs._copy.assert_called_once_with(
+ f"{PATH}/one", f"{PATH}/two", *ARGS, **KWARGS
+ )
+
+
+def test_copy(dirfs):
+ dirfs.copy("one", "two", *ARGS, **KWARGS)
+ dirfs.fs.copy.assert_called_once_with(f"{PATH}/one", f"{PATH}/two", *ARGS, **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_pipe(adirfs):
+ await adirfs._pipe("file", *ARGS, **KWARGS)
+ adirfs.fs._pipe.assert_called_once_with(f"{PATH}/file", *ARGS, **KWARGS)
+
+
+def test_pipe(dirfs):
+ dirfs.pipe("file", *ARGS, **KWARGS)
+ dirfs.fs.pipe.assert_called_once_with(f"{PATH}/file", *ARGS, **KWARGS)
+
+
+def test_pipe_dict(dirfs):
+ dirfs.pipe({"file": b"foo"}, *ARGS, **KWARGS)
+ dirfs.fs.pipe.assert_called_once_with({f"{PATH}/file": b"foo"}, *ARGS, **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_pipe_file(adirfs):
+ await adirfs._pipe_file("file", *ARGS, **KWARGS)
+ adirfs.fs._pipe_file.assert_called_once_with(f"{PATH}/file", *ARGS, **KWARGS)
+
+
+def test_pipe_file(dirfs):
+ dirfs.pipe_file("file", *ARGS, **KWARGS)
+ dirfs.fs.pipe_file.assert_called_once_with(f"{PATH}/file", *ARGS, **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_cat_file(adirfs):
+ assert (
+ await adirfs._cat_file("file", *ARGS, **KWARGS)
+ == adirfs.fs._cat_file.return_value
+ )
+ adirfs.fs._cat_file.assert_called_once_with(f"{PATH}/file", *ARGS, **KWARGS)
+
+
+def test_cat_file(dirfs):
+ assert dirfs.cat_file("file", *ARGS, **KWARGS) == dirfs.fs.cat_file.return_value
+ dirfs.fs.cat_file.assert_called_once_with(f"{PATH}/file", *ARGS, **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_cat(adirfs):
+ assert await adirfs._cat("file", *ARGS, **KWARGS) == adirfs.fs._cat.return_value
+ adirfs.fs._cat.assert_called_once_with(f"{PATH}/file", *ARGS, **KWARGS)
+
+
+def test_cat(dirfs):
+ assert dirfs.cat("file", *ARGS, **KWARGS) == dirfs.fs.cat.return_value
+ dirfs.fs.cat.assert_called_once_with(f"{PATH}/file", *ARGS, **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_cat_list(adirfs):
+ adirfs.fs._cat.return_value = {f"{PATH}/one": "foo", f"{PATH}/two": "bar"}
+ assert await adirfs._cat(["one", "two"], *ARGS, **KWARGS) == {
+ "one": "foo",
+ "two": "bar",
+ }
+ adirfs.fs._cat.assert_called_once_with(
+ [f"{PATH}/one", f"{PATH}/two"], *ARGS, **KWARGS
+ )
+
+
+def test_cat_list(dirfs):
+ dirfs.fs.cat.return_value = {f"{PATH}/one": "foo", f"{PATH}/two": "bar"}
+ assert dirfs.cat(["one", "two"], *ARGS, **KWARGS) == {"one": "foo", "two": "bar"}
+ dirfs.fs.cat.assert_called_once_with(
+ [f"{PATH}/one", f"{PATH}/two"], *ARGS, **KWARGS
+ )
+
+
+@pytest.mark.asyncio
+async def test_async_put_file(adirfs):
+ await adirfs._put_file("local", "file", **KWARGS)
+ adirfs.fs._put_file.assert_called_once_with("local", f"{PATH}/file", **KWARGS)
+
+
+def test_put_file(dirfs):
+ dirfs.put_file("local", "file", **KWARGS)
+ dirfs.fs.put_file.assert_called_once_with("local", f"{PATH}/file", **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_put(adirfs):
+ await adirfs._put("local", "file", **KWARGS)
+ adirfs.fs._put.assert_called_once_with("local", f"{PATH}/file", **KWARGS)
+
+
+def test_put(dirfs):
+ dirfs.put("local", "file", **KWARGS)
+ dirfs.fs.put.assert_called_once_with("local", f"{PATH}/file", **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_get_file(adirfs):
+ await adirfs._get_file("file", "local", **KWARGS)
+ adirfs.fs._get_file.assert_called_once_with(f"{PATH}/file", "local", **KWARGS)
+
+
+def test_get_file(dirfs):
+ dirfs.get_file("file", "local", **KWARGS)
+ dirfs.fs.get_file.assert_called_once_with(f"{PATH}/file", "local", **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_get(adirfs):
+ await adirfs._get("file", "local", **KWARGS)
+ adirfs.fs._get.assert_called_once_with(f"{PATH}/file", "local", **KWARGS)
+
+
+def test_get(dirfs):
+ dirfs.get("file", "local", **KWARGS)
+ dirfs.fs.get.assert_called_once_with(f"{PATH}/file", "local", **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_isfile(adirfs):
+ assert await adirfs._isfile("file") == adirfs.fs._isfile.return_value
+ adirfs.fs._isfile.assert_called_once_with(f"{PATH}/file")
+
+
+def test_isfile(dirfs):
+ assert dirfs.isfile("file") == dirfs.fs.isfile.return_value
+ dirfs.fs.isfile.assert_called_once_with(f"{PATH}/file")
+
+
+@pytest.mark.asyncio
+async def test_async_isdir(adirfs):
+ assert await adirfs._isdir("file") == adirfs.fs._isdir.return_value
+ adirfs.fs._isdir.assert_called_once_with(f"{PATH}/file")
+
+
+def test_isdir(dirfs):
+ assert dirfs.isdir("file") == dirfs.fs.isdir.return_value
+ dirfs.fs.isdir.assert_called_once_with(f"{PATH}/file")
+
+
+@pytest.mark.asyncio
+async def test_async_size(adirfs):
+ assert await adirfs._size("file") == adirfs.fs._size.return_value
+ adirfs.fs._size.assert_called_once_with(f"{PATH}/file")
+
+
+def test_size(dirfs):
+ assert dirfs.size("file") == dirfs.fs.size.return_value
+ dirfs.fs.size.assert_called_once_with(f"{PATH}/file")
+
+
+@pytest.mark.asyncio
+async def test_async_exists(adirfs):
+ assert await adirfs._exists("file") == adirfs.fs._exists.return_value
+ adirfs.fs._exists.assert_called_once_with(f"{PATH}/file")
+
+
+def test_exists(dirfs):
+ assert dirfs.exists("file") == dirfs.fs.exists.return_value
+ dirfs.fs.exists.assert_called_once_with(f"{PATH}/file")
+
+
+@pytest.mark.asyncio
+async def test_async_info(adirfs):
+ assert await adirfs._info("file", **KWARGS) == adirfs.fs._info.return_value
+ adirfs.fs._info.assert_called_once_with(f"{PATH}/file", **KWARGS)
+
+
+def test_info(dirfs):
+ assert dirfs.info("file", **KWARGS) == dirfs.fs.info.return_value
+ dirfs.fs.info.assert_called_once_with(f"{PATH}/file", **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_ls(adirfs):
+ adirfs.fs._ls.return_value = [f"{PATH}/file"]
+ assert await adirfs._ls("file", detail=False, **KWARGS) == ["file"]
+ adirfs.fs._ls.assert_called_once_with(f"{PATH}/file", detail=False, **KWARGS)
+
+
+def test_ls(dirfs):
+ dirfs.fs.ls.return_value = [f"{PATH}/file"]
+ assert dirfs.ls("file", detail=False, **KWARGS) == ["file"]
+ dirfs.fs.ls.assert_called_once_with(f"{PATH}/file", detail=False, **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_ls_detail(adirfs):
+ adirfs.fs._ls.return_value = [{"name": f"{PATH}/file", "foo": "bar"}]
+ assert await adirfs._ls("file", detail=True, **KWARGS) == [
+ {"name": "file", "foo": "bar"}
+ ]
+ adirfs.fs._ls.assert_called_once_with(f"{PATH}/file", detail=True, **KWARGS)
+
+
+def test_ls_detail(dirfs):
+ dirfs.fs.ls.return_value = [{"name": f"{PATH}/file", "foo": "bar"}]
+ assert dirfs.ls("file", detail=True, **KWARGS) == [{"name": "file", "foo": "bar"}]
+ dirfs.fs.ls.assert_called_once_with(f"{PATH}/file", detail=True, **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_walk(adirfs, mocker):
+ async def _walk(path, *args, **kwargs):
+ yield (f"{PATH}/root", ["foo", "bar"], ["baz", "qux"])
+
+ adirfs.fs._walk = mocker.MagicMock()
+ adirfs.fs._walk.side_effect = _walk
+
+ actual = [entry async for entry in adirfs._walk("root", *ARGS, **KWARGS)]
+ assert actual == [("root", ["foo", "bar"], ["baz", "qux"])]
+ adirfs.fs._walk.assert_called_once_with(f"{PATH}/root", *ARGS, **KWARGS)
+
+
+def test_walk(dirfs):
+ dirfs.fs.walk.return_value = iter(
+ [(f"{PATH}/root", ["foo", "bar"], ["baz", "qux"])]
+ )
+ assert list(dirfs.walk("root", *ARGS, **KWARGS)) == [
+ ("root", ["foo", "bar"], ["baz", "qux"])
+ ]
+ dirfs.fs.walk.assert_called_once_with(f"{PATH}/root", *ARGS, **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_glob(adirfs):
+ adirfs.fs._glob.return_value = [f"{PATH}/one", f"{PATH}/two"]
+ assert await adirfs._glob("*", **KWARGS) == ["one", "two"]
+ adirfs.fs._glob.assert_called_once_with(f"{PATH}/*", **KWARGS)
+
+
+def test_glob(dirfs):
+ dirfs.fs.glob.return_value = [f"{PATH}/one", f"{PATH}/two"]
+ assert dirfs.glob("*", **KWARGS) == ["one", "two"]
+ dirfs.fs.glob.assert_called_once_with(f"{PATH}/*", **KWARGS)
+
+
+def test_glob_with_protocol(dirfs):
+ dirfs.fs.glob.return_value = [f"{PATH}/one", f"{PATH}/two"]
+ assert dirfs.glob("dir://*", **KWARGS) == ["one", "two"]
+ dirfs.fs.glob.assert_called_once_with(f"{PATH}/*", **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_glob_detail(adirfs):
+ adirfs.fs._glob.return_value = {
+ f"{PATH}/one": {"foo": "bar"},
+ f"{PATH}/two": {"baz": "qux"},
+ }
+ assert await adirfs._glob("*", detail=True, **KWARGS) == {
+ "one": {"foo": "bar"},
+ "two": {"baz": "qux"},
+ }
+ adirfs.fs._glob.assert_called_once_with(f"{PATH}/*", detail=True, **KWARGS)
+
+
+def test_glob_detail(dirfs):
+ dirfs.fs.glob.return_value = {
+ f"{PATH}/one": {"foo": "bar"},
+ f"{PATH}/two": {"baz": "qux"},
+ }
+ assert dirfs.glob("*", detail=True, **KWARGS) == {
+ "one": {"foo": "bar"},
+ "two": {"baz": "qux"},
+ }
+ dirfs.fs.glob.assert_called_once_with(f"{PATH}/*", detail=True, **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_du(adirfs):
+ adirfs.fs._du.return_value = 1234
+ assert await adirfs._du("file", *ARGS, **KWARGS) == 1234
+ adirfs.fs._du.assert_called_once_with(f"{PATH}/file", *ARGS, **KWARGS)
+
+
+def test_du(dirfs):
+ dirfs.fs.du.return_value = 1234
+ assert dirfs.du("file", *ARGS, **KWARGS) == 1234
+ dirfs.fs.du.assert_called_once_with(f"{PATH}/file", *ARGS, **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_du_granular(adirfs):
+ adirfs.fs._du.return_value = {f"{PATH}/dir/one": 1, f"{PATH}/dir/two": 2}
+ assert await adirfs._du("dir", *ARGS, total=False, **KWARGS) == {
+ "dir/one": 1,
+ "dir/two": 2,
+ }
+ adirfs.fs._du.assert_called_once_with(f"{PATH}/dir", *ARGS, total=False, **KWARGS)
+
+
+def test_du_granular(dirfs):
+ dirfs.fs.du.return_value = {f"{PATH}/dir/one": 1, f"{PATH}/dir/two": 2}
+ assert dirfs.du("dir", *ARGS, total=False, **KWARGS) == {"dir/one": 1, "dir/two": 2}
+ dirfs.fs.du.assert_called_once_with(f"{PATH}/dir", *ARGS, total=False, **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_find(adirfs):
+ adirfs.fs._find.return_value = [f"{PATH}/dir/one", f"{PATH}/dir/two"]
+ assert await adirfs._find("dir", *ARGS, **KWARGS) == ["dir/one", "dir/two"]
+ adirfs.fs._find.assert_called_once_with(f"{PATH}/dir", *ARGS, **KWARGS)
+
+
+def test_find(dirfs):
+ dirfs.fs.find.return_value = [f"{PATH}/dir/one", f"{PATH}/dir/two"]
+ assert dirfs.find("dir", *ARGS, **KWARGS) == ["dir/one", "dir/two"]
+ dirfs.fs.find.assert_called_once_with(f"{PATH}/dir", *ARGS, **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_find_detail(adirfs):
+ adirfs.fs._find.return_value = {
+ f"{PATH}/dir/one": {"foo": "bar"},
+ f"{PATH}/dir/two": {"baz": "qux"},
+ }
+ assert await adirfs._find("dir", *ARGS, detail=True, **KWARGS) == {
+ "dir/one": {"foo": "bar"},
+ "dir/two": {"baz": "qux"},
+ }
+ adirfs.fs._find.assert_called_once_with(f"{PATH}/dir", *ARGS, detail=True, **KWARGS)
+
+
+def test_find_detail(dirfs):
+ dirfs.fs.find.return_value = {
+ f"{PATH}/dir/one": {"foo": "bar"},
+ f"{PATH}/dir/two": {"baz": "qux"},
+ }
+ assert dirfs.find("dir", *ARGS, detail=True, **KWARGS) == {
+ "dir/one": {"foo": "bar"},
+ "dir/two": {"baz": "qux"},
+ }
+ dirfs.fs.find.assert_called_once_with(f"{PATH}/dir", *ARGS, detail=True, **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_expand_path(adirfs):
+ adirfs.fs._expand_path.return_value = [f"{PATH}/file"]
+ assert await adirfs._expand_path("*", *ARGS, **KWARGS) == ["file"]
+ adirfs.fs._expand_path.assert_called_once_with(f"{PATH}/*", *ARGS, **KWARGS)
+
+
+def test_expand_path(dirfs):
+ dirfs.fs.expand_path.return_value = [f"{PATH}/file"]
+ assert dirfs.expand_path("*", *ARGS, **KWARGS) == ["file"]
+ dirfs.fs.expand_path.assert_called_once_with(f"{PATH}/*", *ARGS, **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_expand_path_list(adirfs):
+ adirfs.fs._expand_path.return_value = [f"{PATH}/1file", f"{PATH}/2file"]
+ assert await adirfs._expand_path(["1*", "2*"], *ARGS, **KWARGS) == [
+ "1file",
+ "2file",
+ ]
+ adirfs.fs._expand_path.assert_called_once_with(
+ [f"{PATH}/1*", f"{PATH}/2*"], *ARGS, **KWARGS
+ )
+
+
+def test_expand_path_list(dirfs):
+ dirfs.fs.expand_path.return_value = [f"{PATH}/1file", f"{PATH}/2file"]
+ assert dirfs.expand_path(["1*", "2*"], *ARGS, **KWARGS) == ["1file", "2file"]
+ dirfs.fs.expand_path.assert_called_once_with(
+ [f"{PATH}/1*", f"{PATH}/2*"], *ARGS, **KWARGS
+ )
+
+
+@pytest.mark.asyncio
+async def test_async_mkdir(adirfs):
+ await adirfs._mkdir("dir", *ARGS, **KWARGS)
+ adirfs.fs._mkdir.assert_called_once_with(f"{PATH}/dir", *ARGS, **KWARGS)
+
+
+def test_mkdir(dirfs):
+ dirfs.mkdir("dir", *ARGS, **KWARGS)
+ dirfs.fs.mkdir.assert_called_once_with(f"{PATH}/dir", *ARGS, **KWARGS)
+
+
+@pytest.mark.asyncio
+async def test_async_makedirs(adirfs):
+ await adirfs._makedirs("dir", *ARGS, **KWARGS)
+ adirfs.fs._makedirs.assert_called_once_with(f"{PATH}/dir", *ARGS, **KWARGS)
+
+
+def test_makedirs(dirfs):
+ dirfs.makedirs("dir", *ARGS, **KWARGS)
+ dirfs.fs.makedirs.assert_called_once_with(f"{PATH}/dir", *ARGS, **KWARGS)
+
+
+def test_rmdir(mocker, dirfs):
+ dirfs.fs.rmdir = mocker.Mock()
+ dirfs.rmdir("dir")
+ dirfs.fs.rmdir.assert_called_once_with(f"{PATH}/dir")
+
+
+def test_mv(mocker, dirfs):
+ dirfs.fs.mv = mocker.Mock()
+ dirfs.mv("one", "two", **KWARGS)
+ dirfs.fs.mv.assert_called_once_with(f"{PATH}/one", f"{PATH}/two", **KWARGS)
+
+
+def test_touch(mocker, dirfs):
+ dirfs.fs.touch = mocker.Mock()
+ dirfs.touch("file", **KWARGS)
+ dirfs.fs.touch.assert_called_once_with(f"{PATH}/file", **KWARGS)
+
+
+def test_created(mocker, dirfs):
+ dirfs.fs.created = mocker.Mock(return_value="date")
+ assert dirfs.created("file") == "date"
+ dirfs.fs.created.assert_called_once_with(f"{PATH}/file")
+
+
+def test_modified(mocker, dirfs):
+ dirfs.fs.modified = mocker.Mock(return_value="date")
+ assert dirfs.modified("file") == "date"
+ dirfs.fs.modified.assert_called_once_with(f"{PATH}/file")
+
+
+def test_sign(mocker, dirfs):
+ dirfs.fs.sign = mocker.Mock(return_value="url")
+ assert dirfs.sign("file", *ARGS, **KWARGS) == "url"
+ dirfs.fs.sign.assert_called_once_with(f"{PATH}/file", *ARGS, **KWARGS)
+
+
+def test_open(mocker, dirfs):
+ dirfs.fs.open = mocker.Mock()
+ assert dirfs.open("file", *ARGS, **KWARGS) == dirfs.fs.open.return_value
+ dirfs.fs.open.assert_called_once_with(f"{PATH}/file", *ARGS, **KWARGS)
+
+
+def test_from_url(m):
+ from fsspec.core import url_to_fs
+
+ m.pipe("inner/file", b"data")
+ fs, _ = url_to_fs("dir::memory://inner")
+ assert fs.ls("", False) == ["file"]
+ assert fs.ls("", True)[0]["name"] == "file"
+ assert fs.cat("file") == b"data"
diff --git a/fsspec/implementations/tests/test_ftp.py b/fsspec/implementations/tests/test_ftp.py
index 65bf2c1..d443d86 100644
--- a/fsspec/implementations/tests/test_ftp.py
+++ b/fsspec/implementations/tests/test_ftp.py
@@ -2,9 +2,177 @@ import os
import subprocess
import sys
import time
+
import pytest
+
import fsspec
from fsspec import open_files
from fsspec.implementations.ftp import FTPFileSystem
-ftplib = pytest.importorskip('ftplib')
+
+ftplib = pytest.importorskip("ftplib")
here = os.path.dirname(os.path.abspath(__file__))
+
+
+@pytest.fixture()
+def ftp():
+ pytest.importorskip("pyftpdlib")
+ P = subprocess.Popen(
+ [sys.executable, "-m", "pyftpdlib", "-d", here],
+ stderr=subprocess.STDOUT,
+ stdout=subprocess.PIPE,
+ )
+ try:
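+        # give the pyftpdlib server a moment to start listening before connecting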
+ time.sleep(1)
+ yield "localhost", 2121
+ finally:
+ P.terminate()
+ P.wait()
+
+
+def test_basic(ftp):
+ host, port = ftp
+ fs = FTPFileSystem(host, port)
+ assert fs.ls("/", detail=False) == sorted(os.listdir(here))
+ out = fs.cat(f"/{os.path.basename(__file__)}")
+ assert out == open(__file__, "rb").read()
+
+
+def test_not_cached(ftp):
+ host, port = ftp
+ fs = FTPFileSystem(host, port)
+ fs2 = FTPFileSystem(host, port)
+ assert fs is not fs2
+
+
+@pytest.mark.parametrize("cache_type", ["bytes", "mmap"])
+def test_complex(ftp_writable, cache_type):
+ from fsspec.core import BytesCache
+
+ host, port, user, pw = ftp_writable
+ files = open_files(
+ "ftp:///ou*",
+ host=host,
+ port=port,
+ username=user,
+ password=pw,
+ block_size=10000,
+ cache_type=cache_type,
+ )
+ assert len(files) == 1
+ with files[0] as fo:
+ assert fo.read(10) == b"hellohello"
+ if isinstance(fo.cache, BytesCache):
+ assert len(fo.cache.cache) == 10010
+ assert fo.read(2) == b"he"
+ assert fo.tell() == 12
+
+
+def test_write_small(ftp_writable):
+ host, port, user, pw = ftp_writable
+ fs = FTPFileSystem(host, port, user, pw)
+ with fs.open("/out2", "wb") as f:
+ f.write(b"oi")
+ assert fs.cat("/out2") == b"oi"
+
+
+def test_with_url(ftp_writable):
+ host, port, user, pw = ftp_writable
+ fo = fsspec.open(f"ftp://{user}:{pw}@{host}:{port}/out", "wb")
+ with fo as f:
+ f.write(b"hello")
+ fo = fsspec.open(f"ftp://{user}:{pw}@{host}:{port}/out", "rb")
+ with fo as f:
+ assert f.read() == b"hello"
+
+
+@pytest.mark.parametrize("cache_type", ["bytes", "mmap"])
+def test_write_big(ftp_writable, cache_type):
+ host, port, user, pw = ftp_writable
+ fs = FTPFileSystem(host, port, user, pw, block_size=1000, cache_type=cache_type)
+ fn = "/bigger"
+ with fs.open(fn, "wb") as f:
+ f.write(b"o" * 500)
+ assert not fs.exists(fn)
+ f.write(b"o" * 1000)
+ fs.invalidate_cache()
+ assert fs.exists(fn)
+ f.write(b"o" * 200)
+ f.flush()
+
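+    # the memory filesystem is per-process, so seed it on the worker where the
+    # dask-backed filesystem will run its operations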
+ assert fs.info(fn)["size"] == 1700
+ assert fs.cat(fn) == b"o" * 1700
+
+
+def test_transaction(ftp_writable):
+ host, port, user, pw = ftp_writable
+ fs = FTPFileSystem(host, port, user, pw)
+ fs.mkdir("/tmp")
+ fn = "/tr"
+ with fs.transaction:
+ with fs.open(fn, "wb") as f:
+ f.write(b"not")
+ assert not fs.exists(fn)
+ assert fs.exists(fn)
+ assert fs.cat(fn) == b"not"
+
+ fs.rm(fn)
+ assert not fs.exists(fn)
+
+
+def test_transaction_with_cache(ftp_writable, tmpdir):
+ host, port, user, pw = ftp_writable
+ fs = FTPFileSystem(host, port, user, pw)
+ fs.mkdir("/tmp")
+ fs.mkdir("/tmp/dir")
+ assert "dir" in fs.ls("/tmp", detail=False)
+
+ with fs.transaction:
+ fs.rmdir("/tmp/dir")
+
+ assert "dir" not in fs.ls("/tmp", detail=False)
+ assert not fs.exists("/tmp/dir")
+
+
+def test_cat_get(ftp_writable, tmpdir):
+ host, port, user, pw = ftp_writable
+ fs = FTPFileSystem(host, port, user, pw, block_size=500)
+ fs.mkdir("/tmp")
+ data = b"hello" * 500
+ fs.pipe("/tmp/myfile", data)
+ assert fs.cat_file("/tmp/myfile") == data
+
+ fn = os.path.join(tmpdir, "lfile")
+ fs.get_file("/tmp/myfile", fn)
+ assert open(fn, "rb").read() == data
+
+
+def test_mkdir(ftp_writable):
+ host, port, user, pw = ftp_writable
+ fs = FTPFileSystem(host, port, user, pw)
+ with pytest.raises(ftplib.error_perm):
+ fs.mkdir("/tmp/not/exist", create_parents=False)
+ fs.mkdir("/tmp/not/exist")
+ assert fs.exists("/tmp/not/exist")
+ fs.makedirs("/tmp/not/exist", exist_ok=True)
+ with pytest.raises(FileExistsError):
+ fs.makedirs("/tmp/not/exist", exist_ok=False)
+ fs.makedirs("/tmp/not/exist/inner/inner")
+ assert fs.isdir("/tmp/not/exist/inner/inner")
+
+
+def test_rm_get_recursive(ftp_writable, tmpdir):
+ tmpdir = str(tmpdir)
+ host, port, user, pw = ftp_writable
+ fs = FTPFileSystem(host, port, user, pw)
+ fs.mkdir("/tmp/topdir")
+ fs.mkdir("/tmp/topdir/underdir")
+ fs.touch("/tmp/topdir/afile")
+ fs.touch("/tmp/topdir/underdir/afile")
+
+ fs.get("/tmp/topdir", tmpdir, recursive=True)
+
+ with pytest.raises(ftplib.error_perm):
+ fs.rmdir("/tmp/topdir")
+
+ fs.rm("/tmp/topdir", recursive=True)
+ assert not fs.exists("/tmp/topdir")
diff --git a/fsspec/implementations/tests/test_git.py b/fsspec/implementations/tests/test_git.py
index f742628..ffa7b47 100644
--- a/fsspec/implementations/tests/test_git.py
+++ b/fsspec/implementations/tests/test_git.py
@@ -2,7 +2,75 @@ import os
import shutil
import subprocess
import tempfile
+
import pytest
+
import fsspec
from fsspec.implementations.local import make_path_posix
-pygit2 = pytest.importorskip('pygit2')
+
+pygit2 = pytest.importorskip("pygit2")
+
+
+@pytest.fixture()
+def repo():
+ orig_dir = os.getcwd()
+ d = tempfile.mkdtemp()
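+    # build a throwaway repository with a tagged commit, more commits on master,
+    # and a branch with a nested file, for the ref-resolution tests below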
+ try:
+ os.chdir(d)
+ subprocess.call("git init -b master", shell=True, cwd=d)
+ subprocess.call("git init -b master", shell=True, cwd=d)
+ subprocess.call('git config user.email "you@example.com"', shell=True, cwd=d)
+ subprocess.call('git config user.name "Your Name"', shell=True, cwd=d)
+ open(os.path.join(d, "file1"), "wb").write(b"data0")
+ subprocess.call("git add file1", shell=True, cwd=d)
+ subprocess.call('git commit -m "init"', shell=True, cwd=d)
+ sha = open(os.path.join(d, ".git/refs/heads/master"), "r").read().strip()
+ open(os.path.join(d, "file1"), "wb").write(b"data00")
+ subprocess.check_output('git commit -a -m "tagger"', shell=True, cwd=d)
+ subprocess.call('git tag -a thetag -m "make tag"', shell=True, cwd=d)
+ open(os.path.join(d, "file2"), "wb").write(b"data000")
+ subprocess.call("git add file2", shell=True)
+ subprocess.call('git commit -m "master tip"', shell=True, cwd=d)
+ subprocess.call("git checkout -b abranch", shell=True, cwd=d)
+ os.mkdir("inner")
+ open(os.path.join(d, "inner", "file1"), "wb").write(b"data3")
+ subprocess.call("git add inner/file1", shell=True, cwd=d)
+ subprocess.call('git commit -m "branch tip"', shell=True, cwd=d)
+ os.chdir(orig_dir)
+ yield d, sha
+ finally:
+ os.chdir(orig_dir)
+ shutil.rmtree(d)
+
+
+def test_refs(repo):
+ d, sha = repo
+ with fsspec.open("git://file1", path=d, ref=sha) as f:
+ assert f.read() == b"data0"
+
+ with fsspec.open("git://file1", path=d, ref="thetag") as f:
+ assert f.read() == b"data00"
+
+ with fsspec.open("git://file2", path=d, ref="master") as f:
+ assert f.read() == b"data000"
+
+ with fsspec.open("git://file2", path=d, ref=None) as f:
+ assert f.read() == b"data000"
+
+ with fsspec.open("git://inner/file1", path=d, ref="abranch") as f:
+ assert f.read() == b"data3"
+
+
+def test_url(repo):
+ d, sha = repo
+ fs, _, paths = fsspec.core.get_fs_token_paths(f"git://file1::file://{d}")
+ assert make_path_posix(d) in make_path_posix(fs.repo.path)
+ assert paths == ["file1"]
+ with fsspec.open(f"git://file1::file://{d}") as f:
+ assert f.read() == b"data00"
+
+ fs, _, paths = fsspec.core.get_fs_token_paths(f"git://{d}:master@file1")
+ assert make_path_posix(d) in make_path_posix(fs.repo.path)
+ assert paths == ["file1"]
+ with fsspec.open(f"git://{d}:master@file1") as f:
+ assert f.read() == b"data00"
diff --git a/fsspec/implementations/tests/test_http.py b/fsspec/implementations/tests/test_http.py
index 4ff3da6..81e438a 100644
--- a/fsspec/implementations/tests/test_http.py
+++ b/fsspec/implementations/tests/test_http.py
@@ -4,9 +4,572 @@ import json
import os
import sys
import time
+
import aiohttp
import pytest
+
import fsspec.asyn
import fsspec.utils
from fsspec.implementations.http import HTTPStreamFile
-from fsspec.tests.conftest import data, reset_files, server, win
+from fsspec.tests.conftest import data, reset_files, server, win # noqa: F401
+
+
+def test_list(server):
+ h = fsspec.filesystem("http")
+ out = h.glob(server + "/index/*")
+ assert out == [server + "/index/realfile"]
+
+
+def test_list_invalid_args(server):
+ with pytest.raises(TypeError):
+ h = fsspec.filesystem("http", use_foobar=True)
+ h.glob(server + "/index/*")
+
+
+def test_list_cache(server):
+ h = fsspec.filesystem("http", use_listings_cache=True)
+ out = h.glob(server + "/index/*")
+ assert out == [server + "/index/realfile"]
+
+
+def test_list_cache_with_expiry_time_cached(server):
+ h = fsspec.filesystem("http", use_listings_cache=True, listings_expiry_time=30)
+
+ # First, the directory cache is not initialized.
+ assert not h.dircache
+
+ # By querying the filesystem with "use_listings_cache=True",
+ # the cache will automatically get populated.
+ out = h.glob(server + "/index/*")
+ assert out == [server + "/index/realfile"]
+
+ # Verify cache content.
+ assert len(h.dircache) == 1
+
+ out = h.glob(server + "/index/*")
+ assert out == [server + "/index/realfile"]
+
+
+def test_list_cache_with_expiry_time_purged(server):
+ h = fsspec.filesystem("http", use_listings_cache=True, listings_expiry_time=0.3)
+
+ # First, the directory cache is not initialized.
+ assert not h.dircache
+
+ # By querying the filesystem with "use_listings_cache=True",
+ # the cache will automatically get populated.
+ out = h.glob(server + "/index/*")
+ assert out == [server + "/index/realfile"]
+ assert len(h.dircache) == 1
+
+ # Verify cache content.
+ assert server + "/index/" in h.dircache
+ assert len(h.dircache.get(server + "/index/")) == 1
+
+ # Wait beyond the TTL / cache expiry time.
+ time.sleep(0.31)
+
+    # Verify that the cache item has been purged.
+ cached_items = h.dircache.get(server + "/index/")
+ assert cached_items is None
+
+ # Verify that after clearing the item from the cache,
+ # it can get populated again.
+ out = h.glob(server + "/index/*")
+ assert out == [server + "/index/realfile"]
+ cached_items = h.dircache.get(server + "/index/")
+ assert len(cached_items) == 1
+
+
+def test_list_cache_reuse(server):
+ h = fsspec.filesystem("http", use_listings_cache=True, listings_expiry_time=5)
+
+ # First, the directory cache is not initialized.
+ assert not h.dircache
+
+ # By querying the filesystem with "use_listings_cache=True",
+ # the cache will automatically get populated.
+ out = h.glob(server + "/index/*")
+ assert out == [server + "/index/realfile"]
+
+ # Verify cache content.
+ assert len(h.dircache) == 1
+
+ # Verify another instance without caching enabled does not have cache content.
+ h = fsspec.filesystem("http", use_listings_cache=False)
+ assert not h.dircache
+
+ # Verify that yet another new instance, with caching enabled,
+ # will see the same cache content again.
+ h = fsspec.filesystem("http", use_listings_cache=True, listings_expiry_time=5)
+ assert len(h.dircache) == 1
+
+ # However, yet another instance with a different expiry time will also not have
+ # any valid cache content.
+ h = fsspec.filesystem("http", use_listings_cache=True, listings_expiry_time=666)
+ assert len(h.dircache) == 0
+
+
+def test_ls_raises_filenotfound(server):
+ h = fsspec.filesystem("http")
+
+ with pytest.raises(FileNotFoundError):
+ h.ls(server + "/not-a-key")
+
+
+def test_list_cache_with_max_paths(server):
+ h = fsspec.filesystem("http", use_listings_cache=True, max_paths=5)
+ out = h.glob(server + "/index/*")
+ assert out == [server + "/index/realfile"]
+
+
+def test_list_cache_with_skip_instance_cache(server):
+ h = fsspec.filesystem("http", use_listings_cache=True, skip_instance_cache=True)
+ out = h.glob(server + "/index/*")
+ assert out == [server + "/index/realfile"]
+
+
+def test_glob_return_subfolders(server):
+ h = fsspec.filesystem("http")
+ out = h.glob(server + "/simple/*")
+ assert set(out) == {
+ server + "/simple/dir/",
+ server + "/simple/file",
+ }
+
+
+def test_isdir(server):
+ h = fsspec.filesystem("http")
+ assert h.isdir(server + "/index/")
+ assert not h.isdir(server + "/index/realfile")
+ assert not h.isdir(server + "doesnotevenexist")
+
+
+def test_policy_arg(server):
+ h = fsspec.filesystem("http", size_policy="get")
+ out = h.glob(server + "/index/*")
+ assert out == [server + "/index/realfile"]
+
+
+def test_exists(server):
+ h = fsspec.filesystem("http")
+ assert not h.exists(server + "/notafile")
+ with pytest.raises(FileNotFoundError):
+ h.cat(server + "/notafile")
+
+
+def test_read(server):
+ h = fsspec.filesystem("http")
+ out = server + "/index/realfile"
+ with h.open(out, "rb") as f:
+ assert f.read() == data
+ with h.open(out, "rb", block_size=0) as f:
+ assert f.read() == data
+ with h.open(out, "rb") as f:
+ assert f.read(100) + f.read() == data
+
+
+def test_file_pickle(server):
+ import pickle
+
+ # via HTTPFile
+ h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"})
+ out = server + "/index/realfile"
+
+ with fsspec.open(out, headers={"give_length": "true", "head_ok": "true"}) as f:
+ pic = pickle.loads(pickle.dumps(f))
+ assert pic.read() == data
+
+ with h.open(out, "rb") as f:
+ pic = pickle.dumps(f)
+ assert f.read() == data
+ with pickle.loads(pic) as f:
+ assert f.read() == data
+
+ # via HTTPStreamFile
+ h = fsspec.filesystem("http")
+ out = server + "/index/realfile"
+ with h.open(out, "rb") as f:
+ out = pickle.dumps(f)
+ assert f.read() == data
+ with pickle.loads(out) as f:
+ assert f.read() == data
+
+
+def test_methods(server):
+ h = fsspec.filesystem("http")
+ url = server + "/index/realfile"
+ assert h.exists(url)
+ assert h.cat(url) == data
+
+
+@pytest.mark.parametrize(
+ "headers",
+ [
+ {},
+ {"give_length": "true"},
+ {"give_length": "true", "head_ok": "true"},
+ {"give_range": "true"},
+ {"give_length": "true", "head_not_auth": "true"},
+ {"give_range": "true", "head_not_auth": "true"},
+ {"use_206": "true", "head_ok": "true", "head_give_length": "true"},
+ {"use_206": "true", "give_length": "true"},
+ {"use_206": "true", "give_range": "true"},
+ ],
+)
+def test_random_access(server, headers):
+ h = fsspec.filesystem("http", headers=headers)
+ url = server + "/index/realfile"
+ with h.open(url, "rb") as f:
+ if headers:
+ assert f.size == len(data)
+ assert f.read(5) == data[:5]
+
+ if headers:
+ f.seek(5, 1)
+ assert f.read(5) == data[10:15]
+ else:
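+            # without any size/range headers the file is a streaming download,
+            # so a relative seek raises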
+ with pytest.raises(ValueError):
+ f.seek(5, 1)
+ assert f.closed
+
+
+@pytest.mark.parametrize(
+ "headers",
+ [
+ {"ignore_range": "true", "head_ok": "true", "head_give_length": "true"},
+ {"ignore_range": "true", "give_length": "true"},
+ {"ignore_range": "true", "give_range": "true"},
+ ],
+)
+def test_no_range_support(server, headers):
+ h = fsspec.filesystem("http", headers=headers)
+ url = server + "/index/realfile"
+ with h.open(url, "rb") as f:
+ # Random access is not possible if the server doesn't respect Range
+ f.seek(5)
+ with pytest.raises(ValueError):
+ f.read(10)
+
+ # Reading from the beginning should still work
+ f.seek(0)
+ assert f.read(10) == data[:10]
+
+
+def test_stream_seek(server):
+ h = fsspec.filesystem("http")
+ url = server + "/index/realfile"
+ with h.open(url, "rb") as f:
+ f.seek(0) # is OK
+ data1 = f.read(5)
+ assert len(data1) == 5
+ f.seek(5)
+ f.seek(0, 1)
+ data2 = f.read()
+ assert data1 + data2 == data
+
+
+def test_mapper_url(server):
+ h = fsspec.filesystem("http")
+ mapper = h.get_mapper(server + "/index/")
+ assert mapper.root.startswith("http:")
+ assert list(mapper)
+
+ mapper2 = fsspec.get_mapper(server + "/index/")
+ assert mapper2.root.startswith("http:")
+ assert list(mapper) == list(mapper2)
+
+
+def test_content_length_zero(server):
+ h = fsspec.filesystem(
+ "http", headers={"give_length": "true", "zero_length": "true"}
+ )
+ url = server + "/index/realfile"
+
+ with h.open(url, "rb") as f:
+ assert f.read() == data
+
+
+def test_content_encoding_gzip(server):
+ h = fsspec.filesystem(
+ "http", headers={"give_length": "true", "gzip_encoding": "true"}
+ )
+ url = server + "/index/realfile"
+
+ with h.open(url, "rb") as f:
+ assert isinstance(f, HTTPStreamFile)
+ assert f.size is None
+ assert f.read() == data
+
+
+def test_download(server, tmpdir):
+ h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true "})
+ url = server + "/index/realfile"
+ fn = os.path.join(tmpdir, "afile")
+ h.get(url, fn)
+ assert open(fn, "rb").read() == data
+
+
+def test_multi_download(server, tmpdir):
+ h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true "})
+ urla = server + "/index/realfile"
+ urlb = server + "/index/otherfile"
+ fna = os.path.join(tmpdir, "afile")
+ fnb = os.path.join(tmpdir, "bfile")
+ h.get([urla, urlb], [fna, fnb])
+ assert open(fna, "rb").read() == data
+ assert open(fnb, "rb").read() == data
+
+
+def test_ls(server):
+ h = fsspec.filesystem("http")
+ l = h.ls(server + "/data/20020401/", detail=False)
+ nc = server + "/data/20020401/GRACEDADM_CLSM0125US_7D.A20020401.030.nc4"
+ assert nc in l
+ assert len(l) == 11
+ assert all(u["type"] == "file" for u in h.ls(server + "/data/20020401/"))
+ assert h.glob(server + "/data/20020401/*.nc4") == [nc]
+
+
+def test_mcat(server):
+ h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true "})
+ urla = server + "/index/realfile"
+ urlb = server + "/index/otherfile"
+ out = h.cat([urla, urlb])
+ assert out == {urla: data, urlb: data}
+
+
+def test_cat_file_range(server):
+ h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true "})
+ urla = server + "/index/realfile"
+ assert h.cat(urla, start=1, end=10) == data[1:10]
+ assert h.cat(urla, start=1) == data[1:]
+
+ assert h.cat(urla, start=-10) == data[-10:]
+ assert h.cat(urla, start=-10, end=-2) == data[-10:-2]
+
+ assert h.cat(urla, end=-10) == data[:-10]
+
+
+def test_cat_file_range_numpy(server):
+ np = pytest.importorskip("numpy")
+ h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true "})
+ urla = server + "/index/realfile"
+ assert h.cat(urla, start=np.int8(1), end=np.int8(10)) == data[1:10]
+ out = h.cat_ranges([urla, urla], starts=np.array([1, 5]), ends=np.array([10, 15]))
+ assert out == [data[1:10], data[5:15]]
+
+
+def test_mcat_cache(server):
+ urla = server + "/index/realfile"
+ urlb = server + "/index/otherfile"
+ fs = fsspec.filesystem("simplecache", target_protocol="http")
+ assert fs.cat([urla, urlb]) == {urla: data, urlb: data}
+
+
+def test_mcat_expand(server):
+ h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true "})
+ out = h.cat(server + "/index/*")
+ assert out == {server + "/index/realfile": data}
+
+
+def test_info(server):
+ fs = fsspec.filesystem("http", headers={"give_etag": "true", "head_ok": "true"})
+ info = fs.info(server + "/index/realfile")
+ assert info["ETag"] == "xxx"
+
+ fs = fsspec.filesystem("http", headers={"give_mimetype": "true"})
+ info = fs.info(server + "/index/realfile")
+ assert info["mimetype"] == "text/html"
+
+ fs = fsspec.filesystem("http", headers={"redirect": "true"})
+ info = fs.info(server + "/redirectme")
+ assert info["url"] == server + "/index/realfile"
+
+
+@pytest.mark.parametrize("method", ["POST", "PUT"])
+def test_put_file(server, tmp_path, method, reset_files):
+ src_file = tmp_path / "file_1"
+ src_file.write_bytes(data)
+
+ dwl_file = tmp_path / "down_1"
+
+ fs = fsspec.filesystem("http", headers={"head_ok": "true", "give_length": "true"})
+ with pytest.raises(FileNotFoundError):
+ fs.info(server + "/hey")
+
+ fs.put_file(src_file, server + "/hey", method=method)
+ assert fs.info(server + "/hey")["size"] == len(data)
+
+ fs.get_file(server + "/hey", dwl_file)
+ assert dwl_file.read_bytes() == data
+
+ src_file.write_bytes(b"xxx")
+ with open(src_file, "rb") as stream:
+ fs.put_file(stream, server + "/hey_2", method=method)
+ assert fs.cat(server + "/hey_2") == b"xxx"
+
+ fs.put_file(io.BytesIO(b"yyy"), server + "/hey_3", method=method)
+ assert fs.cat(server + "/hey_3") == b"yyy"
+
+
+async def get_aiohttp():
+ from aiohttp import ClientSession
+
+ return ClientSession()
+
+
+async def get_proxy():
+ class ProxyClient:
+ pass
+
+ return ProxyClient()
+
+
+@pytest.mark.xfail(
+ condition=sys.flags.optimize > 1, reason="no docstrings when optimised"
+)
+def test_docstring():
+ h = fsspec.filesystem("http")
+ # most methods have empty docstrings and draw from base class, but this one
+ # is generated
+ assert h.pipe.__doc__
+
+
+def test_async_other_thread(server):
+ import threading
+
+ loop = asyncio.get_event_loop()
+ th = threading.Thread(target=loop.run_forever)
+
+ th.daemon = True
+ th.start()
+ fs = fsspec.filesystem("http", asynchronous=True, loop=loop)
+ asyncio.run_coroutine_threadsafe(fs.set_session(), loop=loop).result()
+ url = server + "/index/realfile"
+ cor = fs._cat([url])
+ fut = asyncio.run_coroutine_threadsafe(cor, loop=loop)
+ assert fut.result() == {url: data}
+ loop.call_soon_threadsafe(loop.stop)
+
+
+def test_async_this_thread(server):
+ async def _():
+ fs = fsspec.filesystem("http", asynchronous=True)
+
+ session = await fs.set_session() # creates client
+
+ url = server + "/index/realfile"
+ with pytest.raises((NotImplementedError, RuntimeError)):
+ fs.cat([url])
+ out = await fs._cat([url])
+ del fs
+ assert out == {url: data}
+ await session.close()
+
+ asyncio.run(_())
+
+
+def _inner_pass(fs, q, fn):
+ # pass the FS instance, but don't use it; in new process, the instance
+ # cache should be skipped to make a new instance
+ import traceback
+
+ try:
+ fs = fsspec.filesystem("http")
+ q.put(fs.cat(fn))
+ except Exception:
+ q.put(traceback.format_exc())
+
+
+@pytest.mark.parametrize("method", ["spawn", "forkserver"])
+def test_processes(server, method):
+ import multiprocessing as mp
+
+ if win and method != "spawn":
+ pytest.skip("Windows can only spawn")
+ ctx = mp.get_context(method)
+ fn = server + "/index/realfile"
+ fs = fsspec.filesystem("http")
+
+ q = ctx.Queue()
+ p = ctx.Process(target=_inner_pass, args=(fs, q, fn))
+ p.start()
+ out = q.get()
+ assert out == fs.cat(fn)
+ p.join()
+
+
+@pytest.mark.parametrize("get_client", [get_aiohttp, get_proxy])
+def test_close(get_client):
+ fs = fsspec.filesystem("http", skip_instance_cache=True)
+ fs.close_session(None, asyncio.run(get_client()))
+
+
+@pytest.mark.asyncio
+async def test_async_file(server):
+ fs = fsspec.filesystem("http", asynchronous=True, skip_instance_cache=True)
+ fn = server + "/index/realfile"
+ of = await fs.open_async(fn)
+ async with of as f:
+ out1 = await f.read(10)
+ assert data.startswith(out1)
+ out2 = await f.read()
+ assert data == out1 + out2
+ await fs._session.close()
+
+
+def test_encoded(server):
+ fs = fsspec.filesystem("http", encoded=True)
+ out = fs.cat(server + "/Hello%3A%20G%C3%BCnter", headers={"give_path": "true"})
+ assert json.loads(out)["path"] == "/Hello%3A%20G%C3%BCnter"
+ with pytest.raises(aiohttp.client_exceptions.ClientError):
+ fs.cat(server + "/Hello: Günter", headers={"give_path": "true"})
+
+ fs = fsspec.filesystem("http", encoded=False)
+ out = fs.cat(server + "/Hello: Günter", headers={"give_path": "true"})
+ assert json.loads(out)["path"] == "/Hello:%20G%C3%BCnter"
+
+
+def test_with_cache(server):
+ fs = fsspec.filesystem("http", headers={"head_ok": "true", "give_length": "true"})
+ fn = server + "/index/realfile"
+ fs1 = fsspec.filesystem("blockcache", fs=fs)
+ with fs1.open(fn, "rb") as f:
+ out = f.read()
+ assert out == fs1.cat(fn)
+
+
+@pytest.mark.asyncio
+async def test_async_expand_path(server):
+ fs = fsspec.filesystem("http", asynchronous=True, skip_instance_cache=True)
+
+ # maxdepth=1
+ assert await fs._expand_path(server + "/index", recursive=True, maxdepth=1) == [
+ server + "/index",
+ server + "/index/realfile",
+ ]
+
+ # maxdepth=0
+ with pytest.raises(ValueError):
+ await fs._expand_path(server + "/index", maxdepth=0)
+ with pytest.raises(ValueError):
+ await fs._expand_path(server + "/index", recursive=True, maxdepth=0)
+
+ await fs._session.close()
+
+
+@pytest.mark.asyncio
+async def test_async_walk(server):
+ fs = fsspec.filesystem("http", asynchronous=True, skip_instance_cache=True)
+
+ # No maxdepth
+ res = [a async for a in fs._walk(server + "/index")]
+ assert res == [(server + "/index", [], ["realfile"])]
+
+ # maxdepth=0
+ with pytest.raises(ValueError):
+ async for a in fs._walk(server + "/index", maxdepth=0):
+ pass
+
+ await fs._session.close()
diff --git a/fsspec/implementations/tests/test_jupyter.py b/fsspec/implementations/tests/test_jupyter.py
index 9e56841..9b2eaa9 100644
--- a/fsspec/implementations/tests/test_jupyter.py
+++ b/fsspec/implementations/tests/test_jupyter.py
@@ -2,7 +2,56 @@ import os
import shlex
import subprocess
import time
+
import pytest
+
import fsspec
-pytest.importorskip('notebook')
-requests = pytest.importorskip('requests')
+
+pytest.importorskip("notebook")
+requests = pytest.importorskip("requests")
+
+
+@pytest.fixture()
+def jupyter(tmpdir):
+ tmpdir = str(tmpdir)
+ os.environ["JUPYTER_TOKEN"] = "blah"
+ try:
+ cmd = f'jupyter notebook --notebook-dir="{tmpdir}" --no-browser --port=5566'
+ P = subprocess.Popen(shlex.split(cmd))
+ except FileNotFoundError:
+ pytest.skip("notebook not installed correctly")
+ try:
+ timeout = 15
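+        # poll until the notebook server responds, giving up after roughly 15 seconds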
+ while True:
+ try:
+ r = requests.get("http://localhost:5566/?token=blah")
+ r.raise_for_status()
+ break
+ except (requests.exceptions.BaseHTTPError, OSError):
+ time.sleep(0.1)
+ timeout -= 0.1
+ if timeout < 0:
+ pytest.xfail("Timed out for jupyter")
+ yield "http://localhost:5566/?token=blah", tmpdir
+ finally:
+ P.terminate()
+
+
+def test_simple(jupyter):
+ url, d = jupyter
+ fs = fsspec.filesystem("jupyter", url=url)
+ assert fs.ls("") == []
+
+ fs.pipe("afile", b"data")
+ assert fs.cat("afile") == b"data"
+ assert "afile" in os.listdir(d)
+
+ with fs.open("bfile", "wb") as f:
+ f.write(b"more")
+ with fs.open("bfile", "rb") as f:
+ assert f.read() == b"more"
+
+ assert fs.info("bfile")["size"] == 4
+ fs.rm("afile")
+
+ assert "afile" not in os.listdir(d)
diff --git a/fsspec/implementations/tests/test_libarchive.py b/fsspec/implementations/tests/test_libarchive.py
index 5aabc42..a5bef34 100644
--- a/fsspec/implementations/tests/test_libarchive.py
+++ b/fsspec/implementations/tests/test_libarchive.py
@@ -1,2 +1,33 @@
+# this test case checks that the libarchive can be used from a seekable source (any fs
+# with a block cache active)
import fsspec
from fsspec.implementations.tests.test_archive import archive_data, temparchive
+
+
+def test_cache(ftp_writable):
+ host, port, username, password = "localhost", 2121, "user", "pass"
+
+ with temparchive(archive_data) as archive_file:
+ with fsspec.open(
+ "ftp:///archive.7z",
+ "wb",
+ host=host,
+ port=port,
+ username=username,
+ password=password,
+ ) as f:
+ f.write(open(archive_file, "rb").read())
+ of = fsspec.open(
+ "libarchive://deeply/nested/path::ftp:///archive.7z",
+ ftp={
+ "host": host,
+ "port": port,
+ "username": username,
+ "password": password,
+ },
+ )
+
+ with of as f:
+ readdata = f.read()
+
+ assert readdata == archive_data["deeply/nested/path"]
diff --git a/fsspec/implementations/tests/test_local.py b/fsspec/implementations/tests/test_local.py
index ccabb9c..ef39279 100644
--- a/fsspec/implementations/tests/test_local.py
+++ b/fsspec/implementations/tests/test_local.py
@@ -9,42 +9,1277 @@ import tempfile
from contextlib import contextmanager
from pathlib import Path
from unittest.mock import patch
+
import pytest
+
import fsspec
from fsspec import compression
from fsspec.core import OpenFile, get_fs_token_paths, open_files
from fsspec.implementations.local import LocalFileSystem, make_path_posix
from fsspec.tests.test_utils import WIN
-files = {'.test.accounts.1.json':
- b'{"amount": 100, "name": "Alice"}\n{"amount": 200, "name": "Bob"}\n{"amount": 300, "name": "Charlie"}\n{"amount": 400, "name": "Dennis"}\n'
- , '.test.accounts.2.json':
- b'{"amount": 500, "name": "Alice"}\n{"amount": 600, "name": "Bob"}\n{"amount": 700, "name": "Charlie"}\n{"amount": 800, "name": "Dennis"}\n'
- }
-csv_files = {'.test.fakedata.1.csv': b'a,b\n1,2\n', '.test.fakedata.2.csv':
- b'a,b\n3,4\n'}
+
+files = {
+ ".test.accounts.1.json": (
+ b'{"amount": 100, "name": "Alice"}\n'
+ b'{"amount": 200, "name": "Bob"}\n'
+ b'{"amount": 300, "name": "Charlie"}\n'
+ b'{"amount": 400, "name": "Dennis"}\n'
+ ),
+ ".test.accounts.2.json": (
+ b'{"amount": 500, "name": "Alice"}\n'
+ b'{"amount": 600, "name": "Bob"}\n'
+ b'{"amount": 700, "name": "Charlie"}\n'
+ b'{"amount": 800, "name": "Dennis"}\n'
+ ),
+}
+
+csv_files = {
+ ".test.fakedata.1.csv": (b"a,b\n1,2\n"),
+ ".test.fakedata.2.csv": (b"a,b\n3,4\n"),
+}
odir = os.getcwd()
+@pytest.fixture()
+def cwd():
+ pth = os.getcwd().replace("\\", "/")
+ assert not pth.endswith("/")
+ yield pth
+
+
+@pytest.fixture()
+def current_drive(cwd):
+ drive = os.path.splitdrive(cwd)[0]
+ assert not drive or (len(drive) == 2 and drive.endswith(":"))
+ yield drive
+
+
+@pytest.fixture()
+def user_home():
+ pth = os.path.expanduser("~").replace("\\", "/")
+ assert not pth.endswith("/")
+ yield pth
+
+
+def winonly(*args):
+ return pytest.param(*args, marks=pytest.mark.skipif(not WIN, reason="Windows only"))
+
+
+def posixonly(*args):
+ return pytest.param(*args, marks=pytest.mark.skipif(WIN, reason="Posix only"))
+
+
@contextmanager
-def filetexts(d, open=open, mode='t'):
+def filetexts(d, open=open, mode="t"):
"""Dumps a number of textfiles to disk
d - dict
- a mapping from filename to text like {'a.csv': '1,1
-2,2'}
+ a mapping from filename to text like {'a.csv': '1,1\n2,2'}
Since this is meant for use in tests, this context manager will
automatically switch to a temporary current directory, to avoid
race conditions when running tests in parallel.
"""
- pass
+ dirname = tempfile.mkdtemp()
+ try:
+ os.chdir(dirname)
+ for filename, text in d.items():
+ if dirname := os.path.dirname(filename):
+ os.makedirs(dirname, exist_ok=True)
+ f = open(filename, f"w{mode}")
+ try:
+ f.write(text)
+ finally:
+ try:
+ f.close()
+ except AttributeError:
+ pass
+
+ yield list(d)
+
+ for filename in d:
+ if os.path.exists(filename):
+ try:
+ os.remove(filename)
+ except OSError:
+ pass
+ finally:
+ os.chdir(odir)
+
+
+def test_urlpath_inference_strips_protocol(tmpdir):
+ tmpdir = make_path_posix(str(tmpdir))
+ paths = ["/".join([tmpdir, f"test.{i:02d}.csv"]) for i in range(20)]
+
+ for path in paths:
+ with open(path, "wb") as f:
+ f.write(b"1,2,3\n" * 10)
+
+ # globstring
+ protocol = "file:///" if sys.platform == "win32" else "file://"
+ urlpath = protocol + os.path.join(tmpdir, "test.*.csv")
+ _, _, paths2 = get_fs_token_paths(urlpath)
+ assert paths2 == paths
+
+ # list of paths
+ _, _, paths2 = get_fs_token_paths([protocol + p for p in paths])
+ assert paths2 == paths
+
+
+def test_urlpath_inference_errors():
+ # Empty list
+ with pytest.raises(ValueError) as err:
+ get_fs_token_paths([])
+ assert "empty" in str(err.value)
+
+ pytest.importorskip("s3fs")
+ # Protocols differ
+ with pytest.raises(ValueError) as err:
+ get_fs_token_paths(["s3://test/path.csv", "/other/path.csv"])
+ assert "Protocol" in str(err.value)
def test_urlpath_expand_read():
"""Make sure * is expanded in file paths when reading."""
- pass
+ # when reading, globs should be expanded to read files by mask
+ with filetexts(csv_files, mode="b"):
+ _, _, paths = get_fs_token_paths("./.*.csv")
+ assert len(paths) == 2
+ _, _, paths = get_fs_token_paths(["./.*.csv"])
+ assert len(paths) == 2
+
+
+def test_cats():
+ with filetexts(csv_files, mode="b"):
+ fs = fsspec.filesystem("file")
+ assert fs.cat(".test.fakedata.1.csv") == b"a,b\n1,2\n"
+ out = set(fs.cat([".test.fakedata.1.csv", ".test.fakedata.2.csv"]).values())
+ assert out == {b"a,b\n1,2\n", b"a,b\n3,4\n"}
+ assert fs.cat(".test.fakedata.1.csv", None, None) == b"a,b\n1,2\n"
+ assert fs.cat(".test.fakedata.1.csv", start=1, end=6) == b"a,b\n1,2\n"[1:6]
+ assert fs.cat(".test.fakedata.1.csv", start=-1) == b"a,b\n1,2\n"[-1:]
+ assert fs.cat(".test.fakedata.1.csv", start=1, end=-2) == b"a,b\n1,2\n"[1:-2]
+ out = set(
+ fs.cat(
+ [".test.fakedata.1.csv", ".test.fakedata.2.csv"], start=1, end=-1
+ ).values()
+ )
+ assert out == {b"a,b\n1,2\n"[1:-1], b"a,b\n3,4\n"[1:-1]}
def test_urlpath_expand_write():
"""Make sure * is expanded in file paths when writing."""
- pass
+ _, _, paths = get_fs_token_paths("prefix-*.csv", mode="wb", num=2)
+ assert all(
+ p.endswith(pa) for p, pa in zip(paths, ["/prefix-0.csv", "/prefix-1.csv"])
+ )
+ _, _, paths = get_fs_token_paths(["prefix-*.csv"], mode="wb", num=2)
+ assert all(
+ p.endswith(pa) for p, pa in zip(paths, ["/prefix-0.csv", "/prefix-1.csv"])
+ )
+ # we can read with multiple masks, but not write
+ with pytest.raises(ValueError):
+ _, _, paths = get_fs_token_paths(
+ ["prefix1-*.csv", "prefix2-*.csv"], mode="wb", num=2
+ )
+
+
+def test_open_files():
+ with filetexts(files, mode="b"):
+ myfiles = open_files("./.test.accounts.*")
+ assert len(myfiles) == len(files)
+ for lazy_file, data_file in zip(myfiles, sorted(files)):
+ with lazy_file as f:
+ x = f.read()
+ assert x == files[data_file]
+
+
+@pytest.mark.parametrize("encoding", ["utf-8", "ascii"])
+def test_open_files_text_mode(encoding):
+ with filetexts(files, mode="b"):
+ myfiles = open_files("./.test.accounts.*", mode="rt", encoding=encoding)
+ assert len(myfiles) == len(files)
+ data = []
+ for file in myfiles:
+ with file as f:
+ data.append(f.read())
+ assert list(data) == [files[k].decode(encoding) for k in sorted(files)]
+
+
+@pytest.mark.parametrize("mode", ["rt", "rb"])
+@pytest.mark.parametrize("fmt", list(compression.compr))
+def test_compressions(fmt, mode, tmpdir):
+ tmpdir = str(tmpdir)
+ fn = os.path.join(tmpdir, ".tmp.getsize")
+ fs = LocalFileSystem()
+ f = OpenFile(fs, fn, compression=fmt, mode="wb")
+ data = b"Long line of readily compressible text"
+ with f as fo:
+ fo.write(data)
+ if fmt is None:
+ assert fs.size(fn) == len(data)
+ else:
+ assert fs.size(fn) != len(data)
+
+ f = OpenFile(fs, fn, compression=fmt, mode=mode)
+ with f as fo:
+ if mode == "rb":
+ assert fo.read() == data
+ else:
+ assert fo.read() == data.decode()
+
+
+def test_bad_compression():
+ with filetexts(files, mode="b"):
+ for func in [open_files]:
+ with pytest.raises(ValueError):
+ func("./.test.accounts.*", compression="not-found")
+
+
+def test_not_found():
+ fn = "not-a-file"
+ fs = LocalFileSystem()
+ with pytest.raises((FileNotFoundError, OSError)):
+ with OpenFile(fs, fn, mode="rb"):
+ pass
+
+
+def test_isfile():
+ fs = LocalFileSystem()
+ with filetexts(files, mode="b"):
+ for f in files.keys():
+ assert fs.isfile(f)
+ assert fs.isfile(f"file://{f}")
+ assert not fs.isfile("not-a-file")
+ assert not fs.isfile("file://not-a-file")
+
+
+def test_isdir():
+ fs = LocalFileSystem()
+ with filetexts(files, mode="b"):
+ for f in files.keys():
+ assert fs.isdir(os.path.dirname(os.path.abspath(f)))
+ assert not fs.isdir(f)
+ assert not fs.isdir("not-a-dir")
+
+
+@pytest.mark.parametrize("compression_opener", [(None, open), ("gzip", gzip.open)])
+def test_open_files_write(tmpdir, compression_opener):
+ tmpdir = str(tmpdir)
+ compression, opener = compression_opener
+ fn = str(tmpdir) + "/*.part"
+ files = open_files(fn, num=2, mode="wb", compression=compression)
+ assert len(files) == 2
+ assert {f.mode for f in files} == {"wb"}
+ for fil in files:
+ with fil as f:
+ f.write(b"000")
+ files = sorted(os.listdir(tmpdir))
+ assert files == ["0.part", "1.part"]
+
+ with opener(os.path.join(tmpdir, files[0]), "rb") as f:
+ d = f.read()
+ assert d == b"000"
+
+
+def test_pickability_of_lazy_files(tmpdir):
+ tmpdir = str(tmpdir)
+ cloudpickle = pytest.importorskip("cloudpickle")
+
+ with filetexts(files, mode="b"):
+ myfiles = open_files("./.test.accounts.*")
+ myfiles2 = cloudpickle.loads(cloudpickle.dumps(myfiles))
+
+ for f, f2 in zip(myfiles, myfiles2):
+ assert f.path == f2.path
+ assert isinstance(f.fs, type(f2.fs))
+ with f as f_open, f2 as f2_open:
+ assert f_open.read() == f2_open.read()
+
+
+def test_abs_paths(tmpdir):
+ tmpdir = str(tmpdir)
+ here = os.getcwd()
+ os.chdir(tmpdir)
+ with open("tmp", "w") as f:
+ f.write("hi")
+ out = LocalFileSystem().glob("./*")
+ assert len(out) == 1
+ assert "/" in out[0]
+ assert "tmp" in out[0]
+
+ # I don't know what this was testing - but should avoid local paths anyway
+ # fs = LocalFileSystem()
+ os.chdir(here)
+ # with fs.open('tmp', 'r') as f:
+ # res = f.read()
+ # assert res == 'hi'
+
+
+@pytest.mark.parametrize("sep", ["/", "\\"])
+@pytest.mark.parametrize("chars", ["+", "++", "(", ")", "|", "\\"])
+def test_glob_weird_characters(tmpdir, sep, chars):
+ tmpdir = make_path_posix(str(tmpdir))
+
+ subdir = f"{tmpdir}{sep}test{chars}x"
+ try:
+ os.makedirs(subdir, exist_ok=True)
+ except OSError as e:
+ if WIN and "label syntax" in str(e):
+ pytest.xfail("Illegal windows directory name")
+ else:
+ raise
+ with open(subdir + sep + "tmp", "w") as f:
+ f.write("hi")
+
+ out = LocalFileSystem().glob(subdir + sep + "*")
+ assert len(out) == 1
+ assert "/" in out[0]
+ assert "tmp" in out[0]
+
+
+def test_globfind_dirs(tmpdir):
+ tmpdir = make_path_posix(str(tmpdir))
+ fs = fsspec.filesystem("file")
+ fs.mkdir(tmpdir + "/dir")
+ fs.touch(tmpdir + "/dir/afile")
+ assert [tmpdir + "/dir"] == fs.glob(tmpdir + "/*")
+ assert fs.glob(tmpdir + "/*", detail=True)[tmpdir + "/dir"]["type"] == "directory"
+ assert (
+ fs.glob(tmpdir + "/dir/*", detail=True)[tmpdir + "/dir/afile"]["type"] == "file"
+ )
+ assert [tmpdir + "/dir/afile"] == fs.find(tmpdir)
+ assert [tmpdir, tmpdir + "/dir", tmpdir + "/dir/afile"] == fs.find(
+ tmpdir, withdirs=True
+ )
+
+
+def test_touch(tmpdir):
+ import time
+
+ fn = str(tmpdir + "/in/file")
+ fs = fsspec.filesystem("file", auto_mkdir=False)
+ with pytest.raises(OSError):
+ fs.touch(fn)
+ fs = fsspec.filesystem("file", auto_mkdir=True)
+ fs.touch(fn)
+ info = fs.info(fn)
+ time.sleep(0.2)
+ fs.touch(fn)
+ info2 = fs.info(fn)
+ if not WIN:
+ assert info2["mtime"] > info["mtime"]
+
+
+def test_touch_truncate(tmpdir):
+ fn = str(tmpdir + "/tfile")
+ fs = fsspec.filesystem("file")
+ fs.touch(fn, truncate=True)
+ fs.pipe(fn, b"a")
+ fs.touch(fn, truncate=True)
+ assert fs.cat(fn) == b""
+ fs.pipe(fn, b"a")
+ fs.touch(fn, truncate=False)
+ assert fs.cat(fn) == b"a"
+
+
+def test_directories(tmpdir):
+ tmpdir = make_path_posix(str(tmpdir))
+ fs = LocalFileSystem()
+ fs.mkdir(tmpdir + "/dir")
+ assert tmpdir + "/dir" in fs.ls(tmpdir)
+ assert fs.ls(tmpdir, True)[0]["type"] == "directory"
+ fs.rmdir(tmpdir + "/dir")
+ assert not fs.ls(tmpdir)
+ assert fs.ls(fs.root_marker)
+
+
+def test_ls_on_file(tmpdir):
+ tmpdir = make_path_posix(str(tmpdir))
+ fs = LocalFileSystem()
+ resource = tmpdir + "/a.json"
+ fs.touch(resource)
+ assert fs.exists(resource)
+ assert fs.ls(tmpdir) == fs.ls(resource)
+ assert fs.ls(resource, detail=True)[0] == fs.info(resource)
+
+
+@pytest.mark.parametrize("file_protocol", ["", "file://"])
+def test_file_ops(tmpdir, file_protocol):
+ tmpdir = make_path_posix(str(tmpdir))
+ tmpdir_with_protocol = file_protocol + tmpdir
+ fs = LocalFileSystem(auto_mkdir=True)
+ with pytest.raises(FileNotFoundError):
+ fs.info(tmpdir_with_protocol + "/nofile")
+ fs.touch(tmpdir_with_protocol + "/afile")
+ i1 = fs.ukey(tmpdir_with_protocol + "/afile")
+
+ assert tmpdir + "/afile" in fs.ls(tmpdir_with_protocol)
+
+ with fs.open(tmpdir_with_protocol + "/afile", "wb") as f:
+ f.write(b"data")
+ i2 = fs.ukey(tmpdir_with_protocol + "/afile")
+ assert i1 != i2 # because file changed
+
+ fs.copy(tmpdir_with_protocol + "/afile", tmpdir_with_protocol + "/afile2")
+ assert tmpdir + "/afile2" in fs.ls(tmpdir_with_protocol)
+
+ fs.move(tmpdir_with_protocol + "/afile", tmpdir_with_protocol + "/afile3")
+ assert not fs.exists(tmpdir_with_protocol + "/afile")
+
+ fs.cp(
+ tmpdir_with_protocol + "/afile3", tmpdir_with_protocol + "/deeply/nested/file"
+ )
+ assert fs.exists(tmpdir_with_protocol + "/deeply/nested/file")
+
+ fs.rm(tmpdir_with_protocol + "/afile3", recursive=True)
+ assert not fs.exists(tmpdir_with_protocol + "/afile3")
+
+ files = [tmpdir_with_protocol + "/afile4", tmpdir_with_protocol + "/afile5"]
+ [fs.touch(f) for f in files]
+
+ with pytest.raises(AttributeError):
+ fs.rm_file(files)
+ fs.rm(files)
+ assert all(not fs.exists(f) for f in files)
+
+ fs.touch(tmpdir_with_protocol + "/afile6")
+ fs.rm_file(tmpdir_with_protocol + "/afile6")
+ assert not fs.exists(tmpdir_with_protocol + "/afile6")
+
+ # IsADirectoryError raised on Linux, PermissionError on Windows
+ with pytest.raises((IsADirectoryError, PermissionError)):
+ fs.rm_file(tmpdir_with_protocol)
+
+ fs.rm(tmpdir_with_protocol, recursive=True)
+ assert not fs.exists(tmpdir_with_protocol)
+
+
+def test_recursive_get_put(tmpdir):
+ tmpdir = make_path_posix(str(tmpdir))
+ fs = LocalFileSystem(auto_mkdir=True)
+
+ fs.mkdir(tmpdir + "/a1/a2/a3")
+ fs.touch(tmpdir + "/a1/a2/a3/afile")
+ fs.touch(tmpdir + "/a1/afile")
+
+ fs.get(f"file://{tmpdir}/a1", tmpdir + "/b1", recursive=True)
+ assert fs.isfile(tmpdir + "/b1/afile")
+ assert fs.isfile(tmpdir + "/b1/a2/a3/afile")
+
+ fs.put(tmpdir + "/b1", f"file://{tmpdir}/c1", recursive=True)
+ assert fs.isfile(tmpdir + "/c1/afile")
+ assert fs.isfile(tmpdir + "/c1/a2/a3/afile")
+
+
+def test_commit_discard(tmpdir):
+ tmpdir = str(tmpdir)
+ fs = LocalFileSystem()
+ with fs.transaction:
+ with fs.open(tmpdir + "/afile", "wb") as f:
+ assert not fs.exists(tmpdir + "/afile")
+ f.write(b"data")
+ assert not fs.exists(tmpdir + "/afile")
+
+ assert fs._transaction is None
+ assert fs.cat(tmpdir + "/afile") == b"data"
+
+ try:
+ with fs.transaction:
+ with fs.open(tmpdir + "/bfile", "wb") as f:
+ f.write(b"data")
+ raise KeyboardInterrupt
+ except KeyboardInterrupt:
+ assert not fs.exists(tmpdir + "/bfile")
+
+
+def test_make_path_posix():
+ cwd = os.getcwd()
+ if WIN:
+ drive = cwd[0]
+ assert make_path_posix("/a/posix/path") == f"{drive}:/a/posix/path"
+ assert make_path_posix("/posix") == f"{drive}:/posix"
+ # Windows drive requires trailing slash
+ assert make_path_posix("C:\\") == "C:/"
+ else:
+ assert make_path_posix("/a/posix/path") == "/a/posix/path"
+ assert make_path_posix("/posix") == "/posix"
+ assert make_path_posix("relpath") == posixpath.join(make_path_posix(cwd), "relpath")
+ assert make_path_posix("rel/path") == posixpath.join(
+ make_path_posix(cwd), "rel/path"
+ )
+ # NT style
+ if WIN:
+ assert make_path_posix("C:\\path") == "C:/path"
+ assert (
+ make_path_posix(
+ "\\\\windows-server\\someshare\\path\\more\\path\\dir\\foo.parquet",
+ )
+ == "//windows-server/someshare/path/more/path/dir/foo.parquet"
+ )
+ assert (
+ make_path_posix(
+ "\\\\SERVER\\UserHomeFolder$\\me\\My Documents\\proj\\data\\fname.csv",
+ )
+ == "//SERVER/UserHomeFolder$/me/My Documents/proj/data/fname.csv"
+ )
+ assert "/" in make_path_posix("rel\\path")
+ # Relative
+ pp = make_path_posix("./path")
+ cd = make_path_posix(cwd)
+ assert pp == cd + "/path"
+ # Userpath
+ userpath = make_path_posix("~/path")
+ assert userpath.endswith("/path")
+
+
+@pytest.mark.parametrize(
+ "path",
+ [
+ "/abc/def",
+ "abc/def",
+ "",
+ ".",
+ "//server/share/",
+ "\\\\server\\share\\",
+ "C:\\",
+ "d:/abc/def",
+ "e:",
+ pytest.param(
+ "\\\\server\\share",
+ marks=[
+ pytest.mark.xfail(
+ WIN and sys.version_info < (3, 11),
+ reason="requires py3.11+ see: python/cpython#96290",
+ )
+ ],
+ ),
+ pytest.param(
+ "f:foo",
+ marks=[pytest.mark.xfail(WIN, reason="unsupported")],
+ id="relative-path-with-drive",
+ ),
+ ],
+)
+def test_make_path_posix_returns_absolute_paths(path):
+ posix_pth = make_path_posix(path)
+ assert os.path.isabs(posix_pth)
+
+
+@pytest.mark.parametrize("container_cls", [list, set, tuple])
+def test_make_path_posix_set_list_tuple(container_cls):
+ paths = container_cls(
+ [
+ "/foo/bar",
+ "bar/foo",
+ ]
+ )
+ posix_paths = make_path_posix(paths)
+ assert isinstance(posix_paths, container_cls)
+ assert posix_paths == container_cls(
+ [
+ make_path_posix("/foo/bar"),
+ make_path_posix("bar/foo"),
+ ]
+ )
+
+
+@pytest.mark.parametrize(
+ "obj",
+ [
+ 1,
+ True,
+ None,
+ object(),
+ ],
+)
+def test_make_path_posix_wrong_type(obj):
+ with pytest.raises(TypeError):
+ make_path_posix(obj)
+
+
+def test_parent():
+ if WIN:
+ assert LocalFileSystem._parent("C:\\file or folder") == "C:/"
+ assert LocalFileSystem._parent("C:\\") == "C:/"
+ else:
+ assert LocalFileSystem._parent("/file or folder") == "/"
+ assert LocalFileSystem._parent("/") == "/"
+
+
+@pytest.mark.parametrize(
+ "path,parent",
+ [
+ ("C:\\", "C:/"),
+ ("C:\\.", "C:/"),
+ ("C:\\.\\", "C:/"),
+ ("file:C:/", "C:/"),
+ ("file://C:/", "C:/"),
+ ("local:C:/", "C:/"),
+ ("local://C:/", "C:/"),
+ ("\\\\server\\share", "//server/share"),
+ ("\\\\server\\share\\", "//server/share"),
+ ("\\\\server\\share\\path", "//server/share"),
+ ("//server/share", "//server/share"),
+ ("//server/share/", "//server/share"),
+ ("//server/share/path", "//server/share"),
+ ("C:\\file or folder", "C:/"),
+ ("C:\\file or folder\\", "C:/"),
+ ("file:///", "{current_drive}/"),
+ ("file:///path", "{current_drive}/"),
+ ]
+ if WIN
+ else [
+ ("/", "/"),
+ ("/.", "/"),
+ ("/./", "/"),
+ ("file:/", "/"),
+ ("file:///", "/"),
+ ("local:/", "/"),
+ ("local:///", "/"),
+ ("/file or folder", "/"),
+ ("/file or folder/", "/"),
+ ("file:///path", "/"),
+ ("file://c/", "{cwd}"),
+ ],
+)
+def test_parent_edge_cases(path, parent, cwd, current_drive):
+ parent = parent.format(cwd=cwd, current_drive=current_drive)
+
+ assert LocalFileSystem._parent(path) == parent
+
+
+def test_linked_files(tmpdir):
+ tmpdir = str(tmpdir)
+ fn0 = os.path.join(tmpdir, "target")
+ fn1 = os.path.join(tmpdir, "link1")
+ fn2 = os.path.join(tmpdir, "link2")
+ data = b"my target data"
+ with open(fn0, "wb") as f:
+ f.write(data)
+ try:
+ os.symlink(fn0, fn1)
+ os.symlink(fn0, fn2)
+ except OSError:
+ if WIN:
+ pytest.xfail("Ran on win without admin permissions")
+ else:
+ raise
+
+ fs = LocalFileSystem()
+ assert fs.info(fn0)["type"] == "file"
+ assert fs.info(fn1)["type"] == "file"
+ assert fs.info(fn2)["type"] == "file"
+
+ assert not fs.info(fn0)["islink"]
+ assert fs.info(fn1)["islink"]
+ assert fs.info(fn2)["islink"]
+
+ assert fs.info(fn0)["size"] == len(data)
+ assert fs.info(fn1)["size"] == len(data)
+ assert fs.info(fn2)["size"] == len(data)
+
+ of = fsspec.open(fn1, "rb")
+ with of as f:
+ assert f.read() == data
+
+ of = fsspec.open(fn2, "rb")
+ with of as f:
+ assert f.read() == data
+
+
+def test_linked_files_exists(tmpdir):
+ origin = tmpdir / "original"
+ copy_file = tmpdir / "copy"
+
+ fs = LocalFileSystem()
+ fs.touch(origin)
+
+ try:
+ os.symlink(origin, copy_file)
+ except OSError:
+ if WIN:
+ pytest.xfail("Ran on win without admin permissions")
+ else:
+ raise
+
+ assert fs.exists(copy_file)
+ assert fs.lexists(copy_file)
+
+ os.unlink(origin)
+
+ assert not fs.exists(copy_file)
+ assert fs.lexists(copy_file)
+
+ os.unlink(copy_file)
+
+ assert not fs.exists(copy_file)
+ assert not fs.lexists(copy_file)
+
+
+def test_linked_directories(tmpdir):
+ tmpdir = str(tmpdir)
+
+ subdir0 = os.path.join(tmpdir, "target")
+ subdir1 = os.path.join(tmpdir, "link1")
+ subdir2 = os.path.join(tmpdir, "link2")
+
+ os.makedirs(subdir0)
+
+ try:
+ os.symlink(subdir0, subdir1)
+ os.symlink(subdir0, subdir2)
+ except OSError:
+ if WIN:
+ pytest.xfail("Ran on win without admin permissions")
+ else:
+ raise
+
+ fs = LocalFileSystem()
+ assert fs.info(subdir0)["type"] == "directory"
+ assert fs.info(subdir1)["type"] == "directory"
+ assert fs.info(subdir2)["type"] == "directory"
+
+ assert not fs.info(subdir0)["islink"]
+ assert fs.info(subdir1)["islink"]
+ assert fs.info(subdir2)["islink"]
+
+
+def test_isfilestore():
+ fs = LocalFileSystem(auto_mkdir=False)
+ assert fs._isfilestore()
+
+
+def test_pickle(tmpdir):
+ fs = LocalFileSystem()
+ tmpdir = str(tmpdir)
+ fn0 = os.path.join(tmpdir, "target")
+
+ with open(fn0, "wb") as f:
+ f.write(b"data")
+
+ f = fs.open(fn0, "rb")
+ f.seek(1)
+ f2 = pickle.loads(pickle.dumps(f))
+ assert f2.read() == f.read()
+
+ f = fs.open(fn0, "wb")
+ with pytest.raises(ValueError):
+ pickle.dumps(f)
+
+ # with context
+ with fs.open(fn0, "rb") as f:
+ f.seek(1)
+ f2 = pickle.loads(pickle.dumps(f))
+ assert f2.tell() == 1
+ assert f2.read() == f.read()
+
+ # with fsspec.open https://github.com/fsspec/filesystem_spec/issues/579
+ with fsspec.open(fn0, "rb") as f:
+ f.seek(1)
+ f2 = pickle.loads(pickle.dumps(f))
+ assert f2.tell() == 1
+ assert f2.read() == f.read()
+
+
+@pytest.mark.parametrize(
+ "uri, expected",
+ [
+ ("file://~/foo/bar", "{user_home}/foo/bar"),
+ ("~/foo/bar", "{user_home}/foo/bar"),
+ winonly("~\\foo\\bar", "{user_home}/foo/bar"),
+ winonly("file://~\\foo\\bar", "{user_home}/foo/bar"),
+ ],
+)
+def test_strip_protocol_expanduser(uri, expected, user_home):
+ expected = expected.format(user_home=user_home)
+
+ stripped = LocalFileSystem._strip_protocol(uri)
+ assert expected == stripped
+
+
+@pytest.mark.parametrize(
+ "uri, expected",
+ [
+ ("file://", "{cwd}"),
+ ("file://.", "{cwd}"),
+ ("file://./", "{cwd}"),
+ ("./", "{cwd}"),
+ ("file:path", "{cwd}/path"),
+ ("file://path", "{cwd}/path"),
+ ("path", "{cwd}/path"),
+ ("./path", "{cwd}/path"),
+ winonly(".\\", "{cwd}"),
+ winonly("file://.\\path", "{cwd}/path"),
+ ],
+)
+def test_strip_protocol_relative_paths(uri, expected, cwd):
+ expected = expected.format(cwd=cwd)
+
+ stripped = LocalFileSystem._strip_protocol(uri)
+ assert expected == stripped
+
+
+@pytest.mark.parametrize(
+ "uri, expected",
+ [
+ posixonly("file:/foo/bar", "/foo/bar"),
+ winonly("file:/foo/bar", "{current_drive}/foo/bar"),
+ winonly("file:\\foo\\bar", "{current_drive}/foo/bar"),
+ winonly("file:D:\\path\\file", "D:/path/file"),
+ winonly("file:/D:\\path\\file", "D:/path/file"),
+ winonly("file://D:\\path\\file", "D:/path/file"),
+ ],
+)
+def test_strip_protocol_no_authority(uri, expected, cwd, current_drive):
+ expected = expected.format(cwd=cwd, current_drive=current_drive)
+
+ stripped = LocalFileSystem._strip_protocol(uri)
+ assert expected == stripped
+
+
+@pytest.mark.parametrize(
+ "uri, expected",
+ [
+ ("file:/path", "/path"),
+ ("file:///path", "/path"),
+ ("file:////path", "//path"),
+ ("local:/path", "/path"),
+ ("s3://bucket/key", "{cwd}/s3://bucket/key"),
+ ("/path", "/path"),
+ ("file:///", "/"),
+ ]
+ if not WIN
+ else [
+ ("file:c:/path", "c:/path"),
+ ("file:/c:/path", "c:/path"),
+ ("file:/C:/path", "C:/path"),
+ ("file://c:/path", "c:/path"),
+ ("file:///c:/path", "c:/path"),
+ ("local:/path", "{current_drive}/path"),
+ ("s3://bucket/key", "{cwd}/s3://bucket/key"),
+ ("c:/path", "c:/path"),
+ ("c:\\path", "c:/path"),
+ ("file:///", "{current_drive}/"),
+ pytest.param(
+ "file://localhost/c:/path",
+ "c:/path",
+ marks=pytest.mark.xfail(
+ reason="rfc8089 section3 'localhost uri' not supported"
+ ),
+ ),
+ ],
+)
+def test_strip_protocol_absolute_paths(uri, expected, current_drive, cwd):
+ expected = expected.format(current_drive=current_drive, cwd=cwd)
+
+ stripped = LocalFileSystem._strip_protocol(uri)
+ assert expected == stripped
+
+
+@pytest.mark.parametrize(
+ "uri, expected",
+ [
+ ("file:c|/path", "c:/path"),
+ ("file:/D|/path", "D:/path"),
+ ("file:///C|/path", "C:/path"),
+ ],
+)
+@pytest.mark.skipif(not WIN, reason="Windows only")
+@pytest.mark.xfail(WIN, reason="legacy dos uris not supported")
+def test_strip_protocol_legacy_dos_uris(uri, expected):
+ stripped = LocalFileSystem._strip_protocol(uri)
+ assert expected == stripped
+
+
+@pytest.mark.parametrize(
+ "uri, stripped",
+ [
+ ("file://remote/share/pth", "{cwd}/remote/share/pth"),
+ ("file:////remote/share/pth", "//remote/share/pth"),
+ ("file://///remote/share/pth", "///remote/share/pth"),
+ ("//remote/share/pth", "//remote/share/pth"),
+ winonly("\\\\remote\\share\\pth", "//remote/share/pth"),
+ ],
+)
+def test_strip_protocol_windows_remote_shares(uri, stripped, cwd):
+ stripped = stripped.format(cwd=cwd)
+
+ assert LocalFileSystem._strip_protocol(uri) == stripped
+
+
+def test_mkdir_twice_faile(tmpdir):
+ fn = os.path.join(tmpdir, "test")
+ fs = fsspec.filesystem("file")
+ fs.mkdir(fn)
+ with pytest.raises(FileExistsError):
+ fs.mkdir(fn)
+
+
+def test_iterable(tmpdir):
+ data = b"a\nhello\noi"
+ fn = os.path.join(tmpdir, "test")
+ with open(fn, "wb") as f:
+ f.write(data)
+ of = fsspec.open(f"file://{fn}", "rb")
+ with of as f:
+ out = list(f)
+ assert b"".join(out) == data
+
+
+def test_mv_empty(tmpdir):
+ localfs = fsspec.filesystem("file")
+ src = os.path.join(str(tmpdir), "src")
+ dest = os.path.join(str(tmpdir), "dest")
+ assert localfs.isdir(src) is False
+ localfs.mkdir(src)
+ assert localfs.isdir(src)
+ localfs.move(src, dest, recursive=True)
+ assert localfs.isdir(src) is False
+ assert localfs.isdir(dest)
+ assert localfs.info(dest)
+
+
+def test_mv_recursive(tmpdir):
+ localfs = fsspec.filesystem("file")
+ src = os.path.join(str(tmpdir), "src")
+ dest = os.path.join(str(tmpdir), "dest")
+ assert localfs.isdir(src) is False
+ localfs.mkdir(src)
+ assert localfs.isdir(src)
+ localfs.touch(os.path.join(src, "afile"))
+ localfs.move(src, dest, recursive=True)
+ assert localfs.isdir(src) is False
+ assert localfs.isdir(dest)
+ assert localfs.info(os.path.join(dest, "afile"))
+
+
+@pytest.mark.xfail(WIN, reason="windows expand path to be revisited")
+def test_copy_errors(tmpdir):
+ localfs = fsspec.filesystem("file", auto_mkdir=True)
+
+ dest1 = os.path.join(str(tmpdir), "dest1")
+ dest2 = os.path.join(str(tmpdir), "dest2")
+
+ src = os.path.join(str(tmpdir), "src")
+ file1 = os.path.join(src, "afile1")
+ file2 = os.path.join(src, "afile2")
+ dne = os.path.join(str(tmpdir), "src", "notafile")
+
+ localfs.mkdir(src)
+ localfs.mkdir(dest1)
+ localfs.mkdir(dest2)
+ localfs.touch(file1)
+ localfs.touch(file2)
+
+ # Non recursive should raise an error unless we specify ignore
+ with pytest.raises(FileNotFoundError):
+ localfs.copy([file1, file2, dne], dest1)
+
+ localfs.copy([file1, file2, dne], dest1, on_error="ignore")
+
+ assert sorted(localfs.ls(dest1)) == [
+ make_path_posix(os.path.join(dest1, "afile1")),
+ make_path_posix(os.path.join(dest1, "afile2")),
+ ]
+
+ # Recursive should raise an error only if we specify raise
+ # the patch simulates the filesystem finding a file that does not
+ # exist in the directory
+ current_files = localfs.expand_path(src, recursive=True)
+ with patch.object(localfs, "expand_path", return_value=current_files + [dne]):
+ with pytest.raises(FileNotFoundError):
+ localfs.copy(src + "/", dest2, recursive=True, on_error="raise")
+
+ localfs.copy(src + "/", dest2, recursive=True)
+ assert sorted(localfs.ls(dest2)) == [
+ make_path_posix(os.path.join(dest2, "afile1")),
+ make_path_posix(os.path.join(dest2, "afile2")),
+ ]
+
+
+def test_transaction(tmpdir):
+ file = str(tmpdir / "test.txt")
+ fs = LocalFileSystem()
+
+ with fs.transaction:
+ content = "hello world"
+ with fs.open(file, "w") as fp:
+ fp.write(content)
+
+ with fs.open(file, "r") as fp:
+ read_content = fp.read()
+
+ assert content == read_content
+
+
+def test_delete_cwd(tmpdir):
+ cwd = os.getcwd()
+ fs = LocalFileSystem()
+ try:
+ os.chdir(tmpdir)
+ with pytest.raises(ValueError):
+ fs.rm(".", recursive=True)
+ finally:
+ os.chdir(cwd)
+
+
+def test_delete_non_recursive_dir_fails(tmpdir):
+ fs = LocalFileSystem()
+ subdir = os.path.join(tmpdir, "testdir")
+ fs.mkdir(subdir)
+ with pytest.raises(ValueError):
+ fs.rm(subdir)
+ fs.rm(subdir, recursive=True)
+
+
+@pytest.mark.parametrize(
+ "opener, ext", [(bz2.open, ".bz2"), (gzip.open, ".gz"), (open, "")]
+)
+def test_infer_compression(tmpdir, opener, ext):
+ filename = str(tmpdir / f"test{ext}")
+ content = b"hello world"
+ with opener(filename, "wb") as fp:
+ fp.write(content)
+
+ fs = LocalFileSystem()
+ with fs.open(f"file://{filename}", "rb", compression="infer") as fp:
+ read_content = fp.read()
+
+ assert content == read_content
+
+
+def test_info_path_like(tmpdir):
+ path = Path(tmpdir / "test_info")
+ path.write_text("fsspec")
+
+ fs = LocalFileSystem()
+ assert fs.exists(path)
+
+
+def test_seekable(tmpdir):
+ fs = LocalFileSystem()
+ tmpdir = str(tmpdir)
+ fn0 = os.path.join(tmpdir, "target")
+
+ with open(fn0, "wb") as f:
+ f.write(b"data")
+
+ f = fs.open(fn0, "rt")
+ assert f.seekable(), "file is not seekable"
+ f.seek(1)
+ assert f.read(1) == "a"
+ assert f.tell() == 2
+
+
+def test_numpy_fromfile(tmpdir):
+ # Regression test for #1005.
+ np = pytest.importorskip("numpy")
+ fn = str(tmpdir / "test_arr.npy")
+ dt = np.int64
+ arr = np.arange(10, dtype=dt)
+ arr.tofile(fn)
+ assert np.array_equal(np.fromfile(fn, dtype=dt), arr)
+
+
+def test_link(tmpdir):
+ target = os.path.join(tmpdir, "target")
+ link = os.path.join(tmpdir, "link")
+
+ fs = LocalFileSystem()
+ fs.touch(target)
+
+ fs.link(target, link)
+ assert fs.info(link)["nlink"] > 1
+
+
+def test_symlink(tmpdir):
+ target = os.path.join(tmpdir, "target")
+ link = os.path.join(tmpdir, "link")
+
+ fs = LocalFileSystem()
+ fs.touch(target)
+ try:
+ fs.symlink(target, link)
+ except OSError as e:
+ if "[WinError 1314]" in str(e):
+ # Windows requires developer mode to be enabled to use symbolic links
+ return
+ raise
+ assert fs.islink(link)
+
+
+# https://github.com/fsspec/filesystem_spec/issues/967
+def test_put_file_to_dir(tmpdir):
+ src_file = os.path.join(str(tmpdir), "src")
+ target_dir = os.path.join(str(tmpdir), "target")
+ target_file = os.path.join(target_dir, "src")
+
+ fs = LocalFileSystem()
+ fs.touch(src_file)
+ fs.mkdir(target_dir)
+ fs.put(src_file, target_dir)
+
+ assert fs.isfile(target_file)
+
+
+def test_du(tmpdir):
+ file = tmpdir / "file"
+ subdir = tmpdir / "subdir"
+ subfile = subdir / "subfile"
+
+ fs = LocalFileSystem()
+ with open(file, "wb") as f:
+ f.write(b"4444")
+ fs.mkdir(subdir)
+ with open(subfile, "wb") as f:
+ f.write(b"7777777")
+
+ # Switch to posix paths for comparisons
+ tmpdir_posix = Path(tmpdir).as_posix()
+ file_posix = Path(file).as_posix()
+ subdir_posix = Path(subdir).as_posix()
+ subfile_posix = Path(subfile).as_posix()
+
+ assert fs.du(tmpdir) == 11
+ assert fs.du(tmpdir, total=False) == {file_posix: 4, subfile_posix: 7}
+ # Note directory size is OS-specific, but must be >= 0
+ assert fs.du(tmpdir, withdirs=True) >= 11
+
+ d = fs.du(tmpdir, total=False, withdirs=True)
+ assert len(d) == 4
+ assert d[file_posix] == 4
+ assert d[subfile_posix] == 7
+ assert d[tmpdir_posix] >= 0
+ assert d[subdir_posix] >= 0
+
+ assert fs.du(tmpdir, maxdepth=2) == 11
+ assert fs.du(tmpdir, maxdepth=1) == 4
+ with pytest.raises(ValueError):
+ fs.du(tmpdir, maxdepth=0)
+
+ # Size of file only.
+ assert fs.du(file) == 4
+ assert fs.du(file, withdirs=True) == 4
+
+
+@pytest.mark.parametrize("funcname", ["cp", "get", "put"])
+def test_cp_get_put_directory_recursive(tmpdir, funcname):
+ # https://github.com/fsspec/filesystem_spec/issues/1062
+ # Recursive cp/get/put of source directory into non-existent target directory.
+ fs = LocalFileSystem()
+ src = os.path.join(str(tmpdir), "src")
+ fs.mkdir(src)
+ fs.touch(os.path.join(src, "file"))
+
+ target = os.path.join(str(tmpdir), "target")
+
+ if funcname == "cp":
+ func = fs.cp
+ elif funcname == "get":
+ func = fs.get
+ elif funcname == "put":
+ func = fs.put
+
+ # cp/get/put without slash
+ assert not fs.exists(target)
+ for loop in range(2):
+ func(src, target, recursive=True)
+ assert fs.isdir(target)
+
+ if loop == 0:
+ assert fs.find(target) == [make_path_posix(os.path.join(target, "file"))]
+ else:
+ assert sorted(fs.find(target)) == [
+ make_path_posix(os.path.join(target, "file")),
+ make_path_posix(os.path.join(target, "src", "file")),
+ ]
+
+ fs.rm(target, recursive=True)
+
+ # cp/get/put with slash
+ assert not fs.exists(target)
+ for loop in range(2):
+ func(src + "/", target, recursive=True)
+ assert fs.isdir(target)
+ assert fs.find(target) == [make_path_posix(os.path.join(target, "file"))]
+
+
+@pytest.mark.parametrize("funcname", ["cp", "get", "put"])
+def test_cp_get_put_empty_directory(tmpdir, funcname):
+ # https://github.com/fsspec/filesystem_spec/issues/1198
+ # cp/get/put of empty directory.
+ fs = LocalFileSystem(auto_mkdir=True)
+ empty = os.path.join(str(tmpdir), "empty")
+ fs.mkdir(empty)
+
+ target = os.path.join(str(tmpdir), "target")
+ fs.mkdir(target)
+
+ if funcname == "cp":
+ func = fs.cp
+ elif funcname == "get":
+ func = fs.get
+ elif funcname == "put":
+ func = fs.put
+
+ # cp/get/put without slash, target directory exists
+ assert fs.isdir(target)
+ func(empty, target)
+ assert fs.find(target, withdirs=True) == [make_path_posix(target)]
+
+ # cp/get/put with slash, target directory exists
+ assert fs.isdir(target)
+ func(empty + "/", target)
+ assert fs.find(target, withdirs=True) == [make_path_posix(target)]
+
+ fs.rmdir(target)
+
+ # cp/get/put without slash, target directory doesn't exist
+ assert not fs.isdir(target)
+ func(empty, target)
+ assert not fs.isdir(target)
+
+ # cp/get/put with slash, target directory doesn't exist
+ assert not fs.isdir(target)
+ func(empty + "/", target)
+ assert not fs.isdir(target)
+
+
+def test_cp_two_files(tmpdir):
+ fs = LocalFileSystem(auto_mkdir=True)
+ src = os.path.join(str(tmpdir), "src")
+ file0 = os.path.join(src, "file0")
+ file1 = os.path.join(src, "file1")
+ fs.mkdir(src)
+ fs.touch(file0)
+ fs.touch(file1)
+
+ target = os.path.join(str(tmpdir), "target")
+ assert not fs.exists(target)
+
+ fs.cp([file0, file1], target)
+
+ assert fs.isdir(target)
+ assert sorted(fs.find(target)) == [
+ make_path_posix(os.path.join(target, "file0")),
+ make_path_posix(os.path.join(target, "file1")),
+ ]
+
+
+@pytest.mark.skipif(WIN, reason="Windows does not support colons in filenames")
+def test_issue_1447():
+ files_with_colons = {
+ ".local:file:with:colons.txt": b"content1",
+ ".colons-after-extension.txt:after": b"content2",
+ ".colons-after-extension/file:colon.txt:before/after": b"content3",
+ }
+ with filetexts(files_with_colons, mode="b"):
+ for file, contents in files_with_colons.items():
+ with fsspec.filesystem("file").open(file, "rb") as f:
+ assert f.read() == contents
+
+ fs, urlpath = fsspec.core.url_to_fs(file)
+ assert isinstance(fs, fsspec.implementations.local.LocalFileSystem)
+ with fs.open(urlpath, "rb") as f:
+ assert f.read() == contents
diff --git a/fsspec/implementations/tests/test_memory.py b/fsspec/implementations/tests/test_memory.py
index 105eeb8..600022a 100644
--- a/fsspec/implementations/tests/test_memory.py
+++ b/fsspec/implementations/tests/test_memory.py
@@ -1,4 +1,382 @@
import os
from pathlib import PurePosixPath, PureWindowsPath
+
import pytest
+
from fsspec.implementations.local import LocalFileSystem, make_path_posix
+
+
+def test_1(m):
+ m.touch("/somefile") # NB: is found with or without initial /
+ m.touch("afiles/and/another")
+ files = m.find("")
+ assert files == ["/afiles/and/another", "/somefile"]
+
+ files = sorted(m.get_mapper())
+ assert files == ["afiles/and/another", "somefile"]
+
+
+def test_strip(m):
+ assert m._strip_protocol("") == ""
+ assert m._strip_protocol("memory://") == ""
+ assert m._strip_protocol("afile") == "/afile"
+ assert m._strip_protocol("/b/c") == "/b/c"
+ assert m._strip_protocol("/b/c/") == "/b/c"
+
+
+def test_ls(m):
+ m.mkdir("/dir")
+ m.mkdir("/dir/dir1")
+
+ m.touch("/dir/afile")
+ m.touch("/dir/dir1/bfile")
+ m.touch("/dir/dir1/cfile")
+
+ assert m.ls("/", False) == ["/dir"]
+ assert m.ls("/dir", False) == ["/dir/afile", "/dir/dir1"]
+ assert m.ls("/dir", True)[0]["type"] == "file"
+ assert m.ls("/dir", True)[1]["type"] == "directory"
+ assert m.ls("/dir/afile", False) == ["/dir/afile"]
+ assert m.ls("/dir/afile", True)[0]["type"] == "file"
+
+ assert len(m.ls("/dir/dir1")) == 2
+ assert len(m.ls("/dir/afile")) == 1
+
+
+def test_directories(m):
+ m.mkdir("outer/inner")
+ assert m.info("outer/inner")["type"] == "directory"
+
+ assert m.ls("outer")
+ assert m.ls("outer/inner") == []
+
+ with pytest.raises(OSError):
+ m.rmdir("outer")
+
+ m.rmdir("outer/inner")
+ m.rmdir("outer")
+
+ assert not m.store
+
+
+def test_exists_isdir_isfile(m):
+ m.mkdir("/root")
+ m.touch("/root/a")
+
+ assert m.exists("/root")
+ assert m.isdir("/root")
+ assert not m.isfile("/root")
+
+ assert m.exists("/root/a")
+ assert m.isfile("/root/a")
+ assert not m.isdir("/root/a")
+
+ assert not m.exists("/root/not-exists")
+ assert not m.isfile("/root/not-exists")
+ assert not m.isdir("/root/not-exists")
+
+ m.rm("/root/a")
+ m.rmdir("/root")
+
+ assert not m.exists("/root")
+
+ m.touch("/a/b")
+ assert m.isfile("/a/b")
+
+ assert m.exists("/a")
+ assert m.isdir("/a")
+ assert not m.isfile("/a")
+
+
+def test_touch(m):
+ m.touch("/root/a")
+ with pytest.raises(FileExistsError):
+ m.touch("/root/a/b")
+ with pytest.raises(FileExistsError):
+ m.touch("/root/a/b/c")
+ assert not m.exists("/root/a/b/")
+
+
+def test_mv_recursive(m):
+ m.mkdir("src")
+ m.touch("src/file.txt")
+ m.mv("src", "dest", recursive=True)
+ assert m.exists("dest/file.txt")
+ assert not m.exists("src")
+
+
+def test_mv_same_paths(m):
+ m.mkdir("src")
+ m.touch("src/file.txt")
+ m.mv("src", "src", recursive=True)
+ assert m.exists("src/file.txt")
+
+
+def test_rm_no_pseudo_dir(m):
+ m.touch("/dir1/dir2/file")
+ m.rm("/dir1", recursive=True)
+ assert not m.exists("/dir1/dir2/file")
+ assert not m.exists("/dir1/dir2")
+ assert not m.exists("/dir1")
+
+ with pytest.raises(FileNotFoundError):
+ m.rm("/dir1", recursive=True)
+
+
+def test_rewind(m):
+ # https://github.com/fsspec/filesystem_spec/issues/349
+ with m.open("src/file.txt", "w") as f:
+ f.write("content")
+ with m.open("src/file.txt") as f:
+ assert f.tell() == 0
+
+
+def test_empty_raises(m):
+ with pytest.raises(FileNotFoundError):
+ m.ls("nonexistent")
+
+ with pytest.raises(FileNotFoundError):
+ m.info("nonexistent")
+
+
+def test_dir_errors(m):
+ m.mkdir("/first")
+
+ with pytest.raises(FileExistsError):
+ m.mkdir("/first")
+ with pytest.raises(FileExistsError):
+ m.makedirs("/first", exist_ok=False)
+ m.makedirs("/first", exist_ok=True)
+ m.makedirs("/first/second/third")
+ assert "/first/second" in m.pseudo_dirs
+
+ m.touch("/afile")
+ with pytest.raises(NotADirectoryError):
+ m.mkdir("/afile/nodir")
+
+
+def test_no_rewind_append_mode(m):
+ # https://github.com/fsspec/filesystem_spec/issues/349
+ with m.open("src/file.txt", "w") as f:
+ f.write("content")
+ with m.open("src/file.txt", "a") as f:
+ assert f.tell() == 7
+
+
+def test_moves(m):
+ m.touch("source.txt")
+ m.mv("source.txt", "target.txt")
+
+ m.touch("source2.txt")
+ m.mv("source2.txt", "target2.txt", recursive=True)
+ assert m.find("") == ["/target.txt", "/target2.txt"]
+
+
+def test_rm_recursive_empty_subdir(m):
+ # https://github.com/fsspec/filesystem_spec/issues/500
+ m.mkdir("recdir")
+ m.mkdir("recdir/subdir2")
+ m.rm("recdir/", recursive=True)
+ assert not m.exists("dir")
+
+
+def test_seekable(m):
+ fn0 = "foo.txt"
+ with m.open(fn0, "wb") as f:
+ f.write(b"data")
+
+ f = m.open(fn0, "rt")
+ assert f.seekable(), "file is not seekable"
+ f.seek(1)
+ assert f.read(1) == "a"
+ assert f.tell() == 2
+
+
+# https://github.com/fsspec/filesystem_spec/issues/1425
+@pytest.mark.parametrize("mode", ["r", "rb", "w", "wb", "ab", "r+b"])
+def test_open_mode(m, mode):
+ filename = "mode.txt"
+ m.touch(filename)
+ with m.open(filename, mode=mode) as _:
+ pass
+
+
+def test_remove_all(m):
+ m.touch("afile")
+ m.rm("/", recursive=True)
+ assert not m.ls("/")
+
+
+def test_cp_directory_recursive(m):
+ # https://github.com/fsspec/filesystem_spec/issues/1062
+ # Recursive cp/get/put of source directory into non-existent target directory.
+ src = "/src"
+ src_file = src + "/file"
+ m.mkdir(src)
+ m.touch(src_file)
+
+ target = "/target"
+
+ # cp without slash
+ assert not m.exists(target)
+ for loop in range(2):
+ m.cp(src, target, recursive=True)
+ assert m.isdir(target)
+
+ if loop == 0:
+ correct = [target + "/file"]
+ assert m.find(target) == correct
+ else:
+ correct = [target + "/file", target + "/src/file"]
+ assert sorted(m.find(target)) == correct
+
+ m.rm(target, recursive=True)
+
+ # cp with slash
+ assert not m.exists(target)
+ for loop in range(2):
+ m.cp(src + "/", target, recursive=True)
+ assert m.isdir(target)
+ correct = [target + "/file"]
+ assert m.find(target) == correct
+
+
+def test_get_directory_recursive(m, tmpdir):
+ # https://github.com/fsspec/filesystem_spec/issues/1062
+ # Recursive cp/get/put of source directory into non-existent target directory.
+ src = "/src"
+ src_file = src + "/file"
+ m.mkdir(src)
+ m.touch(src_file)
+
+ target = os.path.join(tmpdir, "target")
+ target_fs = LocalFileSystem()
+
+ # get without slash
+ assert not target_fs.exists(target)
+ for loop in range(2):
+ m.get(src, target, recursive=True)
+ assert target_fs.isdir(target)
+
+ if loop == 0:
+ correct = [make_path_posix(os.path.join(target, "file"))]
+ assert target_fs.find(target) == correct
+ else:
+ correct = [
+ make_path_posix(os.path.join(target, "file")),
+ make_path_posix(os.path.join(target, "src", "file")),
+ ]
+ assert sorted(target_fs.find(target)) == correct
+
+ target_fs.rm(target, recursive=True)
+
+ # get with slash
+ assert not target_fs.exists(target)
+ for loop in range(2):
+ m.get(src + "/", target, recursive=True)
+ assert target_fs.isdir(target)
+ correct = [make_path_posix(os.path.join(target, "file"))]
+ assert target_fs.find(target) == correct
+
+
+def test_put_directory_recursive(m, tmpdir):
+ # https://github.com/fsspec/filesystem_spec/issues/1062
+ # Recursive cp/get/put of source directory into non-existent target directory.
+ src = os.path.join(tmpdir, "src")
+ src_file = os.path.join(src, "file")
+ source_fs = LocalFileSystem()
+ source_fs.mkdir(src)
+ source_fs.touch(src_file)
+
+ target = "/target"
+
+ # put without slash
+ assert not m.exists(target)
+ for loop in range(2):
+ m.put(src, target, recursive=True)
+ assert m.isdir(target)
+
+ if loop == 0:
+ correct = [target + "/file"]
+ assert m.find(target) == correct
+ else:
+ correct = [target + "/file", target + "/src/file"]
+ assert sorted(m.find(target)) == correct
+
+ m.rm(target, recursive=True)
+
+ # put with slash
+ assert not m.exists(target)
+ for loop in range(2):
+ m.put(src + "/", target, recursive=True)
+ assert m.isdir(target)
+ correct = [target + "/file"]
+ assert m.find(target) == correct
+
+
+def test_cp_empty_directory(m):
+ # https://github.com/fsspec/filesystem_spec/issues/1198
+ # cp/get/put of empty directory.
+ empty = "/src/empty"
+ m.mkdir(empty)
+
+ target = "/target"
+ m.mkdir(target)
+
+ # cp without slash, target directory exists
+ assert m.isdir(target)
+ m.cp(empty, target)
+ assert m.find(target, withdirs=True) == [target]
+
+ # cp with slash, target directory exists
+ assert m.isdir(target)
+ m.cp(empty + "/", target)
+ assert m.find(target, withdirs=True) == [target]
+
+ m.rmdir(target)
+
+ # cp without slash, target directory doesn't exist
+ assert not m.isdir(target)
+ m.cp(empty, target)
+ assert not m.isdir(target)
+
+ # cp with slash, target directory doesn't exist
+ assert not m.isdir(target)
+ m.cp(empty + "/", target)
+ assert not m.isdir(target)
+
+
+def test_cp_two_files(m):
+ src = "/src"
+ file0 = src + "/file0"
+ file1 = src + "/file1"
+ m.mkdir(src)
+ m.touch(file0)
+ m.touch(file1)
+
+ target = "/target"
+ assert not m.exists(target)
+
+ m.cp([file0, file1], target)
+
+ assert m.isdir(target)
+ assert sorted(m.find(target)) == [
+ "/target/file0",
+ "/target/file1",
+ ]
+
+
+def test_open_path_posix(m):
+ path = PurePosixPath("/myfile/foo/bar")
+ with m.open(path, "wb") as f:
+ f.write(b"some\nlines\nof\ntext")
+
+ assert m.read_text(path) == "some\nlines\nof\ntext"
+
+
+def test_open_path_windows(m):
+ path = PureWindowsPath("C:\\myfile\\foo\\bar")
+ with m.open(path, "wb") as f:
+ f.write(b"some\nlines\nof\ntext")
+
+ assert m.read_text(path) == "some\nlines\nof\ntext"
diff --git a/fsspec/implementations/tests/test_reference.py b/fsspec/implementations/tests/test_reference.py
index 99b84b2..762d831 100644
--- a/fsspec/implementations/tests/test_reference.py
+++ b/fsspec/implementations/tests/test_reference.py
@@ -1,10 +1,192 @@
import json
import os
+
import pytest
+
import fsspec
from fsspec.implementations.local import LocalFileSystem
-from fsspec.implementations.reference import LazyReferenceMapper, ReferenceFileSystem, ReferenceNotReachable
-from fsspec.tests.conftest import data, realfile, reset_files, server, win
+from fsspec.implementations.reference import (
+ LazyReferenceMapper,
+ ReferenceFileSystem,
+ ReferenceNotReachable,
+)
+from fsspec.tests.conftest import data, realfile, reset_files, server, win # noqa: F401
+
+
+def test_simple(server): # noqa: F811
+ # The dictionary in refs may be dumped with a different separator
+ # depending on whether json or ujson is imported
+ from fsspec.implementations.reference import json as json_impl
+
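+    # reference values may be raw bytes, (url, offset, length) tuples,
+    # base64-encoded data, or JSON-serializable objects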
+ refs = {
+ "a": b"data",
+ "b": (realfile, 0, 5),
+ "c": (realfile, 1, 5),
+ "d": b"base64:aGVsbG8=",
+ "e": {"key": "value"},
+ }
+ h = fsspec.filesystem("http")
+ fs = fsspec.filesystem("reference", fo=refs, fs=h)
+
+ assert fs.cat("a") == b"data"
+ assert fs.cat("b") == data[:5]
+ assert fs.cat("c") == data[1 : 1 + 5]
+ assert fs.cat("d") == b"hello"
+ assert fs.cat("e") == json_impl.dumps(refs["e"]).encode("utf-8")
+ with fs.open("d", "rt") as f:
+ assert f.read(2) == "he"
+
+
+def test_simple_ver1(server): # noqa: F811
+ # The dictionary in refs may be dumped with a different separator
+ # depending on whether json or ujson is imported
+ from fsspec.implementations.reference import json as json_impl
+
+ in_data = {
+ "version": 1,
+ "refs": {
+ "a": b"data",
+ "b": (realfile, 0, 5),
+ "c": (realfile, 1, 5),
+ "d": b"base64:aGVsbG8=",
+ "e": {"key": "value"},
+ },
+ }
+ h = fsspec.filesystem("http")
+ fs = fsspec.filesystem("reference", fo=in_data, fs=h)
+
+ assert fs.cat("a") == b"data"
+ assert fs.cat("b") == data[:5]
+ assert fs.cat("c") == data[1 : 1 + 5]
+ assert fs.cat("d") == b"hello"
+ assert fs.cat("e") == json_impl.dumps(in_data["refs"]["e"]).encode("utf-8")
+ with fs.open("d", "rt") as f:
+ assert f.read(2) == "he"
+
+
+def test_target_options(m):
+ m.pipe("data/0", b"hello")
+ refs = {"a": ["memory://data/0"]}
+ fn = "memory://refs.json.gz"
+ with fsspec.open(fn, "wt", compression="gzip") as f:
+ json.dump(refs, f)
+
+ fs = fsspec.filesystem("reference", fo=fn, target_options={"compression": "gzip"})
+ assert fs.cat("a") == b"hello"
+
+
+def test_ls(server): # noqa: F811
+ refs = {"a": b"data", "b": (realfile, 0, 5), "c/d": (realfile, 1, 6)}
+ h = fsspec.filesystem("http")
+ fs = fsspec.filesystem("reference", fo=refs, fs=h)
+
+ assert fs.ls("", detail=False) == ["a", "b", "c"]
+ assert {"name": "c", "type": "directory", "size": 0} in fs.ls("", detail=True)
+ assert fs.find("") == ["a", "b", "c/d"]
+ assert fs.find("", withdirs=True) == ["a", "b", "c", "c/d"]
+ assert fs.find("c", detail=True) == {
+ "c/d": {"name": "c/d", "size": 6, "type": "file"}
+ }
+
+
+def test_nested_dirs_ls():
+ # issue #1430
+ refs = {"a": "A", "B/C/b": "B", "B/C/d": "d", "B/_": "_"}
+ fs = fsspec.filesystem("reference", fo=refs)
+ assert len(fs.ls("")) == 2
+ assert {e["name"] for e in fs.ls("")} == {"a", "B"}
+ assert len(fs.ls("B")) == 2
+ assert {e["name"] for e in fs.ls("B")} == {"B/C", "B/_"}
+
+
+def test_info(server): # noqa: F811
+ refs = {
+ "a": b"data",
+ "b": (realfile, 0, 5),
+ "c/d": (realfile, 1, 6),
+ "e": (realfile,),
+ }
+ h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"})
+ fs = fsspec.filesystem("reference", fo=refs, fs=h)
+ assert fs.size("a") == 4
+ assert fs.size("b") == 5
+ assert fs.size("c/d") == 6
+ assert fs.info("e")["size"] == len(data)
+
+
+def test_mutable(server, m):
+ refs = {
+ "a": b"data",
+ "b": (realfile, 0, 5),
+ "c/d": (realfile, 1, 6),
+ "e": (realfile,),
+ }
+ h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"})
+ fs = fsspec.filesystem("reference", fo=refs, fs=h)
+ fs.rm("a")
+ assert not fs.exists("a")
+
+ bin_data = b"bin data"
+ fs.pipe("aa", bin_data)
+ assert fs.cat("aa") == bin_data
+
+ fs.save_json("memory://refs.json")
+ assert m.exists("refs.json")
+
+ fs = fsspec.filesystem("reference", fo="memory://refs.json", remote_protocol="http")
+ assert not fs.exists("a")
+ assert fs.cat("aa") == bin_data
+
+
+def test_put_get(tmpdir):
+ d1 = f"{tmpdir}/d1"
+ os.mkdir(d1)
+ with open(f"{d1}/a", "wb") as f:
+ f.write(b"1")
+ with open(f"{d1}/b", "wb") as f:
+ f.write(b"2")
+ d2 = f"{tmpdir}/d2"
+
+ fs = fsspec.filesystem("reference", fo={}, remote_protocol="file")
+ fs.put(d1, "out", recursive=True)
+
+ fs.get("out", d2, recursive=True)
+ assert open(f"{d2}/a", "rb").read() == b"1"
+ assert open(f"{d2}/b", "rb").read() == b"2"
+
+
+def test_put_get_single(tmpdir):
+ d1 = f"{tmpdir}/f1"
+ d2 = f"{tmpdir}/f2"
+ with open(d1, "wb") as f:
+ f.write(b"1")
+
+ # skip instance cache since this is the same kwargs as previous test
+ fs = fsspec.filesystem(
+ "reference", fo={}, remote_protocol="file", skip_instance_cache=True
+ )
+ fs.put_file(d1, "out")
+
+ fs.get_file("out", d2)
+ assert open(d2, "rb").read() == b"1"
+ fs.pipe({"hi": b"data"})
+ assert fs.cat("hi") == b"data"
+
+
+def test_defaults(server): # noqa: F811
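+    # a reference with a None URL falls back to the configured "target" file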
+ refs = {"a": b"data", "b": (None, 0, 5)}
+ fs = fsspec.filesystem(
+ "reference",
+ fo=refs,
+ target_protocol="http",
+ target=realfile,
+ remote_protocol="http",
+ )
+
+ assert fs.cat("a") == b"data"
+ assert fs.cat("b") == data[:5]
+
+
jdata = """{
"metadata": {
".zattrs": {
@@ -43,3 +225,537 @@ jdata = """{
"zarr_consolidated_format": 1
}
"""
+
+
+def test_spec1_expand():
+ pytest.importorskip("jinja2")
+ from fsspec.implementations.reference import json as json_impl
+
+ in_data = {
+ "version": 1,
+ "templates": {"u": "server.domain/path", "f": "{{c}}"},
+ "gen": [
+ {
+ "key": "gen_key{{i}}",
+ "url": "http://{{u}}_{{i}}",
+ "offset": "{{(i + 1) * 1000}}",
+ "length": "1000",
+ "dimensions": {"i": {"stop": 5}},
+ },
+ {
+ "key": "gen_key{{i}}",
+ "url": "http://{{u}}_{{i}}",
+ "dimensions": {"i": {"start": 5, "stop": 7}},
+ },
+ ],
+ "refs": {
+ "key0": "data",
+ "key1": ["http://target_url", 10000, 100],
+ "key2": ["http://{{u}}", 10000, 100],
+ "key3": ["http://{{f(c='text')}}", 10000, 100],
+ "key4": ["http://target_url"],
+ "key5": {"key": "value"},
+ },
+ }
+ fs = fsspec.filesystem(
+ "reference", fo=in_data, target_protocol="http", simple_templates=False
+ )
+ assert fs.references == {
+ "key0": "data",
+ "key1": ["http://target_url", 10000, 100],
+ "key2": ["http://server.domain/path", 10000, 100],
+ "key3": ["http://text", 10000, 100],
+ "key4": ["http://target_url"],
+ "key5": json_impl.dumps(in_data["refs"]["key5"]),
+ "gen_key0": ["http://server.domain/path_0", 1000, 1000],
+ "gen_key1": ["http://server.domain/path_1", 2000, 1000],
+ "gen_key2": ["http://server.domain/path_2", 3000, 1000],
+ "gen_key3": ["http://server.domain/path_3", 4000, 1000],
+ "gen_key4": ["http://server.domain/path_4", 5000, 1000],
+ "gen_key5": ["http://server.domain/path_5"],
+ "gen_key6": ["http://server.domain/path_6"],
+ }
+
+
+def test_spec1_expand_simple():
+ pytest.importorskip("jinja2")
+ from fsspec.implementations.reference import json as json_impl
+
+ in_data = {
+ "version": 1,
+ "templates": {"u": "server.domain/path"},
+ "refs": {
+ "key0": "base64:ZGF0YQ==",
+ "key2": ["http://{{u}}", 10000, 100],
+ "key4": ["http://target_url"],
+ "key5": {"key": "value"},
+ },
+ }
+ fs = fsspec.filesystem("reference", fo=in_data, target_protocol="http")
+ assert fs.references["key2"] == ["http://server.domain/path", 10000, 100]
+ fs = fsspec.filesystem(
+ "reference",
+ fo=in_data,
+ target_protocol="http",
+ template_overrides={"u": "not.org/p"},
+ )
+ assert fs.references["key2"] == ["http://not.org/p", 10000, 100]
+ assert fs.cat("key0") == b"data"
+ assert fs.cat("key5") == json_impl.dumps(in_data["refs"]["key5"]).encode("utf-8")
+
+
+def test_spec1_gen_variants():
+ pytest.importorskip("jinja2")
+ with pytest.raises(ValueError):
+ missing_length_spec = {
+ "version": 1,
+ "templates": {"u": "server.domain/path"},
+ "gen": [
+ {
+ "key": "gen_key{{i}}",
+ "url": "http://{{u}}_{{i}}",
+ "offset": "{{(i + 1) * 1000}}",
+ "dimensions": {"i": {"stop": 2}},
+ },
+ ],
+ }
+ fsspec.filesystem("reference", fo=missing_length_spec, target_protocol="http")
+
+ with pytest.raises(ValueError):
+ missing_offset_spec = {
+ "version": 1,
+ "templates": {"u": "server.domain/path"},
+ "gen": [
+ {
+ "key": "gen_key{{i}}",
+ "url": "http://{{u}}_{{i}}",
+ "length": "1000",
+ "dimensions": {"i": {"stop": 2}},
+ },
+ ],
+ }
+ fsspec.filesystem("reference", fo=missing_offset_spec, target_protocol="http")
+
+ url_only_gen_spec = {
+ "version": 1,
+ "templates": {"u": "server.domain/path"},
+ "gen": [
+ {
+ "key": "gen_key{{i}}",
+ "url": "http://{{u}}_{{i}}",
+ "dimensions": {"i": {"stop": 2}},
+ },
+ ],
+ }
+
+ fs = fsspec.filesystem("reference", fo=url_only_gen_spec, target_protocol="http")
+ assert fs.references == {
+ "gen_key0": ["http://server.domain/path_0"],
+ "gen_key1": ["http://server.domain/path_1"],
+ }
+
+
+def test_empty():
+ pytest.importorskip("jinja2")
+ fs = fsspec.filesystem("reference", fo={"version": 1}, target_protocol="http")
+ assert fs.references == {}
+
+
+def test_get_sync(tmpdir):
+ localfs = LocalFileSystem()
+
+ real = tmpdir / "file"
+ real.write_binary(b"0123456789")
+
+ refs = {"a": b"data", "b": (str(real), 0, 5), "c/d": (str(real), 1, 6)}
+ fs = fsspec.filesystem("reference", fo=refs, fs=localfs)
+
+ fs.get("a", str(tmpdir / "a"))
+ assert (tmpdir / "a").read_binary() == b"data"
+ fs.get("b", str(tmpdir / "b"))
+ assert (tmpdir / "b").read_binary() == b"01234"
+ fs.get("c/d", str(tmpdir / "d"))
+ assert (tmpdir / "d").read_binary() == b"123456"
+ fs.get("c", str(tmpdir / "c"), recursive=True)
+ assert (tmpdir / "c").isdir()
+ assert (tmpdir / "c" / "d").read_binary() == b"123456"
+
+
+def test_multi_fs_provided(m, tmpdir):
+ localfs = LocalFileSystem()
+
+ real = tmpdir / "file"
+ real.write_binary(b"0123456789")
+
+ m.pipe("afile", b"hello")
+
+ # local URLs are file:// by default
+ refs = {
+ "a": b"data",
+ "b": (f"file://{real}", 0, 5),
+ "c/d": (f"file://{real}", 1, 6),
+ "c/e": ["memory://afile"],
+ }
+
+ fs = fsspec.filesystem("reference", fo=refs, fs={"file": localfs, "memory": m})
+ assert fs.cat("c/e") == b"hello"
+ assert fs.cat(["c/e", "a", "b"]) == {
+ "a": b"data",
+ "b": b"01234",
+ "c/e": b"hello",
+ }
+
+
+def test_multi_fs_created(m, tmpdir):
+ real = tmpdir / "file"
+ real.write_binary(b"0123456789")
+
+ m.pipe("afile", b"hello")
+
+ # local URLs are file:// by default
+ refs = {
+ "a": b"data",
+ "b": (f"file://{real}", 0, 5),
+ "c/d": (f"file://{real}", 1, 6),
+ "c/e": ["memory://afile"],
+ }
+
+ fs = fsspec.filesystem("reference", fo=refs, fs={"file": {}, "memory": {}})
+ assert fs.cat("c/e") == b"hello"
+ assert fs.cat(["c/e", "a", "b"]) == {
+ "a": b"data",
+ "b": b"01234",
+ "c/e": b"hello",
+ }
+
+
+def test_missing_nonasync(m):
+ zarr = pytest.importorskip("zarr")
+ zarray = {
+ "chunks": [1],
+ "compressor": None,
+ "dtype": "<f8",
+ "fill_value": "NaN",
+ "filters": [],
+ "order": "C",
+ "shape": [10],
+ "zarr_format": 2,
+ }
+ refs = {".zarray": json.dumps(zarray)}
+
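+    # note: rebinding "m" shadows the memory-filesystem fixture;
+    # zarr reads through the reference mapper instead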
+ m = fsspec.get_mapper("reference://", fo=refs, remote_protocol="memory")
+
+ a = zarr.open_array(m)
+ assert str(a[0]) == "nan"
+
+
+def test_fss_has_defaults(m):
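+    # fs.fss maps protocol -> filesystem instance; None is the default entry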
+ fs = fsspec.filesystem("reference", fo={})
+ assert None in fs.fss
+
+ fs = fsspec.filesystem("reference", fo={}, remote_protocol="memory")
+ assert fs.fss[None].protocol == "memory"
+ assert fs.fss["memory"].protocol == "memory"
+
+ fs = fsspec.filesystem("reference", fs=m, fo={})
+ assert fs.fss[None] is m
+
+ fs = fsspec.filesystem("reference", fs={"memory": m}, fo={})
+ assert fs.fss["memory"] is m
+ assert fs.fss[None].protocol == ("file", "local")
+
+ fs = fsspec.filesystem("reference", fs={None: m}, fo={})
+ assert fs.fss[None] is m
+
+ fs = fsspec.filesystem("reference", fo={"key": ["memory://a"]})
+ assert fs.fss[None] is fs.fss["memory"]
+
+ fs = fsspec.filesystem("reference", fo={"key": ["memory://a"], "blah": ["path"]})
+ assert fs.fss[None] is fs.fss["memory"]
+
+
+def test_merging(m):
+ m.pipe("/a", b"test data")
+ other = b"other test data"
+ m.pipe("/b", other)
+ fs = fsspec.filesystem(
+ "reference",
+ fo={
+ "a": ["memory://a", 1, 1],
+ "b": ["memory://a", 2, 1],
+ "c": ["memory://b"],
+ "d": ["memory://b", 4, 6],
+ },
+ )
+ out = fs.cat(["a", "b", "c", "d"])
+ assert out == {"a": b"e", "b": b"s", "c": other, "d": other[4:10]}
+
+
+def test_cat_file_ranges(m):
+ other = b"other test data"
+ m.pipe("/b", other)
+
+ fs = fsspec.filesystem(
+ "reference",
+ fo={
+ "c": ["memory://b"],
+ "d": ["memory://b", 4, 6],
+ },
+ )
+ assert fs.cat_file("c") == other
+ assert fs.cat_file("c", start=1) == other[1:]
+ assert fs.cat_file("c", start=-5) == other[-5:]
+ assert fs.cat_file("c", 1, -5) == other[1:-5]
+
+ assert fs.cat_file("d") == other[4:10]
+ assert fs.cat_file("d", start=1) == other[4:10][1:]
+ assert fs.cat_file("d", start=-5) == other[4:10][-5:]
+ assert fs.cat_file("d", 1, -3) == other[4:10][1:-3]
+
+
+@pytest.mark.parametrize(
+ "fo",
+ [
+ {
+ "c": ["memory://b"],
+ "d": ["memory://unknown", 4, 6],
+ },
+ {
+ "c": ["memory://b"],
+ "d": ["//unknown", 4, 6],
+ },
+ ],
+ ids=["memory protocol", "mixed protocols: memory and unspecified"],
+)
+def test_cat_missing(m, fo):
+ other = b"other test data"
+ m.pipe("/b", other)
+ fs = fsspec.filesystem(
+ "reference",
+ fo=fo,
+ )
+ with pytest.raises(FileNotFoundError):
+ fs.cat("notafile")
+
+ with pytest.raises(FileNotFoundError):
+ fs.cat(["notone", "nottwo"])
+
+ mapper = fs.get_mapper("")
+
+ with pytest.raises(KeyError):
+ mapper["notakey"]
+
+ with pytest.raises(KeyError):
+ mapper.getitems(["notone", "nottwo"])
+
+ with pytest.raises(ReferenceNotReachable) as ex:
+ fs.cat("d")
+ assert ex.value.__cause__
+ out = fs.cat("d", on_error="return")
+ assert isinstance(out, ReferenceNotReachable)
+
+ with pytest.raises(ReferenceNotReachable) as e:
+ mapper["d"]
+ assert '"d"' in str(e.value)
+ assert "//unknown" in str(e.value)
+
+ with pytest.raises(ReferenceNotReachable):
+ mapper.getitems(["c", "d"])
+
+ out = mapper.getitems(["c", "d"], on_error="return")
+ assert isinstance(out["d"], ReferenceNotReachable)
+
+ out = fs.cat(["notone", "c", "d"], on_error="return")
+ assert isinstance(out["notone"], FileNotFoundError)
+ assert out["c"] == other
+ assert isinstance(out["d"], ReferenceNotReachable)
+
+ out = mapper.getitems(["c", "d"], on_error="omit")
+ assert list(out) == ["c"]
+
+
+def test_df_single(m):
+ pd = pytest.importorskip("pandas")
+ pytest.importorskip("fastparquet")
+ data = b"data0data1data2"
+ m.pipe({"data": data})
+ df = pd.DataFrame(
+ {
+ "path": [None, "memory://data", "memory://data"],
+ "offset": [0, 0, 4],
+ "size": [0, 0, 4],
+ "raw": [b"raw", None, None],
+ }
+ )
+ df.to_parquet("memory://stuff/refs.0.parq")
+ m.pipe(
+ ".zmetadata",
+ b"""{
+ "metadata": {
+ ".zgroup": {
+ "zarr_format": 2
+ },
+ "stuff/.zarray": {
+ "chunks": [1],
+ "compressor": null,
+ "dtype": "i8",
+ "filters": null,
+ "shape": [3],
+ "zarr_format": 2
+ }
+ },
+ "zarr_consolidated_format": 1,
+ "record_size": 10
+ }
+ """,
+ )
+ fs = ReferenceFileSystem(fo="memory:///", remote_protocol="memory")
+ allfiles = fs.find("")
+ assert ".zmetadata" in allfiles
+ assert ".zgroup" in allfiles
+ assert "stuff/2" in allfiles
+
+ assert fs.cat("stuff/0") == b"raw"
+ assert fs.cat("stuff/1") == data
+ assert fs.cat("stuff/2") == data[4:8]
+
+
+def test_df_multi(m):
+ pd = pytest.importorskip("pandas")
+ pytest.importorskip("fastparquet")
+ data = b"data0data1data2"
+ m.pipe({"data": data})
+ df0 = pd.DataFrame(
+ {
+ "path": [None, "memory://data", "memory://data"],
+ "offset": [0, 0, 4],
+ "size": [0, 0, 4],
+ "raw": [b"raw1", None, None],
+ }
+ )
+ df0.to_parquet("memory://stuff/refs.0.parq")
+ df1 = pd.DataFrame(
+ {
+ "path": [None, "memory://data", "memory://data"],
+ "offset": [0, 0, 2],
+ "size": [0, 0, 2],
+ "raw": [b"raw2", None, None],
+ }
+ )
+ df1.to_parquet("memory://stuff/refs.1.parq")
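+    # "record_size": 3 below matches the rows per refs.N.parq,
+    # so the six references span two record files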
+ m.pipe(
+ ".zmetadata",
+ b"""{
+ "metadata": {
+ ".zgroup": {
+ "zarr_format": 2
+ },
+ "stuff/.zarray": {
+ "chunks": [1],
+ "compressor": null,
+ "dtype": "i8",
+ "filters": null,
+ "shape": [6],
+ "zarr_format": 2
+ }
+ },
+ "zarr_consolidated_format": 1,
+ "record_size": 3
+ }
+ """,
+ )
+ fs = ReferenceFileSystem(
+ fo="memory:///", remote_protocol="memory", skip_instance_cache=True
+ )
+ allfiles = fs.find("")
+ assert ".zmetadata" in allfiles
+ assert ".zgroup" in allfiles
+ assert "stuff/2" in allfiles
+ assert "stuff/4" in allfiles
+
+ assert fs.cat("stuff/0") == b"raw1"
+ assert fs.cat("stuff/1") == data
+ assert fs.cat("stuff/2") == data[4:8]
+ assert fs.cat("stuff/3") == b"raw2"
+ assert fs.cat("stuff/4") == data
+ assert fs.cat("stuff/5") == data[2:4]
+
+
+def test_mapping_getitems(m):
+ m.pipe({"a": b"A", "b": b"B"})
+
+ refs = {
+ "a": ["a"],
+ "b": ["b"],
+ }
+ h = fsspec.filesystem("memory")
+ fs = fsspec.filesystem("reference", fo=refs, fs=h)
+ mapping = fs.get_mapper("")
+ assert mapping.getitems(["b", "a"]) == {"a": b"A", "b": b"B"}
+
+
+def test_cached(m, tmpdir):
+ fn = f"{tmpdir}/ref.json"
+
+ m.pipe({"a": b"A", "b": b"B"})
+ m.pipe("ref.json", b"""{"a": ["a"], "b": ["b"]}""")
+
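+    # chained URL: the reference JSON itself is read via
+    # simplecache over the memory filesystem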
+ fs = fsspec.filesystem(
+ "reference",
+ fo="simplecache::memory://ref.json",
+ fs=m,
+ target_options={"cache_storage": str(tmpdir), "same_names": True},
+ )
+ assert fs.cat("a") == b"A"
+ assert os.path.exists(fn)
+
+ # truncate original file to show we are loading from the cached version
+ m.pipe("ref.json", b"")
+ fs = fsspec.filesystem(
+ "reference",
+ fo="simplecache::memory://ref.json",
+ fs=m,
+ target_options={"cache_storage": str(tmpdir), "same_names": True},
+ skip_instance_cache=True,
+ )
+ assert fs.cat("a") == b"A"
+
+
+@pytest.fixture()
+def lazy_refs(m):
+ zarr = pytest.importorskip("zarr")
+ l = LazyReferenceMapper.create("memory://refs", fs=m)
+ g = zarr.open(l, mode="w")
+ g.create_dataset(name="data", shape=(100,), chunks=(10,), dtype="int64")
+ return l
+
+
+def test_append_parquet(lazy_refs, m):
+ pytest.importorskip("kerchunk")
+ with pytest.raises(KeyError):
+ lazy_refs["data/0"]
+ lazy_refs["data/0"] = b"data"
+ assert lazy_refs["data/0"] == b"data"
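+    # flush() persists pending references so a new mapper can read them back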
+ lazy_refs.flush()
+
+ lazy2 = LazyReferenceMapper("memory://refs", fs=m)
+ assert lazy2["data/0"] == b"data"
+ with pytest.raises(KeyError):
+ lazy_refs["data/1"]
+ lazy2["data/1"] = b"Bdata"
+ assert lazy2["data/1"] == b"Bdata"
+ lazy2.flush()
+
+ lazy2 = LazyReferenceMapper("memory://refs", fs=m)
+ assert lazy2["data/0"] == b"data"
+ assert lazy2["data/1"] == b"Bdata"
+ lazy2["data/1"] = b"Adata"
+ del lazy2["data/0"]
+ assert lazy2["data/1"] == b"Adata"
+ assert "data/0" not in lazy2
+ lazy2.flush()
+
+ lazy2 = LazyReferenceMapper("memory://refs", fs=m)
+ with pytest.raises(KeyError):
+ lazy2["data/0"]
+ assert lazy2["data/1"] == b"Adata"
diff --git a/fsspec/implementations/tests/test_sftp.py b/fsspec/implementations/tests/test_sftp.py
index c50f763..b91f0b5 100644
--- a/fsspec/implementations/tests/test_sftp.py
+++ b/fsspec/implementations/tests/test_sftp.py
@@ -3,11 +3,231 @@ import shlex
import subprocess
import time
from tarfile import TarFile
+
import pytest
+
import fsspec
-pytest.importorskip('paramiko')
+
+pytest.importorskip("paramiko")
+
+
+def stop_docker(name):
+ cmd = shlex.split(f'docker ps -a -q --filter "name={name}"')
+ cid = subprocess.check_output(cmd).strip().decode()
+ if cid:
+ subprocess.call(["docker", "rm", "-f", cid])
+
+
+@pytest.fixture(scope="module")
+def ssh():
+ try:
+ pchk = ["docker", "run", "--name", "fsspec_test_sftp", "hello-world"]
+ subprocess.check_call(pchk)
+ stop_docker("fsspec_test_sftp")
+ except (subprocess.CalledProcessError, FileNotFoundError):
+ pytest.skip("docker run not available")
+ return
+
+ # requires docker
+ cmds = [
+ r"apt-get update",
+ r"apt-get install -y openssh-server",
+ r"mkdir /var/run/sshd",
+ "bash -c \"echo 'root:pass' | chpasswd\"",
+ (
+ r"sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' "
+ r"/etc/ssh/sshd_config"
+ ),
+ (
+ r"sed 's@session\s*required\s*pam_loginuid.so@session optional "
+ r"pam_loginuid.so@g' -i /etc/pam.d/sshd"
+ ),
+ r'bash -c "echo \"export VISIBLE=now\" >> /etc/profile"',
+ r"/usr/sbin/sshd",
+ ]
+ name = "fsspec_sftp"
+ stop_docker(name)
+ cmd = f"docker run -d -p 9200:22 --name {name} ubuntu:16.04 sleep 9000"
+ try:
+ cid = subprocess.check_output(shlex.split(cmd)).strip().decode()
+ for cmd in cmds:
+ subprocess.call(["docker", "exec", cid] + shlex.split(cmd))
+ time.sleep(1)
+ yield {
+ "host": "localhost",
+ "port": 9200,
+ "username": "root",
+ "password": "pass",
+ }
+ finally:
+ stop_docker(name)
+
+
+@pytest.fixture(scope="module")
+def root_path():
+ return "/home/someuser/"
+
+
+def test_simple(ssh, root_path):
+ f = fsspec.get_filesystem_class("sftp")(**ssh)
+ f.mkdirs(root_path + "deeper")
+ try:
+ f.touch(root_path + "deeper/afile")
+ assert f.find(root_path) == [root_path + "deeper/afile"]
+ assert f.ls(root_path + "deeper/") == [root_path + "deeper/afile"]
+ assert f.info(root_path + "deeper/afile")["type"] == "file"
+ assert f.info(root_path + "deeper/afile")["size"] == 0
+ assert f.exists(root_path)
+ finally:
+ f.rm(root_path, recursive=True)
+ assert not f.exists(root_path)
+
+
+@pytest.mark.parametrize("protocol", ["sftp", "ssh"])
+def test_with_url(protocol, ssh):
+ fo = fsspec.open(
+ protocol
+ + "://{username}:{password}@{host}:{port}/home/someuserout".format(**ssh),
+ "wb",
+ )
+ with fo as f:
+ f.write(b"hello")
+ fo = fsspec.open(
+ protocol
+ + "://{username}:{password}@{host}:{port}/home/someuserout".format(**ssh),
+ "rb",
+ )
+ with fo as f:
+ assert f.read() == b"hello"
+
+
+@pytest.mark.parametrize("protocol", ["sftp", "ssh"])
+def test_get_dir(protocol, ssh, root_path, tmpdir):
+ path = str(tmpdir)
+ f = fsspec.filesystem(protocol, **ssh)
+ f.mkdirs(root_path + "deeper", exist_ok=True)
+ f.touch(root_path + "deeper/afile")
+ f.get(root_path, path, recursive=True)
+
+ assert os.path.isdir(f"{path}/deeper")
+ assert os.path.isfile(f"{path}/deeper/afile")
+
+ f.get(
+ protocol
+ + "://{username}:{password}@{host}:{port}{root_path}".format(
+ root_path=root_path, **ssh
+ ),
+ f"{path}/test2",
+ recursive=True,
+ )
+
+ assert os.path.isdir(f"{path}/test2/deeper")
+ assert os.path.isfile(f"{path}/test2/deeper/afile")
+
+
+@pytest.fixture(scope="module")
+def netloc(ssh):
+ username = ssh.get("username")
+ password = ssh.get("password")
+ host = ssh.get("host")
+ port = ssh.get("port")
+ userpass = (
+ f"{username}:{password if password is not None else ''}@"
+ if username is not None
+ else ""
+ )
+ netloc = f"{host}:{port if port is not None else ''}"
+ return userpass + netloc
+
+
+def test_put_file(ssh, tmp_path, root_path):
+ tmp_file = tmp_path / "a.txt"
+ with open(tmp_file, mode="w") as fd:
+ fd.write("blabla")
+
+ f = fsspec.get_filesystem_class("sftp")(**ssh)
+ f.put_file(lpath=tmp_file, rpath=root_path + "a.txt")
+
+
+def test_simple_with_tar(ssh, netloc, tmp_path, root_path):
+ files_to_pack = ["a.txt", "b.txt"]
+
+ tar_filename = make_tarfile(files_to_pack, tmp_path)
+
+ f = fsspec.get_filesystem_class("sftp")(**ssh)
+ f.mkdirs(f"{root_path}deeper", exist_ok=True)
+ try:
+ remote_tar_filename = f"{root_path}deeper/somefile.tar"
+ with f.open(remote_tar_filename, mode="wb") as wfd:
+ with open(tar_filename, mode="rb") as rfd:
+ wfd.write(rfd.read())
+ fs = fsspec.open(f"tar::ssh://{netloc}{remote_tar_filename}").fs
+ files = fs.find("/")
+ assert files == files_to_pack
+ finally:
+ f.rm(root_path, recursive=True)
def make_tarfile(files_to_pack, tmp_path):
"""Create a tarfile with some files."""
- pass
+ tar_filename = tmp_path / "sometarfile.tar"
+ for filename in files_to_pack:
+ with open(tmp_path / filename, mode="w") as fd:
+ fd.write("")
+ with TarFile(tar_filename, mode="w") as tf:
+ for filename in files_to_pack:
+ tf.add(tmp_path / filename, arcname=filename)
+ return tar_filename
+
+
+def test_transaction(ssh, root_path):
+ f = fsspec.get_filesystem_class("sftp")(**ssh)
+ f.mkdirs(root_path + "deeper", exist_ok=True)
+ try:
+ f.start_transaction()
+ f.touch(root_path + "deeper/afile")
+ assert f.find(root_path) == []
+ f.end_transaction()
+ assert f.find(root_path) == [root_path + "deeper/afile"]
+
+ with f.transaction:
+ assert f._intrans
+ f.touch(root_path + "deeper/afile2")
+ assert f.find(root_path) == [root_path + "deeper/afile"]
+ assert f.find(root_path) == [
+ root_path + "deeper/afile",
+ root_path + "deeper/afile2",
+ ]
+ finally:
+ f.rm(root_path, recursive=True)
+
+
+@pytest.mark.parametrize("path", ["/a/b/c", "a/b/c"])
+def test_mkdir_create_parent(ssh, path):
+ f = fsspec.get_filesystem_class("sftp")(**ssh)
+
+ with pytest.raises(FileNotFoundError):
+ f.mkdir(path, create_parents=False)
+
+ f.mkdir(path)
+ assert f.exists(path)
+
+ with pytest.raises(FileExistsError, match=path):
+ f.mkdir(path)
+
+ f.rm(path, recursive=True)
+ assert not f.exists(path)
+
+
+@pytest.mark.parametrize("path", ["/a/b/c", "a/b/c"])
+def test_makedirs_exist_ok(ssh, path):
+ f = fsspec.get_filesystem_class("sftp")(**ssh)
+
+ f.makedirs(path, exist_ok=False)
+
+ with pytest.raises(FileExistsError, match=path):
+ f.makedirs(path, exist_ok=False)
+
+ f.makedirs(path, exist_ok=True)
+ f.rm(path, recursive=True)
+ assert not f.exists(path)
diff --git a/fsspec/implementations/tests/test_smb.py b/fsspec/implementations/tests/test_smb.py
index 625eb22..68b5957 100644
--- a/fsspec/implementations/tests/test_smb.py
+++ b/fsspec/implementations/tests/test_smb.py
@@ -1,16 +1,166 @@
"""
Test SMBFileSystem class using a docker container
"""
+
import logging
import os
import shlex
import subprocess
import time
+
import pytest
+
import fsspec
-pytest.importorskip('smbprotocol')
-if os.environ.get('WSL_INTEROP'):
+
+pytest.importorskip("smbprotocol")
+
+# ruff: noqa: F821
+
+if os.environ.get("WSL_INTEROP"):
+ # Running on WSL (Windows)
port_test = [9999]
+
else:
+ # ! pylint: disable=redefined-outer-name,missing-function-docstring
+
+ # Test standard and non-standard ports
default_port = 445
port_test = [None, default_port, 9999]
+
+
+def stop_docker(container):
+ cmd = shlex.split('docker ps -a -q --filter "name=%s"' % container)
+ cid = subprocess.check_output(cmd).strip().decode()
+ if cid:
+ subprocess.call(["docker", "rm", "-f", "-v", cid])
+
+
+@pytest.fixture(scope="module", params=port_test)
+def smb_params(request):
+ try:
+ pchk = ["docker", "run", "--name", "fsspec_test_smb", "hello-world"]
+ subprocess.check_call(pchk)
+ stop_docker("fsspec_test_smb")
+ except (subprocess.CalledProcessError, FileNotFoundError):
+ pytest.skip("docker run not available")
+
+ # requires docker
+ container = "fsspec_smb"
+ stop_docker(container)
+ cfg = "-p -u 'testuser;testpass' -s 'home;/share;no;no;no;testuser'"
+ port = request.param if request.param is not None else default_port
+ img = (
+ f"docker run --name {container} --detach -p 139:139 -p {port}:445 dperson/samba" # noqa: E231 E501
+ )
+ cmd = f"{img} {cfg}"
+ try:
+ cid = subprocess.check_output(shlex.split(cmd)).strip().decode()
+ logger = logging.getLogger("fsspec")
+ logger.debug("Container: %s", cid)
+ time.sleep(1)
+ yield {
+ "host": "localhost",
+ "port": request.param,
+ "username": "testuser",
+ "password": "testpass",
+ "register_session_retries": 100, # max ~= 10 seconds
+ }
+ finally:
+ import smbclient # pylint: disable=import-outside-toplevel
+
+ smbclient.reset_connection_cache()
+ stop_docker(container)
+
+
+@pytest.mark.flaky(reruns=2, reruns_delay=2)
+def test_simple(smb_params):
+ adir = "/home/adir"
+ adir2 = "/home/adir/otherdir/"
+ afile = "/home/adir/otherdir/afile"
+ fsmb = fsspec.get_filesystem_class("smb")(**smb_params)
+ fsmb.mkdirs(adir2)
+ fsmb.touch(afile)
+ assert fsmb.find(adir) == [afile]
+ assert fsmb.ls(adir2, detail=False) == [afile]
+ assert fsmb.info(afile)["type"] == "file"
+ assert fsmb.info(afile)["size"] == 0
+ assert fsmb.exists(adir)
+ fsmb.rm(adir, recursive=True)
+ assert not fsmb.exists(adir)
+
+
+@pytest.mark.flaky(reruns=2, reruns_delay=2)
+def test_auto_mkdir(smb_params):
+ adir = "/home/adir"
+ adir2 = "/home/adir/otherdir/"
+ afile = "/home/adir/otherdir/afile"
+ fsmb = fsspec.get_filesystem_class("smb")(**smb_params, auto_mkdir=True)
+ fsmb.touch(afile)
+ assert fsmb.exists(adir)
+ assert fsmb.exists(adir2)
+ assert fsmb.exists(afile)
+ assert fsmb.info(afile)["type"] == "file"
+
+ another_dir = "/home/another_dir"
+ another_dir2 = "/home/another_dir/another_nested_dir/"
+ another_file = "/home/another_dir/another_nested_dir/another_file"
+ fsmb.copy(afile, another_file)
+ assert fsmb.exists(another_dir)
+ assert fsmb.exists(another_dir2)
+ assert fsmb.exists(another_file)
+ assert fsmb.info(another_file)["type"] == "file"
+
+ fsmb.rm(adir, recursive=True)
+ fsmb.rm(another_dir, recursive=True)
+ assert not fsmb.exists(adir)
+ assert not fsmb.exists(another_dir)
+
+
+@pytest.mark.flaky(reruns=2, reruns_delay=2)
+def test_with_url(smb_params):
+ if smb_params["port"] is None:
+ smb_url = "smb://{username}:{password}@{host}/home/someuser.txt"
+ else:
+ smb_url = "smb://{username}:{password}@{host}:{port}/home/someuser.txt"
+ fwo = fsspec.open(smb_url.format(**smb_params), "wb")
+ with fwo as fwr:
+ fwr.write(b"hello")
+ fro = fsspec.open(smb_url.format(**smb_params), "rb")
+ with fro as frd:
+ read_result = frd.read()
+ assert read_result == b"hello"
+
+
+@pytest.mark.flaky(reruns=2, reruns_delay=2)
+def test_transaction(smb_params):
+ afile = "/home/afolder/otherdir/afile"
+ afile2 = "/home/afolder/otherdir/afile2"
+ adir = "/home/afolder"
+ adir2 = "/home/afolder/otherdir"
+ fsmb = fsspec.get_filesystem_class("smb")(**smb_params)
+ fsmb.mkdirs(adir2)
+ fsmb.start_transaction()
+ fsmb.touch(afile)
+ assert fsmb.find(adir) == []
+ fsmb.end_transaction()
+ assert fsmb.find(adir) == [afile]
+
+ with fsmb.transaction:
+ assert fsmb._intrans
+ fsmb.touch(afile2)
+ assert fsmb.find(adir) == [afile]
+ assert fsmb.find(adir) == [afile, afile2]
+
+
+@pytest.mark.flaky(reruns=2, reruns_delay=2)
+def test_makedirs_exist_ok(smb_params):
+ fsmb = fsspec.get_filesystem_class("smb")(**smb_params)
+ fsmb.makedirs("/home/a/b/c")
+ fsmb.makedirs("/home/a/b/c", exist_ok=True)
+
+
+@pytest.mark.flaky(reruns=2, reruns_delay=2)
+def test_rename_from_upath(smb_params):
+ fsmb = fsspec.get_filesystem_class("smb")(**smb_params)
+ fsmb.makedirs("/home/a/b/c", exist_ok=True)
+ fsmb.mv("/home/a/b/c", "/home/a/b/d", recursive=False, maxdepth=None)
diff --git a/fsspec/implementations/tests/test_tar.py b/fsspec/implementations/tests/test_tar.py
index 754de3b..0ec7c8a 100644
--- a/fsspec/implementations/tests/test_tar.py
+++ b/fsspec/implementations/tests/test_tar.py
@@ -1,11 +1,14 @@
from __future__ import annotations
+
import os
import shutil
import tarfile
import tempfile
from io import BytesIO
from pathlib import Path
+
import pytest
+
import fsspec
from fsspec.core import OpenFile
from fsspec.implementations.cached import WholeFileCacheFileSystem
@@ -13,50 +16,228 @@ from fsspec.implementations.tar import TarFileSystem
from fsspec.implementations.tests.test_archive import archive_data, temptar
-@pytest.mark.parametrize('recipe', [{'mode': 'w', 'suffix': '.tar', 'magic':
- b'a\x00\x00\x00\x00'}, {'mode': 'w:gz', 'suffix': '.tar.gz', 'magic':
- b'\x1f\x8b\x08\x08'}, {'mode': 'w:bz2', 'suffix': '.tar.bz2', 'magic':
- b'BZh91AY'}, {'mode': 'w:xz', 'suffix': '.tar.xz', 'magic':
- b'\xfd7zXZ\x00\x00'}], ids=['tar', 'tar-gz', 'tar-bz2', 'tar-xz'])
+def test_info():
+ with temptar(archive_data) as t:
+ fs = fsspec.filesystem("tar", fo=t)
+
+ # Iterate over all directories.
+ # Probe specific fields of Tar archives.
+ for d in fs._all_dirnames(archive_data.keys()):
+ lhs = fs.info(d)
+ del lhs["chksum"]
+ expected = {
+ "name": f"{d}",
+ "size": 0,
+ "type": "directory",
+ "devmajor": 0,
+ "devminor": 0,
+ "gname": "",
+ "linkname": "",
+ "uid": 0,
+ "gid": 0,
+ "mode": 420,
+ "mtime": 0,
+ "uname": "",
+ }
+ assert lhs == expected
+
+ # Iterate over all files.
+ for f in archive_data:
+ lhs = fs.info(f)
+
+ # Probe some specific fields of Tar archives.
+ assert "mode" in lhs
+ assert "uid" in lhs
+ assert "gid" in lhs
+ assert "mtime" in lhs
+ assert "chksum" in lhs
+
+
+@pytest.mark.parametrize(
+ "recipe",
+ [
+ {"mode": "w", "suffix": ".tar", "magic": b"a\x00\x00\x00\x00"},
+ {"mode": "w:gz", "suffix": ".tar.gz", "magic": b"\x1f\x8b\x08\x08"},
+ {"mode": "w:bz2", "suffix": ".tar.bz2", "magic": b"BZh91AY"},
+ {"mode": "w:xz", "suffix": ".tar.xz", "magic": b"\xfd7zXZ\x00\x00"},
+ ],
+ ids=["tar", "tar-gz", "tar-bz2", "tar-xz"],
+)
def test_compressions(recipe):
"""
Run tests on all available tar file compression variants.
"""
- pass
+ with temptar(archive_data, mode=recipe["mode"], suffix=recipe["suffix"]) as t:
+ fs = fsspec.filesystem("tar", fo=t)
+ # Verify that the tar archive has the correct compression.
+ with open(t, "rb") as raw:
+ assert raw.read()[:10].startswith(recipe["magic"])
-@pytest.mark.parametrize('recipe', [{'mode': 'w', 'suffix': '.tar', 'magic':
- b'a\x00\x00\x00\x00'}, {'mode': 'w:gz', 'suffix': '.tar.gz', 'magic':
- b'\x1f\x8b\x08\x08'}, {'mode': 'w:bz2', 'suffix': '.tar.bz2', 'magic':
- b'BZh91AY'}, {'mode': 'w:xz', 'suffix': '.tar.xz', 'magic':
- b'\xfd7zXZ\x00\x00'}], ids=['tar', 'tar-gz', 'tar-bz2', 'tar-xz'])
+ # Verify content of a sample file.
+ assert fs.cat("b") == b"hello"
+
+
+@pytest.mark.parametrize(
+ "recipe",
+ [
+ {"mode": "w", "suffix": ".tar", "magic": b"a\x00\x00\x00\x00"},
+ {"mode": "w:gz", "suffix": ".tar.gz", "magic": b"\x1f\x8b\x08\x08"},
+ {"mode": "w:bz2", "suffix": ".tar.bz2", "magic": b"BZh91AY"},
+ {"mode": "w:xz", "suffix": ".tar.xz", "magic": b"\xfd7zXZ\x00\x00"},
+ ],
+ ids=["tar", "tar-gz", "tar-bz2", "tar-xz"],
+)
def test_filesystem_direct(recipe, tmpdir):
"""
Run tests through a real fsspec filesystem implementation.
Here: `LocalFileSystem`.
"""
- pass
+
+ filename = os.path.join(tmpdir, f'temp{recipe["suffix"]}')
+
+ fs = fsspec.filesystem("file")
+ f = OpenFile(fs, filename, mode="wb")
+
+ with temptar(archive_data, mode=recipe["mode"], suffix=recipe["suffix"]) as tf:
+ with f as fo:
+ fo.write(open(tf, "rb").read())
+
+ # Verify that the tar archive has the correct compression.
+ with open(filename, "rb") as raw:
+ assert raw.read()[:10].startswith(recipe["magic"])
+
+ # Verify content of a sample file.
+ with fs.open(filename) as resource:
+ tarfs = fsspec.filesystem("tar", fo=resource)
+ assert tarfs.cat("b") == b"hello"
-@pytest.mark.parametrize('recipe', [{'mode': 'w', 'suffix': '.tar', 'magic':
- b'a\x00\x00\x00\x00'}, {'mode': 'w:gz', 'suffix': '.tar.gz', 'magic':
- b'\x1f\x8b\x08\x08'}, {'mode': 'w:bz2', 'suffix': '.tar.bz2', 'magic':
- b'BZh91AY'}, {'mode': 'w:xz', 'suffix': '.tar.xz', 'magic':
- b'\xfd7zXZ\x00\x00'}], ids=['tar', 'tar-gz', 'tar-bz2', 'tar-xz'])
+@pytest.mark.parametrize(
+ "recipe",
+ [
+ {"mode": "w", "suffix": ".tar", "magic": b"a\x00\x00\x00\x00"},
+ {"mode": "w:gz", "suffix": ".tar.gz", "magic": b"\x1f\x8b\x08\x08"},
+ {"mode": "w:bz2", "suffix": ".tar.bz2", "magic": b"BZh91AY"},
+ {"mode": "w:xz", "suffix": ".tar.xz", "magic": b"\xfd7zXZ\x00\x00"},
+ ],
+ ids=["tar", "tar-gz", "tar-bz2", "tar-xz"],
+)
def test_filesystem_cached(recipe, tmpdir):
"""
Run tests through a real, cached, fsspec filesystem implementation.
Here: `TarFileSystem` over `WholeFileCacheFileSystem` over `LocalFileSystem`.
"""
- pass
+
+ filename = os.path.join(tmpdir, f'temp{recipe["suffix"]}')
+
+ # Create a filesystem from test fixture.
+ fs = fsspec.filesystem("file")
+ f = OpenFile(fs, filename, mode="wb")
+
+ with temptar(archive_data, mode=recipe["mode"], suffix=recipe["suffix"]) as tf:
+ with f as fo:
+ fo.write(open(tf, "rb").read())
+
+ # Verify that the tar archive has the correct compression.
+ with open(filename, "rb") as raw:
+ assert raw.read()[:10].startswith(recipe["magic"])
+
+ # Access cached filesystem.
+ cachedir = tempfile.mkdtemp()
+ filesystem = WholeFileCacheFileSystem(fs=fs, cache_storage=cachedir)
+
+ # Verify the cache is empty beforehand.
+ assert os.listdir(cachedir) == []
+
+ # Verify content of a sample file.
+ with filesystem.open(filename) as resource:
+ tarfs = fsspec.filesystem("tar", fo=resource)
+ assert tarfs.cat("b") == b"hello"
+
+ # Verify the cache is populated afterwards.
+ assert len(os.listdir(cachedir)) == 2
+
+ # Verify that the cache is empty after clearing it.
+ filesystem.clear_cache()
+ assert os.listdir(cachedir) == []
+
+ filesystem.clear_cache()
+ shutil.rmtree(cachedir)
+
+
+@pytest.mark.parametrize(
+ "recipe",
+ [
+ {"mode": "w", "suffix": ".tar", "magic": b"a\x00\x00\x00\x00"},
+ {"mode": "w:gz", "suffix": ".tar.gz", "magic": b"\x1f\x8b\x08\x08"},
+ {"mode": "w:bz2", "suffix": ".tar.bz2", "magic": b"BZh91AY"},
+ {"mode": "w:xz", "suffix": ".tar.xz", "magic": b"\xfd7zXZ\x00\x00"},
+ ],
+ ids=["tar", "tar-gz", "tar-bz2", "tar-xz"],
+)
+def test_url_to_fs_direct(recipe, tmpdir):
+ with temptar(archive_data, mode=recipe["mode"], suffix=recipe["suffix"]) as tf:
+ url = f"tar://inner::file://{tf}"
+ fs, url = fsspec.core.url_to_fs(url=url)
+ assert fs.cat("b") == b"hello"
+
+
+@pytest.mark.parametrize(
+ "recipe",
+ [
+ {"mode": "w", "suffix": ".tar"},
+ {"mode": "w:gz", "suffix": ".tar.gz"},
+ {"mode": "w:bz2", "suffix": ".tar.bz2"},
+ {"mode": "w:xz", "suffix": ".tar.xz"},
+ ],
+ ids=["tar", "tar-gz", "tar-bz2", "tar-xz"],
+)
+def test_url_to_fs_cached(recipe, tmpdir):
+ with temptar(archive_data, mode=recipe["mode"], suffix=recipe["suffix"]) as tf:
+ url = f"tar://inner::simplecache::file://{tf}"
+ # requires same_names in order to be able to guess compression from
+ # filename
+ fs, url = fsspec.core.url_to_fs(url=url, simplecache={"same_names": True})
+ assert fs.cat("b") == b"hello"
-@pytest.mark.parametrize('compression', ['', 'gz', 'bz2', 'xz'], ids=['tar',
- 'tar-gz', 'tar-bz2', 'tar-xz'])
+@pytest.mark.parametrize(
+ "compression", ["", "gz", "bz2", "xz"], ids=["tar", "tar-gz", "tar-bz2", "tar-xz"]
+)
def test_ls_with_folders(compression: str, tmp_path: Path):
"""
Create a tar file that doesn't include the intermediate folder structure,
but make sure that the reading filesystem is still able to resolve the
intermediate folders, like the ZipFileSystem.
"""
- pass
+ tar_data: dict[str, bytes] = {
+ "a.pdf": b"Hello A!",
+ "b/c.pdf": b"Hello C!",
+ "d/e/f.pdf": b"Hello F!",
+ "d/g.pdf": b"Hello G!",
+ }
+ if compression:
+ temp_archive_file = tmp_path / f"test_tar_file.tar.{compression}"
+ else:
+ temp_archive_file = tmp_path / "test_tar_file.tar"
+ with open(temp_archive_file, "wb") as fd:
+ # We need to manually write the tarfile here, because temptar
+ # creates intermediate directories which is not how tars are always created
+ with tarfile.open(fileobj=fd, mode=f"w:{compression}") as tf:
+ for tar_file_path, data in tar_data.items():
+ content = data
+ info = tarfile.TarInfo(name=tar_file_path)
+ info.size = len(content)
+ tf.addfile(info, BytesIO(content))
+ with open(temp_archive_file, "rb") as fd:
+ fs = TarFileSystem(fd)
+ assert fs.find("/", withdirs=True) == [
+ "a.pdf",
+ "b",
+ "b/c.pdf",
+ "d",
+ "d/e",
+ "d/e/f.pdf",
+ "d/g.pdf",
+ ]
diff --git a/fsspec/implementations/tests/test_webhdfs.py b/fsspec/implementations/tests/test_webhdfs.py
index 3b3a915..fac34c7 100644
--- a/fsspec/implementations/tests/test_webhdfs.py
+++ b/fsspec/implementations/tests/test_webhdfs.py
@@ -2,7 +2,196 @@ import pickle
import shlex
import subprocess
import time
+
import pytest
+
import fsspec
-requests = pytest.importorskip('requests')
-from fsspec.implementations.webhdfs import WebHDFS
+
+requests = pytest.importorskip("requests")
+
+from fsspec.implementations.webhdfs import WebHDFS # noqa: E402
+
+
+@pytest.fixture(scope="module")
+def hdfs_cluster():
+ cmd0 = shlex.split("htcluster shutdown")
+ try:
+ subprocess.check_output(cmd0, stderr=subprocess.STDOUT)
+ except FileNotFoundError:
+ pytest.skip("htcluster not found")
+ except subprocess.CalledProcessError as ex:
+ pytest.skip(f"htcluster failed: {ex.output.decode()}")
+ cmd1 = shlex.split("htcluster startup --image base")
+ subprocess.check_output(cmd1)
+ try:
+ while True:
+ t = 90
+ try:
+ requests.get("http://localhost:50070/webhdfs/v1/?op=LISTSTATUS")
+ except: # noqa: E722
+ t -= 1
+ assert t > 0, "Timeout waiting for HDFS"
+ time.sleep(1)
+ continue
+ break
+ time.sleep(7)
+ yield "localhost"
+ finally:
+ subprocess.check_output(cmd0)
+
+
+def test_pickle(hdfs_cluster):
+ w = WebHDFS(hdfs_cluster, user="testuser")
+ w2 = pickle.loads(pickle.dumps(w))
+ assert w == w2
+
+
+def test_simple(hdfs_cluster):
+ w = WebHDFS(hdfs_cluster, user="testuser")
+ home = w.home_directory()
+ assert home == "/user/testuser"
+ with pytest.raises(PermissionError):
+ w.mkdir("/root")
+
+
+def test_url(hdfs_cluster):
+ url = "webhdfs://testuser@localhost:50070/user/testuser/myfile"
+ fo = fsspec.open(url, "wb", data_proxy={"worker.example.com": "localhost"})
+ with fo as f:
+ f.write(b"hello")
+ fo = fsspec.open(url, "rb", data_proxy={"worker.example.com": "localhost"})
+ with fo as f:
+ assert f.read() == b"hello"
+
+
+def test_workflow(hdfs_cluster):
+ w = WebHDFS(
+ hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"}
+ )
+ fn = "/user/testuser/testrun/afile"
+ w.mkdir("/user/testuser/testrun")
+ with w.open(fn, "wb") as f:
+ f.write(b"hello")
+ assert w.exists(fn)
+ info = w.info(fn)
+ assert info["size"] == 5
+ assert w.isfile(fn)
+ assert w.cat(fn) == b"hello"
+ w.rm("/user/testuser/testrun", recursive=True)
+ assert not w.exists(fn)
+
+
+def test_with_gzip(hdfs_cluster):
+ from gzip import GzipFile
+
+ w = WebHDFS(
+ hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"}
+ )
+ fn = "/user/testuser/gzfile"
+ with w.open(fn, "wb") as f:
+ gf = GzipFile(fileobj=f, mode="w")
+ gf.write(b"hello")
+ gf.close()
+ with w.open(fn, "rb") as f:
+ gf = GzipFile(fileobj=f, mode="r")
+ assert gf.read() == b"hello"
+
+
+def test_workflow_transaction(hdfs_cluster):
+ w = WebHDFS(
+ hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"}
+ )
+ fn = "/user/testuser/testrun/afile"
+ w.mkdirs("/user/testuser/testrun")
+ with w.transaction:
+ with w.open(fn, "wb") as f:
+ f.write(b"hello")
+ assert not w.exists(fn)
+ assert w.exists(fn)
+ assert w.ukey(fn)
+ files = w.ls("/user/testuser/testrun", True)
+ summ = w.content_summary("/user/testuser/testrun")
+ assert summ["length"] == files[0]["size"]
+ assert summ["fileCount"] == 1
+
+ w.rm("/user/testuser/testrun", recursive=True)
+ assert not w.exists(fn)
+
+
+def test_webhdfs_cp_file(hdfs_cluster):
+ fs = WebHDFS(
+ hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"}
+ )
+
+ src, dst = "/user/testuser/testrun/f1", "/user/testuser/testrun/f2"
+
+ fs.mkdir("/user/testuser/testrun")
+
+ with fs.open(src, "wb") as f:
+ f.write(b"hello")
+
+ fs.cp_file(src, dst)
+
+ assert fs.exists(src)
+ assert fs.exists(dst)
+ assert fs.cat(src) == fs.cat(dst)
+
+
+def test_path_with_equals(hdfs_cluster):
+ fs = WebHDFS(
+ hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"}
+ )
+ path_with_equals = "/user/testuser/some_table/datestamp=2023-11-11"
+
+ fs.mkdir(path_with_equals)
+
+ result = fs.ls(path_with_equals)
+ assert result is not None
+ assert fs.exists(path_with_equals)
+
+
+def test_error_handling_with_equals_in_path(hdfs_cluster):
+ fs = WebHDFS(hdfs_cluster, user="testuser")
+ invalid_path_with_equals = (
+ "/user/testuser/some_table/invalid_path=datestamp=2023-11-11"
+ )
+
+ with pytest.raises(FileNotFoundError):
+ fs.ls(invalid_path_with_equals)
+
+
+def test_create_and_touch_file_with_equals(hdfs_cluster):
+ fs = WebHDFS(
+ hdfs_cluster,
+ user="testuser",
+ data_proxy={"worker.example.com": "localhost"},
+ )
+ base_path = "/user/testuser/some_table/datestamp=2023-11-11"
+ file_path = f"{base_path}/testfile.txt"
+
+ fs.mkdir(base_path)
+ fs.touch(file_path, "wb")
+ assert fs.exists(file_path)
+
+
+def test_write_read_verify_file_with_equals(hdfs_cluster):
+ fs = WebHDFS(
+ hdfs_cluster,
+ user="testuser",
+ data_proxy={"worker.example.com": "localhost"},
+ )
+ base_path = "/user/testuser/some_table/datestamp=2023-11-11"
+ file_path = f"{base_path}/testfile.txt"
+ content = b"This is some content!"
+
+ fs.mkdir(base_path)
+ with fs.open(file_path, "wb") as f:
+ f.write(content)
+
+ with fs.open(file_path, "rb") as f:
+ assert f.read() == content
+
+ file_info = fs.ls(base_path, detail=True)
+ assert len(file_info) == 1
+ assert file_info[0]["name"] == file_path
+ assert file_info[0]["size"] == len(content)
diff --git a/fsspec/implementations/tests/test_zip.py b/fsspec/implementations/tests/test_zip.py
index c554e22..ec30c87 100644
--- a/fsspec/implementations/tests/test_zip.py
+++ b/fsspec/implementations/tests/test_zip.py
@@ -1,10 +1,134 @@
import collections.abc
import os.path
+
import pytest
+
import fsspec
from fsspec.implementations.tests.test_archive import archive_data, tempzip
+def test_info():
+ with tempzip(archive_data) as z:
+ fs = fsspec.filesystem("zip", fo=z)
+
+ # Iterate over all files.
+ for f in archive_data:
+ lhs = fs.info(f)
+
+ # Probe some specific fields of Zip archives.
+ assert "CRC" in lhs
+ assert "compress_size" in lhs
+
+
def test_fsspec_get_mapper():
"""Added for #788"""
- pass
+
+ with tempzip(archive_data) as z:
+ mapping = fsspec.get_mapper(f"zip::{z}")
+
+ assert isinstance(mapping, collections.abc.Mapping)
+ keys = sorted(mapping.keys())
+ assert keys == ["a", "b", "deeply/nested/path"]
+
+ # mapping.getitems() will call FSMap.fs.cat()
+ # which was not accurately implemented for zip.
+ assert isinstance(mapping, fsspec.mapping.FSMap)
+ items = dict(mapping.getitems(keys))
+ assert items == {"a": b"", "b": b"hello", "deeply/nested/path": b"stuff"}
+
+
+def test_not_cached():
+ with tempzip(archive_data) as z:
+ fs = fsspec.filesystem("zip", fo=z)
+ fs2 = fsspec.filesystem("zip", fo=z)
+ assert fs is not fs2
+
+
+def test_root_info():
+ with tempzip(archive_data) as z:
+ fs = fsspec.filesystem("zip", fo=z)
+ assert fs.info("/") == {"name": "", "type": "directory", "size": 0}
+ assert fs.info("") == {"name": "", "type": "directory", "size": 0}
+
+
+def test_write_seek(m):
+ with m.open("afile.zip", "wb") as f:
+ fs = fsspec.filesystem("zip", fo=f, mode="w")
+ fs.pipe("another", b"hi")
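+        # closing the underlying ZipFile writes the central directory
+        # so the archive can be read back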
+ fs.zip.close()
+
+ with m.open("afile.zip", "rb") as f:
+ fs = fsspec.filesystem("zip", fo=f)
+ assert fs.cat("another") == b"hi"
+
+
+def test_rw(m):
+ # extra arg to zip means "create archive"
+ with fsspec.open(
+ "zip://afile::memory://out.zip", mode="wb", zip={"mode": "w"}
+ ) as f:
+ f.write(b"data")
+
+ with fsspec.open("zip://afile::memory://out.zip", mode="rb") as f:
+ assert f.read() == b"data"
+
+
+def test_mapper(m):
+ # extra arg to zip means "create archive"
+ mapper = fsspec.get_mapper("zip::memory://out.zip", zip={"mode": "w"})
+ with pytest.raises(KeyError):
+ mapper["a"]
+
+ mapper["a"] = b"data"
+ with pytest.raises(OSError):
+ # fails because this is write mode and we cannot also read
+ mapper["a"]
+ assert "a" in mapper # but be can list
+
+
+def test_zip_glob_star(m):
+ with fsspec.open(
+ "zip://adir/afile::memory://out.zip", mode="wb", zip={"mode": "w"}
+ ) as f:
+ f.write(b"data")
+
+ fs, _ = fsspec.core.url_to_fs("zip::memory://out.zip")
+ outfiles = fs.glob("*")
+ assert len(outfiles) == 1
+
+ fs = fsspec.filesystem("zip", fo="memory://out.zip", mode="w")
+ fs.mkdir("adir")
+ fs.pipe("adir/afile", b"data")
+ outfiles = fs.glob("*")
+ assert len(outfiles) == 1
+
+ fn = f"{os.path.dirname(os.path.abspath((__file__)))}/out.zip"
+ fs = fsspec.filesystem("zip", fo=fn, mode="r")
+ outfiles = fs.glob("*")
+ assert len(outfiles) == 1
+
+
+def test_append(m, tmpdir):
+ fs = fsspec.filesystem("zip", fo="memory://out.zip", mode="w")
+ with fs.open("afile", "wb") as f:
+ f.write(b"data")
+ fs.close()
+
+ fs = fsspec.filesystem("zip", fo="memory://out.zip", mode="a")
+ with fs.open("bfile", "wb") as f:
+ f.write(b"data")
+ fs.close()
+
+ assert len(fsspec.open_files("zip://*::memory://out.zip")) == 2
+
+ fs = fsspec.filesystem("zip", fo=f"{tmpdir}/out.zip", mode="w")
+ with fs.open("afile", "wb") as f:
+ f.write(b"data")
+ fs.close()
+
+ fs = fsspec.filesystem("zip", fo=f"{tmpdir}/out.zip", mode="a")
+ with fs.open("bfile", "wb") as f:
+ f.write(b"data")
+ fs.close()
+
+ assert len(fsspec.open_files("zip://*::memory://out.zip")) == 2
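
The zip tests above exercise chained URLs of the form `zip://<member>::<target>`. A minimal round trip mirroring `test_rw`, shown here as a standalone sketch (not part of the patch; only fsspec's built-in `memory` and `zip` implementations are assumed):

```python
import fsspec

# The extra `zip={"mode": "w"}` keyword is forwarded to ZipFileSystem and tells
# the zip layer to create a fresh archive on the in-memory filesystem.
with fsspec.open(
    "zip://afile::memory://example.zip", mode="wb", zip={"mode": "w"}
) as f:
    f.write(b"data")

# Re-opening the same chained URL without extra arguments reads it back.
with fsspec.open("zip://afile::memory://example.zip", mode="rb") as f:
    assert f.read() == b"data"
```
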
diff --git a/fsspec/implementations/webhdfs.py b/fsspec/implementations/webhdfs.py
index bc3c00b..4bac5d5 100644
--- a/fsspec/implementations/webhdfs.py
+++ b/fsspec/implementations/webhdfs.py
@@ -1,3 +1,5 @@
+# https://hadoop.apache.org/docs/r1.0.4/webhdfs.html
+
import logging
import os
import secrets
@@ -6,10 +8,13 @@ import tempfile
import uuid
from contextlib import suppress
from urllib.parse import quote
+
import requests
+
from ..spec import AbstractBufferedFile, AbstractFileSystem
from ..utils import infer_storage_options, tokenize
-logger = logging.getLogger('webhdfs')
+
+logger = logging.getLogger("webhdfs")
class WebHDFS(AbstractFileSystem):
@@ -33,13 +38,26 @@ class WebHDFS(AbstractFileSystem):
are provided.
"""
+
tempdir = str(tempfile.gettempdir())
- protocol = 'webhdfs', 'webHDFS'
+ protocol = "webhdfs", "webHDFS"
- def __init__(self, host, port=50070, kerberos=False, token=None, user=
- None, password=None, proxy_to=None, kerb_kwargs=None, data_proxy=
- None, use_https=False, session_cert=None, session_verify=True, **kwargs
- ):
+ def __init__(
+ self,
+ host,
+ port=50070,
+ kerberos=False,
+ token=None,
+ user=None,
+ password=None,
+ proxy_to=None,
+ kerb_kwargs=None,
+ data_proxy=None,
+ use_https=False,
+ session_cert=None,
+ session_verify=True,
+ **kwargs,
+ ):
"""
Parameters
----------
@@ -84,8 +102,7 @@ class WebHDFS(AbstractFileSystem):
if self._cached:
return
super().__init__(**kwargs)
- self.url = (
- f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1")
+ self.url = f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1" # noqa
self.kerb = kerberos
self.kerb_kwargs = kerb_kwargs or {}
self.pars = {}
@@ -93,31 +110,103 @@ class WebHDFS(AbstractFileSystem):
if token is not None:
if user is not None or proxy_to is not None:
raise ValueError(
- 'If passing a delegation token, must not set user or proxy_to, as these are encoded in the token'
- )
- self.pars['delegation'] = token
+ "If passing a delegation token, must not set "
+ "user or proxy_to, as these are encoded in the"
+ " token"
+ )
+ self.pars["delegation"] = token
self.user = user
self.password = password
+
if password is not None:
if user is None:
raise ValueError(
- 'If passing a password, the user must also beset in order to set up the basic-auth'
- )
- elif user is not None:
- self.pars['user.name'] = user
+ "If passing a password, the user must also be"
+ "set in order to set up the basic-auth"
+ )
+ else:
+ if user is not None:
+ self.pars["user.name"] = user
+
if proxy_to is not None:
- self.pars['doas'] = proxy_to
+ self.pars["doas"] = proxy_to
if kerberos and user is not None:
raise ValueError(
- 'If using Kerberos auth, do not specify the user, this is handled by kinit.'
- )
+ "If using Kerberos auth, do not specify the "
+ "user, this is handled by kinit."
+ )
+
self.session_cert = session_cert
self.session_verify = session_verify
+
self._connect()
- self._fsid = f'webhdfs_{tokenize(host, port)}'
- def _open(self, path, mode='rb', block_size=None, autocommit=True,
- replication=None, permissions=None, **kwargs):
+ self._fsid = f"webhdfs_{tokenize(host, port)}"
+
+ @property
+ def fsid(self):
+ return self._fsid
+
+ def _connect(self):
+ self.session = requests.Session()
+
+ if self.session_cert:
+ self.session.cert = self.session_cert
+
+ self.session.verify = self.session_verify
+
+ if self.kerb:
+ from requests_kerberos import HTTPKerberosAuth
+
+ self.session.auth = HTTPKerberosAuth(**self.kerb_kwargs)
+
+ if self.user is not None and self.password is not None:
+ from requests.auth import HTTPBasicAuth
+
+ self.session.auth = HTTPBasicAuth(self.user, self.password)
+
+ def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs):
+ url = self._apply_proxy(self.url + quote(path or "", safe="/="))
+ args = kwargs.copy()
+ args.update(self.pars)
+ args["op"] = op.upper()
+ logger.debug("sending %s with %s", url, method)
+ out = self.session.request(
+ method=method.upper(),
+ url=url,
+ params=args,
+ data=data,
+ allow_redirects=redirect,
+ )
+ if out.status_code in [400, 401, 403, 404, 500]:
+ try:
+ err = out.json()
+ msg = err["RemoteException"]["message"]
+ exp = err["RemoteException"]["exception"]
+ except (ValueError, KeyError):
+ pass
+ else:
+ if exp in ["IllegalArgumentException", "UnsupportedOperationException"]:
+ raise ValueError(msg)
+ elif exp in ["SecurityException", "AccessControlException"]:
+ raise PermissionError(msg)
+ elif exp in ["FileNotFoundException"]:
+ raise FileNotFoundError(msg)
+ else:
+ raise RuntimeError(msg)
+ out.raise_for_status()
+ return out
+
+ def _open(
+ self,
+ path,
+ mode="rb",
+ block_size=None,
+ autocommit=True,
+ replication=None,
+ permissions=None,
+ **kwargs,
+ ):
"""
Parameters
@@ -141,19 +230,75 @@ class WebHDFS(AbstractFileSystem):
-------
WebHDFile instance
"""
- pass
+ block_size = block_size or self.blocksize
+ return WebHDFile(
+ self,
+ path,
+ mode=mode,
+ block_size=block_size,
+ tempdir=self.tempdir,
+ autocommit=autocommit,
+ replication=replication,
+ permissions=permissions,
+ )
+
+ @staticmethod
+ def _process_info(info):
+ info["type"] = info["type"].lower()
+ info["size"] = info["length"]
+ return info
+
+ @classmethod
+ def _strip_protocol(cls, path):
+ return infer_storage_options(path)["path"]
+
+ @staticmethod
+ def _get_kwargs_from_urls(urlpath):
+ out = infer_storage_options(urlpath)
+ out.pop("path", None)
+ out.pop("protocol", None)
+ if "username" in out:
+ out["user"] = out.pop("username")
+ return out
+
+ def info(self, path):
+ out = self._call("GETFILESTATUS", path=path)
+ info = out.json()["FileStatus"]
+ info["name"] = path
+ return self._process_info(info)
+
+ def ls(self, path, detail=False):
+ out = self._call("LISTSTATUS", path=path)
+ infos = out.json()["FileStatuses"]["FileStatus"]
+ for info in infos:
+ self._process_info(info)
+ info["name"] = path.rstrip("/") + "/" + info["pathSuffix"]
+ if detail:
+ return sorted(infos, key=lambda i: i["name"])
+ else:
+ return sorted(info["name"] for info in infos)
def content_summary(self, path):
"""Total numbers of files, directories and bytes under path"""
- pass
+ out = self._call("GETCONTENTSUMMARY", path=path)
+ return out.json()["ContentSummary"]
def ukey(self, path):
"""Checksum info of file, giving method and result"""
- pass
+ out = self._call("GETFILECHECKSUM", path=path, redirect=False)
+ if "Location" in out.headers:
+ location = self._apply_proxy(out.headers["Location"])
+ out2 = self.session.get(location)
+ out2.raise_for_status()
+ return out2.json()["FileChecksum"]
+ else:
+ out.raise_for_status()
+ return out.json()["FileChecksum"]
def home_directory(self):
"""Get user's home directory"""
- pass
+ out = self._call("GETHOMEDIRECTORY")
+ return out.json()["Path"]
def get_delegation_token(self, renewer=None):
"""Retrieve token which can give the same authority to other uses
@@ -163,15 +308,23 @@ class WebHDFS(AbstractFileSystem):
renewer: str or None
User who may use this token; if None, will be current user
"""
- pass
+ if renewer:
+ out = self._call("GETDELEGATIONTOKEN", renewer=renewer)
+ else:
+ out = self._call("GETDELEGATIONTOKEN")
+ t = out.json()["Token"]
+ if t is None:
+ raise ValueError("No token available for this user/security context")
+ return t["urlString"]
def renew_delegation_token(self, token):
"""Make token live longer. Returns new expiry time"""
- pass
+ out = self._call("RENEWDELEGATIONTOKEN", method="put", token=token)
+ return out.json()["long"]
def cancel_delegation_token(self, token):
"""Stop the token from being useful"""
- pass
+ self._call("CANCELDELEGATIONTOKEN", method="put", token=token)
def chmod(self, path, mod):
"""Set the permission at path
@@ -184,11 +337,16 @@ class WebHDFS(AbstractFileSystem):
         posix representation of permission, given as an oct string, e.g. '777'
or 0o777
"""
- pass
+ self._call("SETPERMISSION", method="put", path=path, permission=mod)
def chown(self, path, owner=None, group=None):
"""Change owning user and/or group"""
- pass
+ kwargs = {}
+ if owner is not None:
+ kwargs["owner"] = owner
+ if group is not None:
+ kwargs["group"] = group
+ self._call("SETOWNER", method="put", path=path, **kwargs)
def set_replication(self, path, replication):
"""
@@ -202,7 +360,52 @@ class WebHDFS(AbstractFileSystem):
Number of copies of file on the cluster. Should be smaller than
number of data nodes; normally 3 on most systems.
"""
- pass
+ self._call("SETREPLICATION", path=path, method="put", replication=replication)
+
+ def mkdir(self, path, **kwargs):
+ self._call("MKDIRS", method="put", path=path)
+
+ def makedirs(self, path, exist_ok=False):
+ if exist_ok is False and self.exists(path):
+ raise FileExistsError(path)
+ self.mkdir(path)
+
+ def mv(self, path1, path2, **kwargs):
+ self._call("RENAME", method="put", path=path1, destination=path2)
+
+ def rm(self, path, recursive=False, **kwargs):
+ self._call(
+ "DELETE",
+ method="delete",
+ path=path,
+ recursive="true" if recursive else "false",
+ )
+
+ def rm_file(self, path, **kwargs):
+ self.rm(path)
+
+ def cp_file(self, lpath, rpath, **kwargs):
+ with self.open(lpath) as lstream:
+ tmp_fname = "/".join([self._parent(rpath), f".tmp.{secrets.token_hex(16)}"])
+ # Perform an atomic copy (stream to a temporary file and
+ # move it to the actual destination).
+ try:
+ with self.open(tmp_fname, "wb") as rstream:
+ shutil.copyfileobj(lstream, rstream)
+ self.mv(tmp_fname, rpath)
+ except BaseException: # noqa
+ with suppress(FileNotFoundError):
+ self.rm(tmp_fname)
+ raise
+
+ def _apply_proxy(self, location):
+ if self.proxy and callable(self.proxy):
+ location = self.proxy(location)
+ elif self.proxy:
+ # as a dict
+ for k, v in self.proxy.items():
+ location = location.replace(k, v, 1)
+ return location
class WebHDFile(AbstractBufferedFile):
@@ -211,13 +414,13 @@ class WebHDFile(AbstractBufferedFile):
def __init__(self, fs, path, **kwargs):
super().__init__(fs, path, **kwargs)
kwargs = kwargs.copy()
- if kwargs.get('permissions', None) is None:
- kwargs.pop('permissions', None)
- if kwargs.get('replication', None) is None:
- kwargs.pop('replication', None)
- self.permissions = kwargs.pop('permissions', 511)
- tempdir = kwargs.pop('tempdir')
- if kwargs.pop('autocommit', False) is False:
+ if kwargs.get("permissions", None) is None:
+ kwargs.pop("permissions", None)
+ if kwargs.get("replication", None) is None:
+ kwargs.pop("replication", None)
+ self.permissions = kwargs.pop("permissions", 511)
+ tempdir = kwargs.pop("tempdir")
+ if kwargs.pop("autocommit", False) is False:
self.target = self.path
self.path = os.path.join(tempdir, str(uuid.uuid4()))
@@ -230,8 +433,52 @@ class WebHDFile(AbstractBufferedFile):
This is the last block, so should complete file, if
self.autocommit is True.
"""
- pass
+ out = self.fs.session.post(
+ self.location,
+ data=self.buffer.getvalue(),
+ headers={"content-type": "application/octet-stream"},
+ )
+ out.raise_for_status()
+ return True
def _initiate_upload(self):
"""Create remote file/upload"""
- pass
+ kwargs = self.kwargs.copy()
+ if "a" in self.mode:
+ op, method = "APPEND", "POST"
+ else:
+ op, method = "CREATE", "PUT"
+ kwargs["overwrite"] = "true"
+ out = self.fs._call(op, method, self.path, redirect=False, **kwargs)
+ location = self.fs._apply_proxy(out.headers["Location"])
+ if "w" in self.mode:
+ # create empty file to append to
+ out2 = self.fs.session.put(
+ location, headers={"content-type": "application/octet-stream"}
+ )
+ out2.raise_for_status()
+ # after creating empty file, change location to append to
+ out2 = self.fs._call("APPEND", "POST", self.path, redirect=False, **kwargs)
+ self.location = self.fs._apply_proxy(out2.headers["Location"])
+
+ def _fetch_range(self, start, end):
+ start = max(start, 0)
+ end = min(self.size, end)
+ if start >= end or start >= self.size:
+ return b""
+ out = self.fs._call(
+ "OPEN", path=self.path, offset=start, length=end - start, redirect=False
+ )
+ out.raise_for_status()
+ if "Location" in out.headers:
+ location = out.headers["Location"]
+ out2 = self.fs.session.get(self.fs._apply_proxy(location))
+ return out2.content
+ else:
+ return out.content
+
+ def commit(self):
+ self.fs.mv(self.path, self.target)
+
+ def discard(self):
+ self.fs.rm(self.path)
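
The WebHDFS methods filled in above are thin wrappers over the REST API: `_call` builds the request URL and maps `RemoteException` payloads onto Python exceptions, while `WebHDFile` handles the CREATE/APPEND redirect dance for uploads. A hedged usage sketch, not part of the patch; the host and user are placeholders and a reachable namenode on the default port 50070 is assumed:

```python
from fsspec.implementations.webhdfs import WebHDFS

fs = WebHDFS("namenode.example.com", user="testuser")  # placeholder host/user

base = "/user/testuser/some_table/datestamp=2023-11-11"
fs.mkdir(base)                                  # MKDIRS via _call(..., method="put")
with fs.open(f"{base}/part-0.txt", "wb") as f:  # CREATE, then APPEND to the redirect
    f.write(b"This is some content!")

print(fs.ls(base, detail=True))                 # LISTSTATUS, normalised by _process_info

try:
    fs.info("/does/not/exist")                  # GETFILESTATUS
except FileNotFoundError:
    # _call translates the server's FileNotFoundException into FileNotFoundError
    pass
```
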
diff --git a/fsspec/implementations/zip.py b/fsspec/implementations/zip.py
index b37820c..9d9c046 100644
--- a/fsspec/implementations/zip.py
+++ b/fsspec/implementations/zip.py
@@ -1,4 +1,5 @@
import zipfile
+
import fsspec
from fsspec.archive import AbstractArchiveFileSystem
@@ -10,13 +11,22 @@ class ZipFileSystem(AbstractArchiveFileSystem):
This class is pickleable, but not necessarily thread-safe
"""
- root_marker = ''
- protocol = 'zip'
+
+ root_marker = ""
+ protocol = "zip"
cachable = False
- def __init__(self, fo='', mode='r', target_protocol=None,
- target_options=None, compression=zipfile.ZIP_STORED, allowZip64=
- True, compresslevel=None, **kwargs):
+ def __init__(
+ self,
+ fo="",
+ mode="r",
+ target_protocol=None,
+ target_options=None,
+ compression=zipfile.ZIP_STORED,
+ allowZip64=True,
+ compresslevel=None,
+ **kwargs,
+ ):
"""
Parameters
----------
@@ -35,28 +45,90 @@ class ZipFileSystem(AbstractArchiveFileSystem):
Only relevant when creating a ZIP
"""
super().__init__(self, **kwargs)
- if mode not in set('rwa'):
+ if mode not in set("rwa"):
raise ValueError(f"mode '{mode}' no understood")
self.mode = mode
if isinstance(fo, str):
- if mode == 'a':
- m = 'r+b'
+ if mode == "a":
+ m = "r+b"
else:
- m = mode + 'b'
- fo = fsspec.open(fo, mode=m, protocol=target_protocol, **
- target_options or {})
+ m = mode + "b"
+ fo = fsspec.open(
+ fo, mode=m, protocol=target_protocol, **(target_options or {})
+ )
self.force_zip_64 = allowZip64
self.of = fo
- self.fo = fo.__enter__()
- self.zip = zipfile.ZipFile(self.fo, mode=mode, compression=
- compression, allowZip64=allowZip64, compresslevel=compresslevel)
+ self.fo = fo.__enter__() # the whole instance is a context
+ self.zip = zipfile.ZipFile(
+ self.fo,
+ mode=mode,
+ compression=compression,
+ allowZip64=allowZip64,
+ compresslevel=compresslevel,
+ )
self.dir_cache = None
+ @classmethod
+ def _strip_protocol(cls, path):
+ # zip file paths are always relative to the archive root
+ return super()._strip_protocol(path).lstrip("/")
+
def __del__(self):
- if hasattr(self, 'zip'):
+ if hasattr(self, "zip"):
self.close()
del self.zip
def close(self):
"""Commits any write changes to the file. Done on ``del`` too."""
- pass
+ self.zip.close()
+
+ def _get_dirs(self):
+ if self.dir_cache is None or self.mode in set("wa"):
+ # when writing, dir_cache is always in the ZipFile's attributes,
+ # not read from the file.
+ files = self.zip.infolist()
+ self.dir_cache = {
+ dirname.rstrip("/"): {
+ "name": dirname.rstrip("/"),
+ "size": 0,
+ "type": "directory",
+ }
+ for dirname in self._all_dirnames(self.zip.namelist())
+ }
+ for z in files:
+ f = {s: getattr(z, s, None) for s in zipfile.ZipInfo.__slots__}
+ f.update(
+ {
+ "name": z.filename.rstrip("/"),
+ "size": z.file_size,
+ "type": ("directory" if z.is_dir() else "file"),
+ }
+ )
+ self.dir_cache[f["name"]] = f
+
+ def pipe_file(self, path, value, **kwargs):
+ # override upstream, because we know the exact file size in this case
+ self.zip.writestr(path, value, **kwargs)
+
+ def _open(
+ self,
+ path,
+ mode="rb",
+ block_size=None,
+ autocommit=True,
+ cache_options=None,
+ **kwargs,
+ ):
+ path = self._strip_protocol(path)
+ if "r" in mode and self.mode in set("wa"):
+ if self.exists(path):
+ raise OSError("ZipFS can only be open for reading or writing, not both")
+ raise FileNotFoundError(path)
+ if "r" in self.mode and "w" in mode:
+ raise OSError("ZipFS can only be open for reading or writing, not both")
+ out = self.zip.open(path, mode.strip("b"), force_zip64=self.force_zip_64)
+ if "r" in mode:
+ info = self.info(path)
+ out.size = info["size"]
+ out.name = info["name"]
+ return out
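
`ZipFileSystem` now builds its listing from `ZipFile.infolist()` in `_get_dirs`, synthesising entries for intermediate directories and carrying the `ZipInfo` fields through to `info()`. A small local sketch, not part of the patch (the temporary file name is arbitrary):

```python
import os
import tempfile
import zipfile

import fsspec

# Build a small archive on local disk.
fn = os.path.join(tempfile.mkdtemp(), "example.zip")
with zipfile.ZipFile(fn, mode="w") as z:
    z.writestr("a", b"")
    z.writestr("deeply/nested/path", b"stuff")

fs = fsspec.filesystem("zip", fo=fn)

# Intermediate directories are synthesised by _get_dirs/_all_dirnames.
assert fs.isdir("deeply/nested")

info = fs.info("deeply/nested/path")
assert info["size"] == len(b"stuff")
# Zip-specific fields from ZipInfo.__slots__ are carried through as well.
assert "CRC" in info and "compress_size" in info
```
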
diff --git a/fsspec/json.py b/fsspec/json.py
index 54f0af3..69cead0 100644
--- a/fsspec/json.py
+++ b/fsspec/json.py
@@ -1,7 +1,18 @@
import json
from contextlib import suppress
from pathlib import PurePath
-from typing import Any, Callable, ClassVar, Dict, List, Mapping, Optional, Sequence, Tuple
+from typing import (
+ Any,
+ Callable,
+ ClassVar,
+ Dict,
+ List,
+ Mapping,
+ Optional,
+ Sequence,
+ Tuple,
+)
+
from .registry import _import_class, get_filesystem_class
from .spec import AbstractFileSystem
@@ -9,30 +20,102 @@ from .spec import AbstractFileSystem
class FilesystemJSONEncoder(json.JSONEncoder):
include_password: ClassVar[bool] = True
- def make_serializable(self, obj: Any) ->Any:
+ def default(self, o: Any) -> Any:
+ if isinstance(o, AbstractFileSystem):
+ return o.to_dict(include_password=self.include_password)
+ if isinstance(o, PurePath):
+ cls = type(o)
+ return {"cls": f"{cls.__module__}.{cls.__name__}", "str": str(o)}
+
+ return super().default(o)
+
+ def make_serializable(self, obj: Any) -> Any:
"""
Recursively converts an object so that it can be JSON serialized via
:func:`json.dumps` and :func:`json.dump`, without actually calling
said functions.
"""
- pass
+ if isinstance(obj, (str, int, float, bool)):
+ return obj
+ if isinstance(obj, Mapping):
+ return {k: self.make_serializable(v) for k, v in obj.items()}
+ if isinstance(obj, Sequence):
+ return [self.make_serializable(v) for v in obj]
+ return self.default(obj)
-class FilesystemJSONDecoder(json.JSONDecoder):
- def __init__(self, *, object_hook: Optional[Callable[[Dict[str, Any]],
- Any]]=None, parse_float: Optional[Callable[[str], Any]]=None,
- parse_int: Optional[Callable[[str], Any]]=None, parse_constant:
- Optional[Callable[[str], Any]]=None, strict: bool=True,
- object_pairs_hook: Optional[Callable[[List[Tuple[str, Any]]], Any]]
- =None) ->None:
+class FilesystemJSONDecoder(json.JSONDecoder):
+ def __init__(
+ self,
+ *,
+ object_hook: Optional[Callable[[Dict[str, Any]], Any]] = None,
+ parse_float: Optional[Callable[[str], Any]] = None,
+ parse_int: Optional[Callable[[str], Any]] = None,
+ parse_constant: Optional[Callable[[str], Any]] = None,
+ strict: bool = True,
+ object_pairs_hook: Optional[Callable[[List[Tuple[str, Any]]], Any]] = None,
+ ) -> None:
self.original_object_hook = object_hook
- super().__init__(object_hook=self.custom_object_hook, parse_float=
- parse_float, parse_int=parse_int, parse_constant=parse_constant,
- strict=strict, object_pairs_hook=object_pairs_hook)
- def unmake_serializable(self, obj: Any) ->Any:
+ super().__init__(
+ object_hook=self.custom_object_hook,
+ parse_float=parse_float,
+ parse_int=parse_int,
+ parse_constant=parse_constant,
+ strict=strict,
+ object_pairs_hook=object_pairs_hook,
+ )
+
+ @classmethod
+ def try_resolve_path_cls(cls, dct: Dict[str, Any]):
+ with suppress(Exception):
+ fqp = dct["cls"]
+
+ path_cls = _import_class(fqp)
+
+ if issubclass(path_cls, PurePath):
+ return path_cls
+
+ return None
+
+ @classmethod
+ def try_resolve_fs_cls(cls, dct: Dict[str, Any]):
+ with suppress(Exception):
+ if "cls" in dct:
+ try:
+ fs_cls = _import_class(dct["cls"])
+ if issubclass(fs_cls, AbstractFileSystem):
+ return fs_cls
+ except Exception:
+ if "protocol" in dct: # Fallback if cls cannot be imported
+ return get_filesystem_class(dct["protocol"])
+
+ raise
+
+ return None
+
+ def custom_object_hook(self, dct: Dict[str, Any]):
+ if "cls" in dct:
+ if (obj_cls := self.try_resolve_fs_cls(dct)) is not None:
+ return AbstractFileSystem.from_dict(dct)
+ if (obj_cls := self.try_resolve_path_cls(dct)) is not None:
+ return obj_cls(dct["str"])
+
+ if self.original_object_hook is not None:
+ return self.original_object_hook(dct)
+
+ return dct
+
+ def unmake_serializable(self, obj: Any) -> Any:
"""
Inverse function of :meth:`FilesystemJSONEncoder.make_serializable`.
"""
- pass
+ if isinstance(obj, dict):
+ obj = self.custom_object_hook(obj)
+ if isinstance(obj, dict):
+ return {k: self.unmake_serializable(v) for k, v in obj.items()}
+ if isinstance(obj, (list, tuple)):
+ return [self.unmake_serializable(v) for v in obj]
+
+ return obj
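
Together, the encoder and decoder let filesystem instances and `PurePath` objects round-trip through plain `json` calls. A minimal sketch, not part of the patch; it assumes `AbstractFileSystem.to_dict`/`from_dict` behave as the hooks above expect:

```python
import json
from pathlib import PurePosixPath

from fsspec.implementations.memory import MemoryFileSystem
from fsspec.json import FilesystemJSONDecoder, FilesystemJSONEncoder

fs = MemoryFileSystem()
payload = json.dumps(
    {"fs": fs, "path": PurePosixPath("/tmp/data")}, cls=FilesystemJSONEncoder
)

restored = json.loads(payload, cls=FilesystemJSONDecoder)
assert isinstance(restored["fs"], MemoryFileSystem)    # via try_resolve_fs_cls + from_dict
assert restored["path"] == PurePosixPath("/tmp/data")  # via try_resolve_path_cls
```
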
diff --git a/fsspec/mapping.py b/fsspec/mapping.py
index 05bf237..93ebd1d 100644
--- a/fsspec/mapping.py
+++ b/fsspec/mapping.py
@@ -4,8 +4,10 @@ import posixpath
import warnings
from collections.abc import MutableMapping
from functools import cached_property
+
from fsspec.core import url_to_fs
-logger = logging.getLogger('fsspec.mapping')
+
+logger = logging.getLogger("fsspec.mapping")
class FSMap(MutableMapping):
@@ -36,15 +38,16 @@ class FSMap(MutableMapping):
b'Hello World'
"""
- def __init__(self, root, fs, check=False, create=False,
- missing_exceptions=None):
+ def __init__(self, root, fs, check=False, create=False, missing_exceptions=None):
self.fs = fs
self.root = fs._strip_protocol(root)
- self._root_key_to_str = fs._strip_protocol(posixpath.join(root, 'x'))[:
- -1]
+ self._root_key_to_str = fs._strip_protocol(posixpath.join(root, "x"))[:-1]
if missing_exceptions is None:
- missing_exceptions = (FileNotFoundError, IsADirectoryError,
- NotADirectoryError)
+ missing_exceptions = (
+ FileNotFoundError,
+ IsADirectoryError,
+ NotADirectoryError,
+ )
self.missing_exceptions = missing_exceptions
self.check = check
self.create = create
@@ -54,21 +57,29 @@ class FSMap(MutableMapping):
if check:
if not self.fs.exists(root):
raise ValueError(
- f'Path {root} does not exist. Create with the ``create=True`` keyword'
- )
- self.fs.touch(root + '/a')
- self.fs.rm(root + '/a')
+ f"Path {root} does not exist. Create "
+ f" with the ``create=True`` keyword"
+ )
+ self.fs.touch(root + "/a")
+ self.fs.rm(root + "/a")
@cached_property
def dirfs(self):
"""dirfs instance that can be used with the same keys as the mapper"""
- pass
+ from .implementations.dirfs import DirFileSystem
+
+ return DirFileSystem(path=self._root_key_to_str, fs=self.fs)
def clear(self):
"""Remove all keys below root - empties out mapping"""
- pass
+ logger.info("Clear mapping at %s", self.root)
+ try:
+ self.fs.rm(self.root, True)
+ self.fs.mkdir(self.root)
+ except: # noqa: E722
+ pass
- def getitems(self, keys, on_error='raise'):
+ def getitems(self, keys, on_error="raise"):
"""Fetch multiple items from the store
If the backend is async-able, this might proceed concurrently
@@ -88,7 +99,23 @@ class FSMap(MutableMapping):
-------
dict(key, bytes|exception)
"""
- pass
+ keys2 = [self._key_to_str(k) for k in keys]
+ oe = on_error if on_error == "raise" else "return"
+ try:
+ out = self.fs.cat(keys2, on_error=oe)
+ if isinstance(out, bytes):
+ out = {keys2[0]: out}
+ except self.missing_exceptions as e:
+ raise KeyError from e
+ out = {
+ k: (KeyError() if isinstance(v, self.missing_exceptions) else v)
+ for k, v in out.items()
+ }
+ return {
+ key: out[k2]
+ for key, k2 in zip(keys, keys2)
+ if on_error == "return" or not isinstance(out[k2], BaseException)
+ }
def setitems(self, values_dict):
"""Set the values of multiple items in the store
@@ -97,19 +124,29 @@ class FSMap(MutableMapping):
----------
values_dict: dict(str, bytes)
"""
- pass
+ values = {self._key_to_str(k): maybe_convert(v) for k, v in values_dict.items()}
+ self.fs.pipe(values)
def delitems(self, keys):
"""Remove multiple keys from the store"""
- pass
+ self.fs.rm([self._key_to_str(k) for k in keys])
def _key_to_str(self, key):
"""Generate full path for the key"""
- pass
+ if not isinstance(key, str):
+ # raise TypeError("key must be of type `str`, got `{type(key).__name__}`"
+ warnings.warn(
+ "from fsspec 2023.5 onward FSMap non-str keys will raise TypeError",
+ DeprecationWarning,
+ )
+ if isinstance(key, list):
+ key = tuple(key)
+ key = str(key)
+ return f"{self._root_key_to_str}{key}".rstrip("/")
def _str_to_key(self, s):
"""Strip path of to leave key name"""
- pass
+ return s[len(self.root) :].lstrip("/")
def __getitem__(self, key, default=None):
"""Retrieve data"""
@@ -124,7 +161,12 @@ class FSMap(MutableMapping):
def pop(self, key, default=None):
"""Pop data"""
- pass
+ result = self.__getitem__(key, default)
+ try:
+ del self[key]
+ except KeyError:
+ pass
+ return result
def __setitem__(self, key, value):
"""Store value in key"""
@@ -142,7 +184,7 @@ class FSMap(MutableMapping):
"""Remove key"""
try:
self.fs.rm(self._key_to_str(key))
- except:
+ except: # noqa: E722
raise KeyError
def __contains__(self, key):
@@ -151,12 +193,28 @@ class FSMap(MutableMapping):
return self.fs.isfile(path)
def __reduce__(self):
- return FSMap, (self.root, self.fs, False, False, self.
- missing_exceptions)
-
-
-def get_mapper(url='', check=False, create=False, missing_exceptions=None,
- alternate_root=None, **kwargs):
+ return FSMap, (self.root, self.fs, False, False, self.missing_exceptions)
+
+
+def maybe_convert(value):
+ if isinstance(value, array.array) or hasattr(value, "__array__"):
+ # bytes-like things
+ if hasattr(value, "dtype") and value.dtype.kind in "Mm":
+            # The buffer interface doesn't support datetime64/timedelta64 numpy
+ # arrays
+ value = value.view("int64")
+ value = bytes(memoryview(value))
+ return value
+
+
+def get_mapper(
+ url="",
+ check=False,
+ create=False,
+ missing_exceptions=None,
+ alternate_root=None,
+ **kwargs,
+):
"""Create key-value interface for given URL and options
The URL will be of the form "protocol://location" and point to the root
@@ -187,4 +245,7 @@ def get_mapper(url='', check=False, create=False, missing_exceptions=None,
-------
``FSMap`` instance, the dict-like key-value store.
"""
- pass
+ # Removing protocol here - could defer to each open() on the backend
+ fs, urlpath = url_to_fs(url, **kwargs)
+ root = alternate_root if alternate_root is not None else urlpath
+ return FSMap(root, fs, check, create, missing_exceptions=missing_exceptions)
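
With `getitems`, `setitems` and `_key_to_str` implemented, `get_mapper` gives the usual dict-like view over a filesystem root. A short sketch against the in-memory filesystem (not part of the patch; the root name is arbitrary):

```python
import fsspec

m = fsspec.get_mapper("memory://mapper-demo")

m["a/b"] = b"hello"          # stored at memory://mapper-demo/a/b (see _key_to_str)
assert m["a/b"] == b"hello"
assert "a/b" in m            # __contains__ -> fs.isfile
assert dict(m.getitems(["a/b"])) == {"a/b": b"hello"}  # batched fetch via fs.cat
```
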
diff --git a/fsspec/parquet.py b/fsspec/parquet.py
index be64f8a..5a0fb95 100644
--- a/fsspec/parquet.py
+++ b/fsspec/parquet.py
@@ -1,13 +1,34 @@
import io
import json
import warnings
+
from .core import url_to_fs
from .utils import merge_offset_ranges
+# Parquet-Specific Utilities for fsspec
+#
+# Most of the functions defined in this module are NOT
+# intended for public consumption. The only exception
+# to this is `open_parquet_file`, which should be used
+# place of `fs.open()` to open parquet-formatted files
+# on remote file systems.
+
-def open_parquet_file(path, mode='rb', fs=None, metadata=None, columns=None,
- row_groups=None, storage_options=None, strict=False, engine='auto',
- max_gap=64000, max_block=256000000, footer_sample_size=1000000, **kwargs):
+def open_parquet_file(
+ path,
+ mode="rb",
+ fs=None,
+ metadata=None,
+ columns=None,
+ row_groups=None,
+ storage_options=None,
+ strict=False,
+ engine="auto",
+ max_gap=64_000,
+ max_block=256_000_000,
+ footer_sample_size=1_000_000,
+ **kwargs,
+):
"""
Return a file-like object for a single Parquet file.
@@ -71,40 +92,450 @@ def open_parquet_file(path, mode='rb', fs=None, metadata=None, columns=None,
**kwargs :
Optional key-word arguments to pass to `fs.open`
"""
- pass
+ # Make sure we have an `AbstractFileSystem` object
+ # to work with
+ if fs is None:
+ fs = url_to_fs(path, **(storage_options or {}))[0]
+
+ # For now, `columns == []` not supported. Just use
+ # default `open` command with `path` input
+ if columns is not None and len(columns) == 0:
+ return fs.open(path, mode=mode)
+
+ # Set the engine
+ engine = _set_engine(engine)
+
+ # Fetch the known byte ranges needed to read
+ # `columns` and/or `row_groups`
+ data = _get_parquet_byte_ranges(
+ [path],
+ fs,
+ metadata=metadata,
+ columns=columns,
+ row_groups=row_groups,
+ engine=engine,
+ max_gap=max_gap,
+ max_block=max_block,
+ footer_sample_size=footer_sample_size,
+ )
+
+ # Extract file name from `data`
+ fn = next(iter(data)) if data else path
-def _get_parquet_byte_ranges(paths, fs, metadata=None, columns=None,
- row_groups=None, max_gap=64000, max_block=256000000, footer_sample_size
- =1000000, engine='auto'):
+ # Call self.open with "parts" caching
+ options = kwargs.pop("cache_options", {}).copy()
+ return fs.open(
+ fn,
+ mode=mode,
+ cache_type="parts",
+ cache_options={
+ **options,
+ "data": data.get(fn, {}),
+ "strict": strict,
+ },
+ **kwargs,
+ )
+
+
+def _get_parquet_byte_ranges(
+ paths,
+ fs,
+ metadata=None,
+ columns=None,
+ row_groups=None,
+ max_gap=64_000,
+ max_block=256_000_000,
+ footer_sample_size=1_000_000,
+ engine="auto",
+):
"""Get a dictionary of the known byte ranges needed
to read a specific column/row-group selection from a
Parquet dataset. Each value in the output dictionary
is intended for use as the `data` argument for the
`KnownPartsOfAFile` caching strategy of a single path.
"""
- pass
+ # Set engine if necessary
+ if isinstance(engine, str):
+ engine = _set_engine(engine)
+
+ # Pass to specialized function if metadata is defined
+ if metadata is not None:
+ # Use the provided parquet metadata object
+ # to avoid transferring/parsing footer metadata
+ return _get_parquet_byte_ranges_from_metadata(
+ metadata,
+ fs,
+ engine,
+ columns=columns,
+ row_groups=row_groups,
+ max_gap=max_gap,
+ max_block=max_block,
+ )
+
+ # Get file sizes asynchronously
+ file_sizes = fs.sizes(paths)
+
+ # Populate global paths, starts, & ends
+ result = {}
+ data_paths = []
+ data_starts = []
+ data_ends = []
+ add_header_magic = True
+ if columns is None and row_groups is None:
+ # We are NOT selecting specific columns or row-groups.
+ #
+ # We can avoid sampling the footers, and just transfer
+ # all file data with cat_ranges
+ for i, path in enumerate(paths):
+ result[path] = {}
+ for b in range(0, file_sizes[i], max_block):
+ data_paths.append(path)
+ data_starts.append(b)
+ data_ends.append(min(b + max_block, file_sizes[i]))
+ add_header_magic = False # "Magic" should already be included
+ else:
+ # We ARE selecting specific columns or row-groups.
+ #
+ # Gather file footers.
+ # We just take the last `footer_sample_size` bytes of each
+ # file (or the entire file if it is smaller than that)
+ footer_starts = []
+ footer_ends = []
+ for i, path in enumerate(paths):
+ footer_ends.append(file_sizes[i])
+ sample_size = max(0, file_sizes[i] - footer_sample_size)
+ footer_starts.append(sample_size)
+ footer_samples = fs.cat_ranges(paths, footer_starts, footer_ends)
+
+ # Check our footer samples and re-sample if necessary.
+ missing_footer_starts = footer_starts.copy()
+ large_footer = 0
+ for i, path in enumerate(paths):
+ footer_size = int.from_bytes(footer_samples[i][-8:-4], "little")
+ real_footer_start = file_sizes[i] - (footer_size + 8)
+ if real_footer_start < footer_starts[i]:
+ missing_footer_starts[i] = real_footer_start
+ large_footer = max(large_footer, (footer_size + 8))
+ if large_footer:
+ warnings.warn(
+ f"Not enough data was used to sample the parquet footer. "
+ f"Try setting footer_sample_size >= {large_footer}."
+ )
+ for i, block in enumerate(
+ fs.cat_ranges(
+ paths,
+ missing_footer_starts,
+ footer_starts,
+ )
+ ):
+ footer_samples[i] = block + footer_samples[i]
+ footer_starts[i] = missing_footer_starts[i]
-def _get_parquet_byte_ranges_from_metadata(metadata, fs, engine, columns=
- None, row_groups=None, max_gap=64000, max_block=256000000):
+ # Calculate required byte ranges for each path
+ for i, path in enumerate(paths):
+ # Deal with small-file case.
+ # Just include all remaining bytes of the file
+ # in a single range.
+ if file_sizes[i] < max_block:
+ if footer_starts[i] > 0:
+ # Only need to transfer the data if the
+ # footer sample isn't already the whole file
+ data_paths.append(path)
+ data_starts.append(0)
+ data_ends.append(footer_starts[i])
+ continue
+
+ # Use "engine" to collect data byte ranges
+ path_data_starts, path_data_ends = engine._parquet_byte_ranges(
+ columns,
+ row_groups=row_groups,
+ footer=footer_samples[i],
+ footer_start=footer_starts[i],
+ )
+
+ data_paths += [path] * len(path_data_starts)
+ data_starts += path_data_starts
+ data_ends += path_data_ends
+
+ # Merge adjacent offset ranges
+ data_paths, data_starts, data_ends = merge_offset_ranges(
+ data_paths,
+ data_starts,
+ data_ends,
+ max_gap=max_gap,
+ max_block=max_block,
+ sort=False, # Should already be sorted
+ )
+
+ # Start by populating `result` with footer samples
+ for i, path in enumerate(paths):
+ result[path] = {(footer_starts[i], footer_ends[i]): footer_samples[i]}
+
+ # Transfer the data byte-ranges into local memory
+ _transfer_ranges(fs, result, data_paths, data_starts, data_ends)
+
+ # Add b"PAR1" to header if necessary
+ if add_header_magic:
+ _add_header_magic(result)
+
+ return result
+
+
+def _get_parquet_byte_ranges_from_metadata(
+ metadata,
+ fs,
+ engine,
+ columns=None,
+ row_groups=None,
+ max_gap=64_000,
+ max_block=256_000_000,
+):
"""Simplified version of `_get_parquet_byte_ranges` for
the case that an engine-specific `metadata` object is
provided, and the remote footer metadata does not need to
be transferred before calculating the required byte ranges.
"""
- pass
+
+ # Use "engine" to collect data byte ranges
+ data_paths, data_starts, data_ends = engine._parquet_byte_ranges(
+ columns,
+ row_groups=row_groups,
+ metadata=metadata,
+ )
+
+ # Merge adjacent offset ranges
+ data_paths, data_starts, data_ends = merge_offset_ranges(
+ data_paths,
+ data_starts,
+ data_ends,
+ max_gap=max_gap,
+ max_block=max_block,
+ sort=False, # Should be sorted
+ )
+
+ # Transfer the data byte-ranges into local memory
+ result = {fn: {} for fn in list(set(data_paths))}
+ _transfer_ranges(fs, result, data_paths, data_starts, data_ends)
+
+ # Add b"PAR1" to header
+ _add_header_magic(result)
+
+ return result
+
+
+def _transfer_ranges(fs, blocks, paths, starts, ends):
+ # Use cat_ranges to gather the data byte_ranges
+ ranges = (paths, starts, ends)
+ for path, start, stop, data in zip(*ranges, fs.cat_ranges(*ranges)):
+ blocks[path][(start, stop)] = data
+
+
+def _add_header_magic(data):
+ # Add b"PAR1" to file headers
+ for path in list(data.keys()):
+ add_magic = True
+ for k in data[path].keys():
+ if k[0] == 0 and k[1] >= 4:
+ add_magic = False
+ break
+ if add_magic:
+ data[path][(0, 4)] = b"PAR1"
+
+
+def _set_engine(engine_str):
+ # Define a list of parquet engines to try
+ if engine_str == "auto":
+ try_engines = ("fastparquet", "pyarrow")
+ elif not isinstance(engine_str, str):
+ raise ValueError(
+ "Failed to set parquet engine! "
+ "Please pass 'fastparquet', 'pyarrow', or 'auto'"
+ )
+ elif engine_str not in ("fastparquet", "pyarrow"):
+ raise ValueError(f"{engine_str} engine not supported by `fsspec.parquet`")
+ else:
+ try_engines = [engine_str]
+
+ # Try importing the engines in `try_engines`,
+ # and choose the first one that succeeds
+ for engine in try_engines:
+ try:
+ if engine == "fastparquet":
+ return FastparquetEngine()
+ elif engine == "pyarrow":
+ return PyarrowEngine()
+ except ImportError:
+ pass
+
+ # Raise an error if a supported parquet engine
+ # was not found
+ raise ImportError(
+ f"The following parquet engines are not installed "
+ f"in your python environment: {try_engines}."
+ f"Please install 'fastparquert' or 'pyarrow' to "
+ f"utilize the `fsspec.parquet` module."
+ )
class FastparquetEngine:
+ # The purpose of the FastparquetEngine class is
+ # to check if fastparquet can be imported (on initialization)
+ # and to define a `_parquet_byte_ranges` method. In the
+ # future, this class may also be used to define other
+ # methods/logic that are specific to fastparquet.
def __init__(self):
import fastparquet as fp
+
self.fp = fp
+ def _row_group_filename(self, row_group, pf):
+ return pf.row_group_filename(row_group)
+
+ def _parquet_byte_ranges(
+ self,
+ columns,
+ row_groups=None,
+ metadata=None,
+ footer=None,
+ footer_start=None,
+ ):
+        # Initialize offset ranges and define ParquetFile metadata
+ pf = metadata
+ data_paths, data_starts, data_ends = [], [], []
+ if pf is None:
+ pf = self.fp.ParquetFile(io.BytesIO(footer))
+
+ # Convert columns to a set and add any index columns
+ # specified in the pandas metadata (just in case)
+ column_set = None if columns is None else set(columns)
+ if column_set is not None and hasattr(pf, "pandas_metadata"):
+ md_index = [
+ ind
+ for ind in pf.pandas_metadata.get("index_columns", [])
+ # Ignore RangeIndex information
+ if not isinstance(ind, dict)
+ ]
+ column_set |= set(md_index)
+
+ # Check if row_groups is a list of integers
+ # or a list of row-group metadata
+ if row_groups and not isinstance(row_groups[0], int):
+ # Input row_groups contains row-group metadata
+ row_group_indices = None
+ else:
+ # Input row_groups contains row-group indices
+ row_group_indices = row_groups
+ row_groups = pf.row_groups
+
+ # Loop through column chunks to add required byte ranges
+ for r, row_group in enumerate(row_groups):
+ # Skip this row-group if we are targeting
+ # specific row-groups
+ if row_group_indices is None or r in row_group_indices:
+ # Find the target parquet-file path for `row_group`
+ fn = self._row_group_filename(row_group, pf)
+
+ for column in row_group.columns:
+ name = column.meta_data.path_in_schema[0]
+ # Skip this column if we are targeting a
+                    # specific set of columns
+ if column_set is None or name in column_set:
+ file_offset0 = column.meta_data.dictionary_page_offset
+ if file_offset0 is None:
+ file_offset0 = column.meta_data.data_page_offset
+ num_bytes = column.meta_data.total_compressed_size
+ if footer_start is None or file_offset0 < footer_start:
+ data_paths.append(fn)
+ data_starts.append(file_offset0)
+ data_ends.append(
+ min(
+ file_offset0 + num_bytes,
+ footer_start or (file_offset0 + num_bytes),
+ )
+ )
+
+ if metadata:
+ # The metadata in this call may map to multiple
+ # file paths. Need to include `data_paths`
+ return data_paths, data_starts, data_ends
+ return data_starts, data_ends
+
class PyarrowEngine:
+ # The purpose of the PyarrowEngine class is
+ # to check if pyarrow can be imported (on initialization)
+ # and to define a `_parquet_byte_ranges` method. In the
+ # future, this class may also be used to define other
+ # methods/logic that are specific to pyarrow.
def __init__(self):
import pyarrow.parquet as pq
+
self.pq = pq
+
+ def _row_group_filename(self, row_group, metadata):
+ raise NotImplementedError
+
+ def _parquet_byte_ranges(
+ self,
+ columns,
+ row_groups=None,
+ metadata=None,
+ footer=None,
+ footer_start=None,
+ ):
+ if metadata is not None:
+ raise ValueError("metadata input not supported for PyarrowEngine")
+
+ data_starts, data_ends = [], []
+ md = self.pq.ParquetFile(io.BytesIO(footer)).metadata
+
+ # Convert columns to a set and add any index columns
+ # specified in the pandas metadata (just in case)
+ column_set = None if columns is None else set(columns)
+ if column_set is not None:
+ schema = md.schema.to_arrow_schema()
+ has_pandas_metadata = (
+ schema.metadata is not None and b"pandas" in schema.metadata
+ )
+ if has_pandas_metadata:
+ md_index = [
+ ind
+ for ind in json.loads(
+ schema.metadata[b"pandas"].decode("utf8")
+ ).get("index_columns", [])
+ # Ignore RangeIndex information
+ if not isinstance(ind, dict)
+ ]
+ column_set |= set(md_index)
+
+ # Loop through column chunks to add required byte ranges
+ for r in range(md.num_row_groups):
+ # Skip this row-group if we are targeting
+ # specific row-groups
+ if row_groups is None or r in row_groups:
+ row_group = md.row_group(r)
+ for c in range(row_group.num_columns):
+ column = row_group.column(c)
+ name = column.path_in_schema
+ # Skip this column if we are targeting a
+                        # specific set of columns
+ split_name = name.split(".")[0]
+ if (
+ column_set is None
+ or name in column_set
+ or split_name in column_set
+ ):
+ file_offset0 = column.dictionary_page_offset
+ if file_offset0 is None:
+ file_offset0 = column.data_page_offset
+ num_bytes = column.total_compressed_size
+ if file_offset0 < footer_start:
+ data_starts.append(file_offset0)
+ data_ends.append(
+ min(file_offset0 + num_bytes, footer_start)
+ )
+ return data_starts, data_ends
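
`open_parquet_file` transfers only the footer plus the byte ranges needed for the requested columns/row-groups, then serves them through the "parts" cache so a parquet reader never triggers further remote reads. A hedged usage sketch, not part of the patch; the S3 path is a placeholder and `s3fs`, `pyarrow` and `pandas` are assumed to be installed:

```python
import pandas as pd

import fsspec.parquet

with fsspec.parquet.open_parquet_file(
    "s3://example-bucket/dataset/part.0.parquet",  # placeholder path
    columns=["x", "y"],
    row_groups=[0],
    engine="pyarrow",
) as f:
    # Only the pre-fetched ranges are read; the reader sees a normal file object.
    df = pd.read_parquet(f, columns=["x", "y"])
```
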
diff --git a/fsspec/registry.py b/fsspec/registry.py
index e2de702..c261b9b 100644
--- a/fsspec/registry.py
+++ b/fsspec/registry.py
@@ -1,11 +1,17 @@
from __future__ import annotations
+
import importlib
import types
import warnings
-__all__ = ['registry', 'get_filesystem_class', 'default']
+
+__all__ = ["registry", "get_filesystem_class", "default"]
+
+# internal, mutable
_registry: dict[str, type] = {}
+
+# external, immutable
registry = types.MappingProxyType(_registry)
-default = 'file'
+default = "file"
def register_implementation(name, cls, clobber=False, errtxt=None):
@@ -28,94 +34,189 @@ def register_implementation(name, cls, clobber=False, errtxt=None):
If given, then a failure to import the given class will result in this
text being given.
"""
- pass
-
-
-known_implementations = {'abfs': {'class': 'adlfs.AzureBlobFileSystem',
- 'err':
- 'Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage'},
- 'adl': {'class': 'adlfs.AzureDatalakeFileSystem', 'err':
- 'Install adlfs to access Azure Datalake Gen1'}, 'arrow_hdfs': {'class':
- 'fsspec.implementations.arrow.HadoopFileSystem', 'err':
- 'pyarrow and local java libraries required for HDFS'}, 'asynclocal': {
- 'class': 'morefs.asyn_local.AsyncLocalFileSystem', 'err':
- "Install 'morefs[asynclocalfs]' to use AsyncLocalFileSystem"}, 'az': {
- 'class': 'adlfs.AzureBlobFileSystem', 'err':
- 'Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage'},
- 'blockcache': {'class':
- 'fsspec.implementations.cached.CachingFileSystem'}, 'box': {'class':
- 'boxfs.BoxFileSystem', 'err':
- 'Please install boxfs to access BoxFileSystem'}, 'cached': {'class':
- 'fsspec.implementations.cached.CachingFileSystem'}, 'dask': {'class':
- 'fsspec.implementations.dask.DaskWorkerFileSystem', 'err':
- 'Install dask distributed to access worker file system'}, 'data': {
- 'class': 'fsspec.implementations.data.DataFileSystem'}, 'dbfs': {
- 'class': 'fsspec.implementations.dbfs.DatabricksFileSystem', 'err':
- 'Install the requests package to use the DatabricksFileSystem'}, 'dir':
- {'class': 'fsspec.implementations.dirfs.DirFileSystem'}, 'dropbox': {
- 'class': 'dropboxdrivefs.DropboxDriveFileSystem', 'err':
- 'DropboxFileSystem requires "dropboxdrivefs","requests" and ""dropbox" to be installed'
- }, 'dvc': {'class': 'dvc.api.DVCFileSystem', 'err':
- 'Install dvc to access DVCFileSystem'}, 'file': {'class':
- 'fsspec.implementations.local.LocalFileSystem'}, 'filecache': {'class':
- 'fsspec.implementations.cached.WholeFileCacheFileSystem'}, 'ftp': {
- 'class': 'fsspec.implementations.ftp.FTPFileSystem'}, 'gcs': {'class':
- 'gcsfs.GCSFileSystem', 'err':
- 'Please install gcsfs to access Google Storage'}, 'gdrive': {'class':
- 'gdrivefs.GoogleDriveFileSystem', 'err':
- 'Please install gdrivefs for access to Google Drive'}, 'generic': {
- 'class': 'fsspec.generic.GenericFileSystem'}, 'git': {'class':
- 'fsspec.implementations.git.GitFileSystem', 'err':
- 'Install pygit2 to browse local git repos'}, 'github': {'class':
- 'fsspec.implementations.github.GithubFileSystem', 'err':
- 'Install the requests package to use the github FS'}, 'gs': {'class':
- 'gcsfs.GCSFileSystem', 'err':
- 'Please install gcsfs to access Google Storage'}, 'hdfs': {'class':
- 'fsspec.implementations.arrow.HadoopFileSystem', 'err':
- 'pyarrow and local java libraries required for HDFS'}, 'hf': {'class':
- 'huggingface_hub.HfFileSystem', 'err':
- 'Install huggingface_hub to access HfFileSystem'}, 'http': {'class':
- 'fsspec.implementations.http.HTTPFileSystem', 'err':
- 'HTTPFileSystem requires "requests" and "aiohttp" to be installed'},
- 'https': {'class': 'fsspec.implementations.http.HTTPFileSystem', 'err':
- 'HTTPFileSystem requires "requests" and "aiohttp" to be installed'},
- 'jlab': {'class': 'fsspec.implementations.jupyter.JupyterFileSystem',
- 'err': 'Jupyter FS requires requests to be installed'}, 'jupyter': {
- 'class': 'fsspec.implementations.jupyter.JupyterFileSystem', 'err':
- 'Jupyter FS requires requests to be installed'}, 'lakefs': {'class':
- 'lakefs_spec.LakeFSFileSystem', 'err':
- 'Please install lakefs-spec to access LakeFSFileSystem'}, 'libarchive':
- {'class': 'fsspec.implementations.libarchive.LibArchiveFileSystem',
- 'err': 'LibArchive requires to be installed'}, 'local': {'class':
- 'fsspec.implementations.local.LocalFileSystem'}, 'memory': {'class':
- 'fsspec.implementations.memory.MemoryFileSystem'}, 'oci': {'class':
- 'ocifs.OCIFileSystem', 'err':
- 'Install ocifs to access OCI Object Storage'}, 'ocilake': {'class':
- 'ocifs.OCIFileSystem', 'err': 'Install ocifs to access OCI Data Lake'},
- 'oss': {'class': 'ossfs.OSSFileSystem', 'err':
- 'Install ossfs to access Alibaba Object Storage System'}, 'reference':
- {'class': 'fsspec.implementations.reference.ReferenceFileSystem'},
- 'root': {'class': 'fsspec_xrootd.XRootDFileSystem', 'err':
- "Install fsspec-xrootd to access xrootd storage system. Note: 'root' is the protocol name for xrootd storage systems, not referring to root directories"
- }, 's3': {'class': 's3fs.S3FileSystem', 'err':
- 'Install s3fs to access S3'}, 's3a': {'class': 's3fs.S3FileSystem',
- 'err': 'Install s3fs to access S3'}, 'sftp': {'class':
- 'fsspec.implementations.sftp.SFTPFileSystem', 'err':
- 'SFTPFileSystem requires "paramiko" to be installed'}, 'simplecache': {
- 'class': 'fsspec.implementations.cached.SimpleCacheFileSystem'}, 'smb':
- {'class': 'fsspec.implementations.smb.SMBFileSystem', 'err':
- 'SMB requires "smbprotocol" or "smbprotocol[kerberos]" installed'},
- 'ssh': {'class': 'fsspec.implementations.sftp.SFTPFileSystem', 'err':
- 'SFTPFileSystem requires "paramiko" to be installed'}, 'tar': {'class':
- 'fsspec.implementations.tar.TarFileSystem'}, 'wandb': {'class':
- 'wandbfs.WandbFS', 'err': 'Install wandbfs to access wandb'}, 'webdav':
- {'class': 'webdav4.fsspec.WebdavFileSystem', 'err':
- 'Install webdav4 to access WebDAV'}, 'webhdfs': {'class':
- 'fsspec.implementations.webhdfs.WebHDFS', 'err':
- 'webHDFS access requires "requests" to be installed'}, 'zip': {'class':
- 'fsspec.implementations.zip.ZipFileSystem'}}
-assert list(known_implementations) == sorted(known_implementations
- ), 'Not in alphabetical order'
+ if isinstance(cls, str):
+ if name in known_implementations and clobber is False:
+ if cls != known_implementations[name]["class"]:
+ raise ValueError(
+ f"Name ({name}) already in the known_implementations and clobber "
+ f"is False"
+ )
+ else:
+ known_implementations[name] = {
+ "class": cls,
+ "err": errtxt or f"{cls} import failed for protocol {name}",
+ }
+
+ else:
+ if name in registry and clobber is False:
+ if _registry[name] is not cls:
+ raise ValueError(
+ f"Name ({name}) already in the registry and clobber is False"
+ )
+ else:
+ _registry[name] = cls
+
+
+# protocols mapped to the class which implements them. This dict can be
+# updated with register_implementation
+known_implementations = {
+ "abfs": {
+ "class": "adlfs.AzureBlobFileSystem",
+ "err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage",
+ },
+ "adl": {
+ "class": "adlfs.AzureDatalakeFileSystem",
+ "err": "Install adlfs to access Azure Datalake Gen1",
+ },
+ "arrow_hdfs": {
+ "class": "fsspec.implementations.arrow.HadoopFileSystem",
+ "err": "pyarrow and local java libraries required for HDFS",
+ },
+ "asynclocal": {
+ "class": "morefs.asyn_local.AsyncLocalFileSystem",
+ "err": "Install 'morefs[asynclocalfs]' to use AsyncLocalFileSystem",
+ },
+ "az": {
+ "class": "adlfs.AzureBlobFileSystem",
+ "err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage",
+ },
+ "blockcache": {"class": "fsspec.implementations.cached.CachingFileSystem"},
+ "box": {
+ "class": "boxfs.BoxFileSystem",
+ "err": "Please install boxfs to access BoxFileSystem",
+ },
+ "cached": {"class": "fsspec.implementations.cached.CachingFileSystem"},
+ "dask": {
+ "class": "fsspec.implementations.dask.DaskWorkerFileSystem",
+ "err": "Install dask distributed to access worker file system",
+ },
+ "data": {"class": "fsspec.implementations.data.DataFileSystem"},
+ "dbfs": {
+ "class": "fsspec.implementations.dbfs.DatabricksFileSystem",
+ "err": "Install the requests package to use the DatabricksFileSystem",
+ },
+ "dir": {"class": "fsspec.implementations.dirfs.DirFileSystem"},
+ "dropbox": {
+ "class": "dropboxdrivefs.DropboxDriveFileSystem",
+ "err": (
+ 'DropboxFileSystem requires "dropboxdrivefs","requests" and "'
+ '"dropbox" to be installed'
+ ),
+ },
+ "dvc": {
+ "class": "dvc.api.DVCFileSystem",
+ "err": "Install dvc to access DVCFileSystem",
+ },
+ "file": {"class": "fsspec.implementations.local.LocalFileSystem"},
+ "filecache": {"class": "fsspec.implementations.cached.WholeFileCacheFileSystem"},
+ "ftp": {"class": "fsspec.implementations.ftp.FTPFileSystem"},
+ "gcs": {
+ "class": "gcsfs.GCSFileSystem",
+ "err": "Please install gcsfs to access Google Storage",
+ },
+ "gdrive": {
+ "class": "gdrivefs.GoogleDriveFileSystem",
+ "err": "Please install gdrivefs for access to Google Drive",
+ },
+ "generic": {"class": "fsspec.generic.GenericFileSystem"},
+ "git": {
+ "class": "fsspec.implementations.git.GitFileSystem",
+ "err": "Install pygit2 to browse local git repos",
+ },
+ "github": {
+ "class": "fsspec.implementations.github.GithubFileSystem",
+ "err": "Install the requests package to use the github FS",
+ },
+ "gs": {
+ "class": "gcsfs.GCSFileSystem",
+ "err": "Please install gcsfs to access Google Storage",
+ },
+ "hdfs": {
+ "class": "fsspec.implementations.arrow.HadoopFileSystem",
+ "err": "pyarrow and local java libraries required for HDFS",
+ },
+ "hf": {
+ "class": "huggingface_hub.HfFileSystem",
+ "err": "Install huggingface_hub to access HfFileSystem",
+ },
+ "http": {
+ "class": "fsspec.implementations.http.HTTPFileSystem",
+ "err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed',
+ },
+ "https": {
+ "class": "fsspec.implementations.http.HTTPFileSystem",
+ "err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed',
+ },
+ "jlab": {
+ "class": "fsspec.implementations.jupyter.JupyterFileSystem",
+ "err": "Jupyter FS requires requests to be installed",
+ },
+ "jupyter": {
+ "class": "fsspec.implementations.jupyter.JupyterFileSystem",
+ "err": "Jupyter FS requires requests to be installed",
+ },
+ "lakefs": {
+ "class": "lakefs_spec.LakeFSFileSystem",
+ "err": "Please install lakefs-spec to access LakeFSFileSystem",
+ },
+ "libarchive": {
+ "class": "fsspec.implementations.libarchive.LibArchiveFileSystem",
+ "err": "LibArchive requires to be installed",
+ },
+ "local": {"class": "fsspec.implementations.local.LocalFileSystem"},
+ "memory": {"class": "fsspec.implementations.memory.MemoryFileSystem"},
+ "oci": {
+ "class": "ocifs.OCIFileSystem",
+ "err": "Install ocifs to access OCI Object Storage",
+ },
+ "ocilake": {
+ "class": "ocifs.OCIFileSystem",
+ "err": "Install ocifs to access OCI Data Lake",
+ },
+ "oss": {
+ "class": "ossfs.OSSFileSystem",
+ "err": "Install ossfs to access Alibaba Object Storage System",
+ },
+ "reference": {"class": "fsspec.implementations.reference.ReferenceFileSystem"},
+ "root": {
+ "class": "fsspec_xrootd.XRootDFileSystem",
+ "err": (
+ "Install fsspec-xrootd to access xrootd storage system. "
+ "Note: 'root' is the protocol name for xrootd storage systems, "
+ "not referring to root directories"
+ ),
+ },
+ "s3": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"},
+ "s3a": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"},
+ "sftp": {
+ "class": "fsspec.implementations.sftp.SFTPFileSystem",
+ "err": 'SFTPFileSystem requires "paramiko" to be installed',
+ },
+ "simplecache": {"class": "fsspec.implementations.cached.SimpleCacheFileSystem"},
+ "smb": {
+ "class": "fsspec.implementations.smb.SMBFileSystem",
+ "err": 'SMB requires "smbprotocol" or "smbprotocol[kerberos]" installed',
+ },
+ "ssh": {
+ "class": "fsspec.implementations.sftp.SFTPFileSystem",
+ "err": 'SFTPFileSystem requires "paramiko" to be installed',
+ },
+ "tar": {"class": "fsspec.implementations.tar.TarFileSystem"},
+ "wandb": {"class": "wandbfs.WandbFS", "err": "Install wandbfs to access wandb"},
+ "webdav": {
+ "class": "webdav4.fsspec.WebdavFileSystem",
+ "err": "Install webdav4 to access WebDAV",
+ },
+ "webhdfs": {
+ "class": "fsspec.implementations.webhdfs.WebHDFS",
+ "err": 'webHDFS access requires "requests" to be installed',
+ },
+ "zip": {"class": "fsspec.implementations.zip.ZipFileSystem"},
+}
+
+assert list(known_implementations) == sorted(
+ known_implementations
+), "Not in alphabetical order"
def get_filesystem_class(protocol):
@@ -130,7 +231,22 @@ def get_filesystem_class(protocol):
import may fail. In this case, the string in the "err" field of the
``known_implementations`` will be given as the error message.
"""
- pass
+ if not protocol:
+ protocol = default
+
+ if protocol not in registry:
+ if protocol not in known_implementations:
+ raise ValueError(f"Protocol not known: {protocol}")
+ bit = known_implementations[protocol]
+ try:
+ register_implementation(protocol, _import_class(bit["class"]))
+ except ImportError as e:
+ raise ImportError(bit["err"]) from e
+ cls = registry[protocol]
+ if getattr(cls, "protocol", None) in ("abstract", None):
+ cls.protocol = protocol
+
+ return cls
s3_msg = """Your installed version of s3fs is very old and known to cause
@@ -152,7 +268,22 @@ def _import_class(fqp: str):
This can import arbitrary modules. Make sure you haven't installed any modules
that may execute malicious code at import time.
"""
- pass
+ if ":" in fqp:
+ mod, name = fqp.rsplit(":", 1)
+ else:
+ mod, name = fqp.rsplit(".", 1)
+
+ is_s3 = mod == "s3fs"
+ mod = importlib.import_module(mod)
+ if is_s3 and mod.__version__.split(".") < ["0", "5"]:
+ warnings.warn(s3_msg)
+ for part in name.split("."):
+ mod = getattr(mod, part)
+
+ if not isinstance(mod, type):
+ raise TypeError(f"{fqp} is not a class")
+
+ return mod
def filesystem(protocol, **storage_options):
@@ -161,7 +292,15 @@ def filesystem(protocol, **storage_options):
``storage_options`` are specific to the protocol being chosen, and are
passed directly to the class.
"""
- pass
+ if protocol == "arrow_hdfs":
+ warnings.warn(
+ "The 'arrow_hdfs' protocol has been deprecated and will be "
+ "removed in the future. Specify it as 'hdfs'.",
+ DeprecationWarning,
+ )
+
+ cls = get_filesystem_class(protocol)
+ return cls(**storage_options)
def available_protocols():
@@ -169,4 +308,4 @@ def available_protocols():
Note that any given protocol may require extra packages to be importable.
"""
- pass
+ return list(known_implementations)
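
`register_implementation` accepts either a class object or a fully-qualified string (deferring the import until first use), and `filesystem`/`get_filesystem_class` resolve protocols through the same tables. A small sketch, not part of the patch; the "mymem" aliases are made up for illustration:

```python
import fsspec
from fsspec.implementations.memory import MemoryFileSystem
from fsspec.registry import available_protocols, register_implementation

# Eager registration with a class object; clobber=True allows re-registration.
register_implementation("mymem", MemoryFileSystem, clobber=True)

# Lazy registration with a dotted string: the import happens on first use, and
# `errtxt` becomes the ImportError message if that import fails.
register_implementation(
    "mymem2",
    "fsspec.implementations.memory.MemoryFileSystem",
    clobber=True,
    errtxt="install fsspec to use the memory filesystem",
)

fs = fsspec.filesystem("mymem")  # -> get_filesystem_class -> cached instance
assert isinstance(fs, MemoryFileSystem)
assert "mymem2" in available_protocols()
```
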
diff --git a/fsspec/spec.py b/fsspec/spec.py
index 106214a..1463a44 100644
--- a/fsspec/spec.py
+++ b/fsspec/spec.py
@@ -1,4 +1,5 @@
from __future__ import annotations
+
import io
import json
import logging
@@ -10,12 +11,26 @@ from errno import ESPIPE
from glob import has_magic
from hashlib import sha256
from typing import Any, ClassVar, Dict, Tuple
+
from .callbacks import DEFAULT_CALLBACK
from .config import apply_config, conf
from .dircache import DirCache
from .transaction import Transaction
-from .utils import _unstrip_protocol, glob_translate, isfilelike, other_paths, read_block, stringify_path, tokenize
-logger = logging.getLogger('fsspec')
+from .utils import (
+ _unstrip_protocol,
+ glob_translate,
+ isfilelike,
+ other_paths,
+ read_block,
+ stringify_path,
+ tokenize,
+)
+
+logger = logging.getLogger("fsspec")
+
+
+def make_instance(cls, args, kwargs):
+ return cls(*args, **kwargs)
class _Cached(type):
@@ -37,7 +52,11 @@ class _Cached(type):
def __init__(cls, *args, **kwargs):
super().__init__(*args, **kwargs)
- if conf.get('weakref_instance_cache'):
+ # Note: we intentionally create a reference here, to avoid garbage
+ # collecting instances when all other references are gone. To really
+ # delete a FileSystem, the cache must be cleared.
+ if conf.get("weakref_instance_cache"): # pragma: no cover
+ # debug option for analysing fork/spawn conditions
cls._cache = weakref.WeakValueDictionary()
else:
cls._cache = {}
@@ -45,11 +64,13 @@ class _Cached(type):
def __call__(cls, *args, **kwargs):
kwargs = apply_config(cls, kwargs)
- extra_tokens = tuple(getattr(cls, attr, None) for attr in cls.
- _extra_tokenize_attributes)
- token = tokenize(cls, cls._pid, threading.get_ident(), *args, *
- extra_tokens, **kwargs)
- skip = kwargs.pop('skip_instance_cache', False)
+ extra_tokens = tuple(
+ getattr(cls, attr, None) for attr in cls._extra_tokenize_attributes
+ )
+ token = tokenize(
+ cls, cls._pid, threading.get_ident(), *args, *extra_tokens, **kwargs
+ )
+ skip = kwargs.pop("skip_instance_cache", False)
if os.getpid() != cls._pid:
cls._cache.clear()
cls._pid = os.getpid()
@@ -58,12 +79,15 @@ class _Cached(type):
return cls._cache[token]
else:
obj = super().__call__(*args, **kwargs)
+ # Setting _fs_token here causes some static linters to complain.
obj._fs_token_ = token
obj.storage_args = args
obj.storage_options = kwargs
if obj.async_impl and obj.mirror_sync_methods:
from .asyn import mirror_sync_methods
+
mirror_sync_methods(obj)
+
if cls.cachable and not skip:
cls._latest = token
cls._cache[token] = obj
@@ -77,17 +101,22 @@ class AbstractFileSystem(metaclass=_Cached):
Implementations are expected to be compatible with or, better, subclass
from here.
"""
- cachable = True
+
+ cachable = True # this class can be cached, instances reused
_cached = False
- blocksize = 2 ** 22
- sep = '/'
- protocol: ClassVar[str | tuple[str, ...]] = 'abstract'
+ blocksize = 2**22
+ sep = "/"
+ protocol: ClassVar[str | tuple[str, ...]] = "abstract"
_latest = None
async_impl = False
mirror_sync_methods = False
- root_marker = ''
+ root_marker = "" # For some FSs, may require leading '/' or other character
transaction_type = Transaction
+
+ #: Extra *class attributes* that should be considered when hashing.
_extra_tokenize_attributes = ()
+
+ # Set by _Cached metaclass
storage_args: Tuple[Any, ...]
storage_options: Dict[str, Any]
@@ -116,16 +145,20 @@ class AbstractFileSystem(metaclass=_Cached):
loop: asyncio-compatible IOLoop or None
"""
if self._cached:
+ # reusing instance, don't change
return
self._cached = True
self._intrans = False
self._transaction = None
self._invalidated_caches_in_transaction = []
self.dircache = DirCache(**storage_options)
- if storage_options.pop('add_docs', None):
- warnings.warn('add_docs is no longer supported.', FutureWarning)
- if storage_options.pop('add_aliases', None):
- warnings.warn('add_aliases has been removed.', FutureWarning)
+
+ if storage_options.pop("add_docs", None):
+ warnings.warn("add_docs is no longer supported.", FutureWarning)
+
+ if storage_options.pop("add_aliases", None):
+ warnings.warn("add_aliases has been removed.", FutureWarning)
+ # This is set in _Cached
self._fs_token_ = None
@property
@@ -133,7 +166,11 @@ class AbstractFileSystem(metaclass=_Cached):
"""Persistent filesystem id that can be used to compare filesystems
across sessions.
"""
- pass
+ raise NotImplementedError
+
+ @property
+ def _fs_token(self):
+ return self._fs_token_
def __dask_tokenize__(self):
return self._fs_token
@@ -142,12 +179,10 @@ class AbstractFileSystem(metaclass=_Cached):
return int(self._fs_token, 16)
def __eq__(self, other):
- return isinstance(other, type(self)
- ) and self._fs_token == other._fs_token
+ return isinstance(other, type(self)) and self._fs_token == other._fs_token
def __reduce__(self):
- return make_instance, (type(self), self.storage_args, self.
- storage_options)
+ return make_instance, (type(self), self.storage_args, self.storage_options)
@classmethod
def _strip_protocol(cls, path):
@@ -155,11 +190,26 @@ class AbstractFileSystem(metaclass=_Cached):
May require FS-specific handling, e.g., for relative paths or links.
"""
- pass
-
- def unstrip_protocol(self, name: str) ->str:
+ if isinstance(path, list):
+ return [cls._strip_protocol(p) for p in path]
+ path = stringify_path(path)
+ protos = (cls.protocol,) if isinstance(cls.protocol, str) else cls.protocol
+ for protocol in protos:
+ if path.startswith(protocol + "://"):
+ path = path[len(protocol) + 3 :]
+ elif path.startswith(protocol + "::"):
+ path = path[len(protocol) + 2 :]
+ path = path.rstrip("/")
+ # use of root_marker to make minimum required path, e.g., "/"
+ return path or cls.root_marker
+
+ def unstrip_protocol(self, name: str) -> str:
"""Format FS-specific path to generic, including protocol"""
- pass
+ protos = (self.protocol,) if isinstance(self.protocol, str) else self.protocol
+ for protocol in protos:
+ if name.startswith(f"{protocol}://"):
+ return name
+ return f"{protos[0]}://{name}"
@staticmethod
def _get_kwargs_from_urls(path):
@@ -171,7 +221,8 @@ class AbstractFileSystem(metaclass=_Cached):
Examples may look like an sftp path "sftp://user@host:/my/path", where
the user and host should become kwargs and later get stripped.
"""
- pass
+ # by default, nothing happens
+ return {}
@classmethod
def current(cls):
@@ -179,7 +230,9 @@ class AbstractFileSystem(metaclass=_Cached):
If no instance has been created, then create one with defaults
"""
- pass
+ if cls._latest in cls._cache:
+ return cls._cache[cls._latest]
+ return cls()
@property
def transaction(self):
@@ -188,15 +241,24 @@ class AbstractFileSystem(metaclass=_Cached):
Requires the file class to implement `.commit()` and `.discard()`
for the normal and exception cases.
"""
- pass
+ if self._transaction is None:
+ self._transaction = self.transaction_type(self)
+ return self._transaction
def start_transaction(self):
"""Begin write transaction for deferring files, non-context version"""
- pass
+ self._intrans = True
+ self._transaction = self.transaction_type(self)
+ return self.transaction
def end_transaction(self):
"""Finish write transaction, non-context version"""
- pass
+ self.transaction.complete()
+ self._transaction = None
+ # The invalid cache must be cleared after the transaction is completed.
+ for path in self._invalidated_caches_in_transaction:
+ self.invalidate_cache(path)
+ self._invalidated_caches_in_transaction.clear()
def invalidate_cache(self, path=None):
"""
@@ -208,7 +270,12 @@ class AbstractFileSystem(metaclass=_Cached):
If None, clear all listings cached else listings at or under given
path.
"""
- pass
+        # Not necessary to implement an invalidation mechanism; there may be no cache.
+        # But if there is one, subclasses should call this parent-class method
+        # so that caches are expired correctly after transactions.
+ # See the implementation of FTPFileSystem in ftp.py
+ if self._intrans:
+ self._invalidated_caches_in_transaction.append(path)
def mkdir(self, path, create_parents=True, **kwargs):
"""
@@ -226,7 +293,7 @@ class AbstractFileSystem(metaclass=_Cached):
kwargs:
may be permissions, etc.
"""
- pass
+ pass # not necessary to implement, may not have directories
def makedirs(self, path, exist_ok=False):
"""Recursively make directories
@@ -242,11 +309,11 @@ class AbstractFileSystem(metaclass=_Cached):
exist_ok: bool (False)
If False, will error if the target already exists
"""
- pass
+ pass # not necessary to implement, may not have directories
def rmdir(self, path):
"""Remove a directory, if empty"""
- pass
+ pass # not necessary to implement, may not have directories
def ls(self, path, detail=True, **kwargs):
"""List objects at path.
@@ -287,7 +354,7 @@ class AbstractFileSystem(metaclass=_Cached):
List of strings if detail is False, or list of directory information
dicts if detail is True.
"""
- pass
+ raise NotImplementedError
def _ls_from_cache(self, path):
"""Check cache for listing
@@ -295,10 +362,26 @@ class AbstractFileSystem(metaclass=_Cached):
Returns listing, if found (may be empty list for a directory that exists
but contains nothing), None if not in cache.
"""
- pass
-
- def walk(self, path, maxdepth=None, topdown=True, on_error='omit', **kwargs
- ):
+ parent = self._parent(path)
+ try:
+ return self.dircache[path.rstrip("/")]
+ except KeyError:
+ pass
+ try:
+ files = [
+ f
+ for f in self.dircache[parent]
+ if f["name"] == path
+ or (f["name"] == path.rstrip("/") and f["type"] == "directory")
+ ]
+ if len(files) == 0:
+ # parent dir was listed but did not contain this file
+ raise FileNotFoundError(path)
+ return files
+ except KeyError:
+ pass
+
+ def walk(self, path, maxdepth=None, topdown=True, on_error="omit", **kwargs):
"""Return all files belows path
List all files, recursing into subdirectories; output is iterator-style,
@@ -331,10 +414,70 @@ class AbstractFileSystem(metaclass=_Cached):
if callable, it will be called with a single OSError instance as argument
kwargs: passed to ``ls``
"""
- pass
-
- def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs
- ):
+ if maxdepth is not None and maxdepth < 1:
+ raise ValueError("maxdepth must be at least 1")
+
+ path = self._strip_protocol(path)
+ full_dirs = {}
+ dirs = {}
+ files = {}
+
+ detail = kwargs.pop("detail", False)
+ try:
+ listing = self.ls(path, detail=True, **kwargs)
+ except (FileNotFoundError, OSError) as e:
+ if on_error == "raise":
+ raise
+ elif callable(on_error):
+ on_error(e)
+ if detail:
+ return path, {}, {}
+ return path, [], []
+
+ for info in listing:
+ # each info name must be at least [path]/part , but here
+ # we check also for names like [path]/part/
+ pathname = info["name"].rstrip("/")
+ name = pathname.rsplit("/", 1)[-1]
+ if info["type"] == "directory" and pathname != path:
+ # do not include "self" path
+ full_dirs[name] = pathname
+ dirs[name] = info
+ elif pathname == path:
+                # file-like with same name as the given path
+ files[""] = info
+ else:
+ files[name] = info
+
+ if not detail:
+ dirs = list(dirs)
+ files = list(files)
+
+ if topdown:
+ # Yield before recursion if walking top down
+ yield path, dirs, files
+
+ if maxdepth is not None:
+ maxdepth -= 1
+ if maxdepth < 1:
+ if not topdown:
+ yield path, dirs, files
+ return
+
+ for d in dirs:
+ yield from self.walk(
+ full_dirs[d],
+ maxdepth=maxdepth,
+ detail=detail,
+ topdown=topdown,
+ **kwargs,
+ )
+
+ if not topdown:
+ # Yield after recursion if walking bottom up
+ yield path, dirs, files
+
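
A minimal sketch of the restored `walk()` on the in-memory filesystem (the `/walk-demo` paths are made up for illustration):

```python
import fsspec

fs = fsspec.filesystem("memory")
fs.makedirs("/walk-demo/sub", exist_ok=True)
fs.pipe("/walk-demo/a.txt", b"a")
fs.pipe("/walk-demo/sub/b.txt", b"b")

# Top-down traversal: each yield is (root, dirnames, filenames)
for root, dirs, files in fs.walk("/walk-demo"):
    print(root, dirs, files)
# /walk-demo ['sub'] ['a.txt']
# /walk-demo/sub [] ['b.txt']
```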
+ def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
"""List all files below path.
Like posix ``find`` command without conditions
@@ -349,7 +492,28 @@ class AbstractFileSystem(metaclass=_Cached):
when used by glob, but users usually only want files.
kwargs are passed to ``ls``.
"""
- pass
+ # TODO: allow equivalent of -name parameter
+ path = self._strip_protocol(path)
+ out = {}
+
+ # Add the root directory if withdirs is requested
+ # This is needed for posix glob compliance
+ if withdirs and path != "" and self.isdir(path):
+ out[path] = self.info(path)
+
+ for _, dirs, files in self.walk(path, maxdepth, detail=True, **kwargs):
+ if withdirs:
+ files.update(dirs)
+ out.update({info["name"]: info for name, info in files.items()})
+ if not out and self.isfile(path):
+ # walk works on directories, but find should also return [path]
+ # when path happens to be a file
+ out[path] = {}
+ names = sorted(out)
+ if not detail:
+ return names
+ else:
+ return {name: out[name] for name in names}
def du(self, path, total=True, maxdepth=None, withdirs=False, **kwargs):
"""Space used by files and optionally directories within a path
@@ -372,7 +536,18 @@ class AbstractFileSystem(metaclass=_Cached):
Dict of {path: size} if total=False, or int otherwise, where numbers
refer to bytes used.
"""
- pass
+ sizes = {}
+ if withdirs and self.isdir(path):
+ # Include top-level directory in output
+ info = self.info(path)
+ sizes[info["name"]] = info["size"]
+ for f in self.find(path, maxdepth=maxdepth, withdirs=withdirs, **kwargs):
+ info = self.info(f)
+ sizes[info["name"]] = info["size"]
+ if total:
+ return sum(sizes.values())
+ else:
+ return sizes
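
`find()` and `du()` build directly on `walk()`; a sketch with made-up `/du-demo` paths:

```python
import fsspec

fs = fsspec.filesystem("memory")
fs.makedirs("/du-demo/sub", exist_ok=True)
fs.pipe("/du-demo/a.bin", b"12345")        # 5 bytes
fs.pipe("/du-demo/sub/b.bin", b"1234567")  # 7 bytes

print(fs.find("/du-demo"))                 # ['/du-demo/a.bin', '/du-demo/sub/b.bin']
print(fs.find("/du-demo", withdirs=True))  # directories included as well
print(fs.du("/du-demo"))                   # 12 (total bytes)
print(fs.du("/du-demo", total=False))      # {path: size} mapping
```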
def glob(self, path, maxdepth=None, **kwargs):
"""
@@ -387,16 +562,87 @@ class AbstractFileSystem(metaclass=_Cached):
kwargs are passed to ``ls``.
"""
- pass
+ if maxdepth is not None and maxdepth < 1:
+ raise ValueError("maxdepth must be at least 1")
+
+ import re
+
+ seps = (os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)
+ ends_with_sep = path.endswith(seps) # _strip_protocol strips trailing slash
+ path = self._strip_protocol(path)
+ append_slash_to_dirname = ends_with_sep or path.endswith(
+ tuple(sep + "**" for sep in seps)
+ )
+ idx_star = path.find("*") if path.find("*") >= 0 else len(path)
+ idx_qmark = path.find("?") if path.find("?") >= 0 else len(path)
+ idx_brace = path.find("[") if path.find("[") >= 0 else len(path)
+
+ min_idx = min(idx_star, idx_qmark, idx_brace)
+
+ detail = kwargs.pop("detail", False)
+
+ if not has_magic(path):
+ if self.exists(path, **kwargs):
+ if not detail:
+ return [path]
+ else:
+ return {path: self.info(path, **kwargs)}
+ else:
+ if not detail:
+ return [] # glob of non-existent returns empty
+ else:
+ return {}
+ elif "/" in path[:min_idx]:
+ min_idx = path[:min_idx].rindex("/")
+ root = path[: min_idx + 1]
+ depth = path[min_idx + 1 :].count("/") + 1
+ else:
+ root = ""
+ depth = path[min_idx + 1 :].count("/") + 1
+
+ if "**" in path:
+ if maxdepth is not None:
+ idx_double_stars = path.find("**")
+ depth_double_stars = path[idx_double_stars:].count("/") + 1
+ depth = depth - depth_double_stars + maxdepth
+ else:
+ depth = None
+
+ allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)
+
+ pattern = glob_translate(path + ("/" if ends_with_sep else ""))
+ pattern = re.compile(pattern)
+
+ out = {
+ p: info
+ for p, info in sorted(allpaths.items())
+ if pattern.match(
+ (
+ p + "/"
+ if append_slash_to_dirname and info["type"] == "directory"
+ else p
+ )
+ )
+ }
+
+ if detail:
+ return out
+ else:
+ return list(out)
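
A sketch of the resulting `glob()` behaviour (paths invented for illustration; the `**` result assumes the recent zero-or-more-directories semantics of `glob_translate`):

```python
import fsspec

fs = fsspec.filesystem("memory")
fs.makedirs("/glob-demo/sub", exist_ok=True)
fs.pipe("/glob-demo/a.csv", b"a")
fs.pipe("/glob-demo/sub/b.csv", b"b")
fs.pipe("/glob-demo/sub/c.txt", b"c")

print(fs.glob("/glob-demo/*.csv"))          # ['/glob-demo/a.csv']
print(fs.glob("/glob-demo/**/*.csv"))       # matches .csv files at any depth
print(fs.glob("/glob-demo/*", detail=True)) # dict of {path: info}
```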
def exists(self, path, **kwargs):
"""Is there a file at the given path"""
- pass
+ try:
+ self.info(path, **kwargs)
+ return True
+ except: # noqa: E722
+ # any exception allowed bar FileNotFoundError?
+ return False
def lexists(self, path, **kwargs):
"""If there is a file at the given path (including
broken links)"""
- pass
+ return self.exists(path)
def info(self, path, **kwargs):
"""Give details of entry at path
@@ -415,7 +661,22 @@ class AbstractFileSystem(metaclass=_Cached):
dict with keys: name (full path in the FS), size (in bytes), type (file,
directory, or something else) and other FS-specific keys.
"""
- pass
+ path = self._strip_protocol(path)
+ out = self.ls(self._parent(path), detail=True, **kwargs)
+ out = [o for o in out if o["name"].rstrip("/") == path]
+ if out:
+ return out[0]
+ out = self.ls(path, detail=True, **kwargs)
+ path = path.rstrip("/")
+ out1 = [o for o in out if o["name"].rstrip("/") == path]
+ if len(out1) == 1:
+ if "size" not in out1[0]:
+ out1[0]["size"] = None
+ return out1[0]
+ elif len(out1) > 1 or out:
+ return {"name": path, "size": 0, "type": "directory"}
+ else:
+ raise FileNotFoundError(path)
def checksum(self, path):
"""Unique value for current version of file
@@ -428,26 +689,31 @@ class AbstractFileSystem(metaclass=_Cached):
creation/modification timestamp (which would be good) or maybe
access timestamp (which would be bad)
"""
- pass
+ return int(tokenize(self.info(path)), 16)
def size(self, path):
"""Size in bytes of file"""
- pass
+ return self.info(path).get("size", None)
def sizes(self, paths):
"""Size in bytes of each file in a list of paths"""
- pass
+ return [self.size(p) for p in paths]
def isdir(self, path):
"""Is this entry directory-like?"""
- pass
+ try:
+ return self.info(path)["type"] == "directory"
+ except OSError:
+ return False
def isfile(self, path):
"""Is this entry file-like?"""
- pass
+ try:
+ return self.info(path)["type"] == "file"
+ except: # noqa: E722
+ return False
- def read_text(self, path, encoding=None, errors=None, newline=None, **
- kwargs):
+ def read_text(self, path, encoding=None, errors=None, newline=None, **kwargs):
"""Get the contents of the file as a string.
Parameters
@@ -456,10 +722,19 @@ class AbstractFileSystem(metaclass=_Cached):
URL of file on this filesystems
encoding, errors, newline: same as `open`.
"""
- pass
-
- def write_text(self, path, value, encoding=None, errors=None, newline=
- None, **kwargs):
+ with self.open(
+ path,
+ mode="r",
+ encoding=encoding,
+ errors=errors,
+ newline=newline,
+ **kwargs,
+ ) as f:
+ return f.read()
+
+ def write_text(
+ self, path, value, encoding=None, errors=None, newline=None, **kwargs
+ ):
"""Write the text to the given file.
An existing file will be overwritten.
@@ -472,7 +747,15 @@ class AbstractFileSystem(metaclass=_Cached):
Text to write.
encoding, errors, newline: same as `open`.
"""
- pass
+ with self.open(
+ path,
+ mode="w",
+ encoding=encoding,
+ errors=errors,
+ newline=newline,
+ **kwargs,
+ ) as f:
+ return f.write(value)
def cat_file(self, path, start=None, end=None, **kwargs):
"""Get the content of a file
@@ -486,11 +769,23 @@ class AbstractFileSystem(metaclass=_Cached):
end of file, respectively
kwargs: passed to ``open()``.
"""
- pass
+ # explicitly set buffering off?
+ with self.open(path, "rb", **kwargs) as f:
+ if start is not None:
+ if start >= 0:
+ f.seek(start)
+ else:
+ f.seek(max(0, f.size + start))
+ if end is not None:
+ if end < 0:
+ end = f.size + end
+ return f.read(end - f.tell())
+ return f.read()
def pipe_file(self, path, value, **kwargs):
"""Set the bytes of given file"""
- pass
+ with self.open(path, "wb", **kwargs) as f:
+ f.write(value)
def pipe(self, path, value=None, **kwargs):
"""Put value into path
@@ -506,10 +801,17 @@ class AbstractFileSystem(metaclass=_Cached):
If using a single path, these are the bytes to put there. Ignored if
``path`` is a dict
"""
- pass
+ if isinstance(path, str):
+ self.pipe_file(self._strip_protocol(path), value, **kwargs)
+ elif isinstance(path, dict):
+ for k, v in path.items():
+ self.pipe_file(self._strip_protocol(k), v, **kwargs)
+ else:
+ raise ValueError("path must be str or dict")
- def cat_ranges(self, paths, starts, ends, max_gap=None, on_error=
- 'return', **kwargs):
+ def cat_ranges(
+ self, paths, starts, ends, max_gap=None, on_error="return", **kwargs
+ ):
"""Get the contents of byte ranges from one or more files
Parameters
@@ -520,9 +822,28 @@ class AbstractFileSystem(metaclass=_Cached):
Bytes limits of the read. If using a single int, the same value will be
used to read all the specified files.
"""
- pass
-
- def cat(self, path, recursive=False, on_error='raise', **kwargs):
+ if max_gap is not None:
+ raise NotImplementedError
+ if not isinstance(paths, list):
+ raise TypeError
+ if not isinstance(starts, list):
+ starts = [starts] * len(paths)
+ if not isinstance(ends, list):
+ ends = [ends] * len(paths)
+ if len(starts) != len(paths) or len(ends) != len(paths):
+ raise ValueError
+ out = []
+ for p, s, e in zip(paths, starts, ends):
+ try:
+ out.append(self.cat_file(p, s, e))
+ except Exception as e:
+ if on_error == "return":
+ out.append(e)
+ else:
+ raise
+ return out
+
+ def cat(self, path, recursive=False, on_error="raise", **kwargs):
"""Fetch (potentially multiple) paths' contents
Parameters
@@ -543,15 +864,64 @@ class AbstractFileSystem(metaclass=_Cached):
dict of {path: contents} if there are multiple paths
or the path has been otherwise expanded
"""
- pass
+ paths = self.expand_path(path, recursive=recursive)
+ if (
+ len(paths) > 1
+ or isinstance(path, list)
+ or paths[0] != self._strip_protocol(path)
+ ):
+ out = {}
+ for path in paths:
+ try:
+ out[path] = self.cat_file(path, **kwargs)
+ except Exception as e:
+ if on_error == "raise":
+ raise
+ if on_error == "return":
+ out[path] = e
+ return out
+ else:
+ return self.cat_file(paths[0], **kwargs)
- def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, outfile=
- None, **kwargs):
+ def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, outfile=None, **kwargs):
"""Copy single remote file to local"""
- pass
-
- def get(self, rpath, lpath, recursive=False, callback=DEFAULT_CALLBACK,
- maxdepth=None, **kwargs):
+ from .implementations.local import LocalFileSystem
+
+ if isfilelike(lpath):
+ outfile = lpath
+ elif self.isdir(rpath):
+ os.makedirs(lpath, exist_ok=True)
+ return None
+
+ fs = LocalFileSystem(auto_mkdir=True)
+ fs.makedirs(fs._parent(lpath), exist_ok=True)
+
+ with self.open(rpath, "rb", **kwargs) as f1:
+ if outfile is None:
+ outfile = open(lpath, "wb")
+
+ try:
+ callback.set_size(getattr(f1, "size", None))
+ data = True
+ while data:
+ data = f1.read(self.blocksize)
+ segment_len = outfile.write(data)
+ if segment_len is None:
+ segment_len = len(data)
+ callback.relative_update(segment_len)
+ finally:
+ if not isfilelike(lpath):
+ outfile.close()
+
+ def get(
+ self,
+ rpath,
+ lpath,
+ recursive=False,
+ callback=DEFAULT_CALLBACK,
+ maxdepth=None,
+ **kwargs,
+ ):
"""Copy file(s) to local.
Copies a specific file or tree of files (if recursive=True). If lpath
@@ -561,14 +931,79 @@ class AbstractFileSystem(metaclass=_Cached):
Calls get_file for each source.
"""
- pass
+ if isinstance(lpath, list) and isinstance(rpath, list):
+ # No need to expand paths when both source and destination
+ # are provided as lists
+ rpaths = rpath
+ lpaths = lpath
+ else:
+ from .implementations.local import (
+ LocalFileSystem,
+ make_path_posix,
+ trailing_sep,
+ )
+
+ source_is_str = isinstance(rpath, str)
+ rpaths = self.expand_path(rpath, recursive=recursive, maxdepth=maxdepth)
+ if source_is_str and (not recursive or maxdepth is not None):
+ # Non-recursive glob does not copy directories
+ rpaths = [p for p in rpaths if not (trailing_sep(p) or self.isdir(p))]
+ if not rpaths:
+ return
+
+ if isinstance(lpath, str):
+ lpath = make_path_posix(lpath)
+
+ source_is_file = len(rpaths) == 1
+ dest_is_dir = isinstance(lpath, str) and (
+ trailing_sep(lpath) or LocalFileSystem().isdir(lpath)
+ )
+
+ exists = source_is_str and (
+ (has_magic(rpath) and source_is_file)
+ or (not has_magic(rpath) and dest_is_dir and not trailing_sep(rpath))
+ )
+ lpaths = other_paths(
+ rpaths,
+ lpath,
+ exists=exists,
+ flatten=not source_is_str,
+ )
+
+ callback.set_size(len(lpaths))
+ for lpath, rpath in callback.wrap(zip(lpaths, rpaths)):
+ with callback.branched(rpath, lpath) as child:
+ self.get_file(rpath, lpath, callback=child, **kwargs)
def put_file(self, lpath, rpath, callback=DEFAULT_CALLBACK, **kwargs):
"""Copy single file to remote"""
- pass
-
- def put(self, lpath, rpath, recursive=False, callback=DEFAULT_CALLBACK,
- maxdepth=None, **kwargs):
+ if os.path.isdir(lpath):
+ self.makedirs(rpath, exist_ok=True)
+ return None
+
+ with open(lpath, "rb") as f1:
+ size = f1.seek(0, 2)
+ callback.set_size(size)
+ f1.seek(0)
+
+ self.mkdirs(self._parent(os.fspath(rpath)), exist_ok=True)
+ with self.open(rpath, "wb", **kwargs) as f2:
+ while f1.tell() < size:
+ data = f1.read(self.blocksize)
+ segment_len = f2.write(data)
+ if segment_len is None:
+ segment_len = len(data)
+ callback.relative_update(segment_len)
+
+ def put(
+ self,
+ lpath,
+ rpath,
+ recursive=False,
+ callback=DEFAULT_CALLBACK,
+ maxdepth=None,
+ **kwargs,
+ ):
"""Copy file(s) from local.
Copies a specific file or tree of files (if recursive=True). If rpath
@@ -577,18 +1012,72 @@ class AbstractFileSystem(metaclass=_Cached):
Calls put_file for each source.
"""
- pass
+ if isinstance(lpath, list) and isinstance(rpath, list):
+ # No need to expand paths when both source and destination
+ # are provided as lists
+ rpaths = rpath
+ lpaths = lpath
+ else:
+ from .implementations.local import (
+ LocalFileSystem,
+ make_path_posix,
+ trailing_sep,
+ )
+
+ source_is_str = isinstance(lpath, str)
+ if source_is_str:
+ lpath = make_path_posix(lpath)
+ fs = LocalFileSystem()
+ lpaths = fs.expand_path(lpath, recursive=recursive, maxdepth=maxdepth)
+ if source_is_str and (not recursive or maxdepth is not None):
+ # Non-recursive glob does not copy directories
+ lpaths = [p for p in lpaths if not (trailing_sep(p) or fs.isdir(p))]
+ if not lpaths:
+ return
+
+ source_is_file = len(lpaths) == 1
+ dest_is_dir = isinstance(rpath, str) and (
+ trailing_sep(rpath) or self.isdir(rpath)
+ )
+
+ rpath = (
+ self._strip_protocol(rpath)
+ if isinstance(rpath, str)
+ else [self._strip_protocol(p) for p in rpath]
+ )
+ exists = source_is_str and (
+ (has_magic(lpath) and source_is_file)
+ or (not has_magic(lpath) and dest_is_dir and not trailing_sep(lpath))
+ )
+ rpaths = other_paths(
+ lpaths,
+ rpath,
+ exists=exists,
+ flatten=not source_is_str,
+ )
+
+ callback.set_size(len(rpaths))
+ for lpath, rpath in callback.wrap(zip(lpaths, rpaths)):
+ with callback.branched(lpath, rpath) as child:
+ self.put_file(lpath, rpath, callback=child, **kwargs)
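
A put/get round trip between a local temporary directory and the in-memory filesystem, as a sketch (file names invented):

```python
import os
import tempfile

import fsspec

local_dir = tempfile.mkdtemp()
src = os.path.join(local_dir, "src.txt")
with open(src, "wb") as f:
    f.write(b"payload")

mem = fsspec.filesystem("memory")
mem.makedirs("/putget-demo", exist_ok=True)

# Upload the local file, then download it again under a new name
mem.put(src, "/putget-demo/remote.txt")
assert mem.cat_file("/putget-demo/remote.txt") == b"payload"

dst = os.path.join(local_dir, "roundtrip.txt")
mem.get("/putget-demo/remote.txt", dst)
assert open(dst, "rb").read() == b"payload"
```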
def head(self, path, size=1024):
"""Get the first ``size`` bytes from file"""
- pass
+ with self.open(path, "rb") as f:
+ return f.read(size)
def tail(self, path, size=1024):
"""Get the last ``size`` bytes from file"""
- pass
+ with self.open(path, "rb") as f:
+ f.seek(max(-size, -f.size), 2)
+ return f.read()
+
+ def cp_file(self, path1, path2, **kwargs):
+ raise NotImplementedError
- def copy(self, path1, path2, recursive=False, maxdepth=None, on_error=
- None, **kwargs):
+ def copy(
+ self, path1, path2, recursive=False, maxdepth=None, on_error=None, **kwargs
+ ):
"""Copy within two locations in the filesystem
on_error : "raise", "ignore"
@@ -596,7 +1085,49 @@ class AbstractFileSystem(metaclass=_Cached):
not-found exceptions will cause the path to be skipped; defaults to
raise unless recursive is true, where the default is ignore
"""
- pass
+ if on_error is None and recursive:
+ on_error = "ignore"
+ elif on_error is None:
+ on_error = "raise"
+
+ if isinstance(path1, list) and isinstance(path2, list):
+ # No need to expand paths when both source and destination
+ # are provided as lists
+ paths1 = path1
+ paths2 = path2
+ else:
+ from .implementations.local import trailing_sep
+
+ source_is_str = isinstance(path1, str)
+ paths1 = self.expand_path(path1, recursive=recursive, maxdepth=maxdepth)
+ if source_is_str and (not recursive or maxdepth is not None):
+ # Non-recursive glob does not copy directories
+ paths1 = [p for p in paths1 if not (trailing_sep(p) or self.isdir(p))]
+ if not paths1:
+ return
+
+ source_is_file = len(paths1) == 1
+ dest_is_dir = isinstance(path2, str) and (
+ trailing_sep(path2) or self.isdir(path2)
+ )
+
+ exists = source_is_str and (
+ (has_magic(path1) and source_is_file)
+ or (not has_magic(path1) and dest_is_dir and not trailing_sep(path1))
+ )
+ paths2 = other_paths(
+ paths1,
+ path2,
+ exists=exists,
+ flatten=not source_is_str,
+ )
+
+ for p1, p2 in zip(paths1, paths2):
+ try:
+ self.cp_file(p1, p2, **kwargs)
+ except FileNotFoundError:
+ if on_error == "raise":
+ raise
def expand_path(self, path, recursive=False, maxdepth=None, **kwargs):
"""Turn one or more globs or directories into a list of all matching paths
@@ -604,19 +1135,67 @@ class AbstractFileSystem(metaclass=_Cached):
kwargs are passed to ``glob`` or ``find``, which may in turn call ``ls``
"""
- pass
+
+ if maxdepth is not None and maxdepth < 1:
+ raise ValueError("maxdepth must be at least 1")
+
+ if isinstance(path, (str, os.PathLike)):
+ out = self.expand_path([path], recursive, maxdepth)
+ else:
+ out = set()
+ path = [self._strip_protocol(p) for p in path]
+ for p in path:
+ if has_magic(p):
+ bit = set(self.glob(p, maxdepth=maxdepth, **kwargs))
+ out |= bit
+ if recursive:
+ # glob call above expanded one depth so if maxdepth is defined
+ # then decrement it in expand_path call below. If it is zero
+ # after decrementing then avoid expand_path call.
+ if maxdepth is not None and maxdepth <= 1:
+ continue
+ out |= set(
+ self.expand_path(
+ list(bit),
+ recursive=recursive,
+ maxdepth=maxdepth - 1 if maxdepth is not None else None,
+ **kwargs,
+ )
+ )
+ continue
+ elif recursive:
+ rec = set(
+ self.find(
+ p, maxdepth=maxdepth, withdirs=True, detail=False, **kwargs
+ )
+ )
+ out |= rec
+ if p not in out and (recursive is False or self.exists(p)):
+ # should only check once, for the root
+ out.add(p)
+ if not out:
+ raise FileNotFoundError(path)
+ return sorted(out)
def mv(self, path1, path2, recursive=False, maxdepth=None, **kwargs):
"""Move file(s) from one location to another"""
- pass
+ if path1 == path2:
+ logger.debug("%s mv: The paths are the same, so no files were moved.", self)
+ else:
+ # explicitly raise exception to prevent data corruption
+ self.copy(
+ path1, path2, recursive=recursive, maxdepth=maxdepth, onerror="raise"
+ )
+ self.rm(path1, recursive=recursive)
def rm_file(self, path):
"""Delete a file"""
- pass
+ self._rm(path)
def _rm(self, path):
"""Delete one file"""
- pass
+ # this is the old name for the method, prefer rm_file
+ raise NotImplementedError
def rm(self, path, recursive=False, maxdepth=None):
"""Delete files.
@@ -633,15 +1212,48 @@ class AbstractFileSystem(metaclass=_Cached):
If None, there will be no limit and infinite recursion may be
possible.
"""
- pass
+ path = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
+ for p in reversed(path):
+ self.rm_file(p)
- def _open(self, path, mode='rb', block_size=None, autocommit=True,
- cache_options=None, **kwargs):
+ @classmethod
+ def _parent(cls, path):
+ path = cls._strip_protocol(path)
+ if "/" in path:
+ parent = path.rsplit("/", 1)[0].lstrip(cls.root_marker)
+ return cls.root_marker + parent
+ else:
+ return cls.root_marker
+
+ def _open(
+ self,
+ path,
+ mode="rb",
+ block_size=None,
+ autocommit=True,
+ cache_options=None,
+ **kwargs,
+ ):
"""Return raw bytes-mode file-like from the file-system"""
- pass
-
- def open(self, path, mode='rb', block_size=None, cache_options=None,
- compression=None, **kwargs):
+ return AbstractBufferedFile(
+ self,
+ path,
+ mode,
+ block_size,
+ autocommit,
+ cache_options=cache_options,
+ **kwargs,
+ )
+
+ def open(
+ self,
+ path,
+ mode="rb",
+ block_size=None,
+ cache_options=None,
+ compression=None,
+ **kwargs,
+ ):
"""
Return a file-like object from the filesystem
@@ -664,7 +1276,49 @@ class AbstractFileSystem(metaclass=_Cached):
compression from the filename suffix.
encoding, errors, newline: passed on to TextIOWrapper for text mode
"""
- pass
+ import io
+
+ path = self._strip_protocol(path)
+ if "b" not in mode:
+ mode = mode.replace("t", "") + "b"
+
+ text_kwargs = {
+ k: kwargs.pop(k)
+ for k in ["encoding", "errors", "newline"]
+ if k in kwargs
+ }
+ return io.TextIOWrapper(
+ self.open(
+ path,
+ mode,
+ block_size=block_size,
+ cache_options=cache_options,
+ compression=compression,
+ **kwargs,
+ ),
+ **text_kwargs,
+ )
+ else:
+ ac = kwargs.pop("autocommit", not self._intrans)
+ f = self._open(
+ path,
+ mode=mode,
+ block_size=block_size,
+ autocommit=ac,
+ cache_options=cache_options,
+ **kwargs,
+ )
+ if compression is not None:
+ from fsspec.compression import compr
+ from fsspec.core import get_compression
+
+ compression = get_compression(path, compression)
+ compress = compr[compression]
+ f = compress(f, mode=mode[0])
+
+ if not ac and "r" not in mode:
+ self.transaction.files.append(f)
+ return f
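
Text modes are handled by wrapping the binary `_open()` result in a `TextIOWrapper`, as the branch above shows; a sketch (path invented):

```python
import fsspec

fs = fsspec.filesystem("memory")
fs.makedirs("/open-demo", exist_ok=True)

with fs.open("/open-demo/hello.txt", "w", encoding="utf8") as f:
    f.write("hello world\n")

with fs.open("/open-demo/hello.txt", "r", encoding="utf8") as f:
    print(f.read())  # 'hello world\n'
```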
def touch(self, path, truncate=True, **kwargs):
"""Create empty file, or update timestamp
@@ -677,11 +1331,15 @@ class AbstractFileSystem(metaclass=_Cached):
If True, always set file size to 0; if False, update timestamp and
leave file unchanged, if backend allows this
"""
- pass
+ if truncate or not self.exists(path):
+ with self.open(path, "wb", **kwargs):
+ pass
+ else:
+ raise NotImplementedError # update timestamp, if possible
def ukey(self, path):
"""Hash of file properties, to tell if it has changed"""
- pass
+ return sha256(str(self.info(path)).encode()).hexdigest()
def read_block(self, fn, offset, length, delimiter=None):
"""Read a block of bytes from
@@ -720,9 +1378,15 @@ class AbstractFileSystem(metaclass=_Cached):
--------
:func:`fsspec.utils.read_block`
"""
- pass
-
- def to_json(self, *, include_password: bool=True) ->str:
+ with self.open(fn, "rb") as f:
+ size = f.size
+ if length is None:
+ length = size
+ if size is not None and offset + length > size:
+ length = size - offset
+ return read_block(f, offset, length, delimiter)
+
+ def to_json(self, *, include_password: bool = True) -> str:
"""
JSON representation of this filesystem instance.
@@ -744,10 +1408,19 @@ class AbstractFileSystem(metaclass=_Cached):
passed to the constructor, such as passwords and tokens. Make sure you
store and send them in a secure environment!
"""
- pass
+ from .json import FilesystemJSONEncoder
+
+ return json.dumps(
+ self,
+ cls=type(
+ "_FilesystemJSONEncoder",
+ (FilesystemJSONEncoder,),
+ {"include_password": include_password},
+ ),
+ )
@staticmethod
- def from_json(blob: str) ->AbstractFileSystem:
+ def from_json(blob: str) -> AbstractFileSystem:
"""
Recreate a filesystem instance from JSON representation.
@@ -767,9 +1440,11 @@ class AbstractFileSystem(metaclass=_Cached):
Make sure you haven't installed any modules that may execute malicious code
at import time.
"""
- pass
+ from .json import FilesystemJSONDecoder
+
+ return json.loads(blob, cls=FilesystemJSONDecoder)
- def to_dict(self, *, include_password: bool=True) ->Dict[str, Any]:
+ def to_dict(self, *, include_password: bool = True) -> Dict[str, Any]:
"""
JSON-serializable dictionary representation of this filesystem instance.
@@ -791,10 +1466,26 @@ class AbstractFileSystem(metaclass=_Cached):
passed to the constructor, such as passwords and tokens. Make sure you
store and send them in a secure environment!
"""
- pass
+ from .json import FilesystemJSONEncoder
+
+ json_encoder = FilesystemJSONEncoder()
+
+ cls = type(self)
+ proto = self.protocol
+
+ storage_options = dict(self.storage_options)
+ if not include_password:
+ storage_options.pop("password", None)
+
+ return dict(
+ cls=f"{cls.__module__}:{cls.__name__}",
+ protocol=proto[0] if isinstance(proto, (tuple, list)) else proto,
+ args=json_encoder.make_serializable(self.storage_args),
+ **json_encoder.make_serializable(storage_options),
+ )
@staticmethod
- def from_dict(dct: Dict[str, Any]) ->AbstractFileSystem:
+ def from_dict(dct: Dict[str, Any]) -> AbstractFileSystem:
"""
Recreate a filesystem instance from dictionary representation.
@@ -814,22 +1505,46 @@ class AbstractFileSystem(metaclass=_Cached):
Make sure you haven't installed any modules that may execute malicious code
at import time.
"""
- pass
+ from .json import FilesystemJSONDecoder
+
+ json_decoder = FilesystemJSONDecoder()
+
+ dct = dict(dct) # Defensive copy
+
+ cls = FilesystemJSONDecoder.try_resolve_fs_cls(dct)
+ if cls is None:
+ raise ValueError("Not a serialized AbstractFileSystem")
+
+ dct.pop("cls", None)
+ dct.pop("protocol", None)
+
+ return cls(
+ *json_decoder.unmake_serializable(dct.pop("args", ())),
+ **json_decoder.unmake_serializable(dct),
+ )
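
The JSON round trip should reproduce an equivalent (and, thanks to instance caching, usually identical) filesystem; a sketch:

```python
import fsspec
from fsspec.spec import AbstractFileSystem

fs = fsspec.filesystem("memory")
blob = fs.to_json()
print(blob)  # roughly {"cls": "fsspec.implementations.memory:MemoryFileSystem", "protocol": "memory", "args": []}

fs2 = AbstractFileSystem.from_json(blob)
assert fs2 == fs  # same storage args/options -> same filesystem token
```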
def _get_pyarrow_filesystem(self):
"""
Make a version of the FS instance which will be acceptable to pyarrow
"""
- pass
+ # all instances already also derive from pyarrow
+ return self
- def get_mapper(self, root='', check=False, create=False,
- missing_exceptions=None):
+ def get_mapper(self, root="", check=False, create=False, missing_exceptions=None):
"""Create key/value store based on this file-system
Makes a MutableMapping interface to the FS at the given root path.
See ``fsspec.mapping.FSMap`` for further details.
"""
- pass
+ from .mapping import FSMap
+
+ return FSMap(
+ root,
+ self,
+ check=check,
+ create=create,
+ missing_exceptions=missing_exceptions,
+ )
@classmethod
def clear_instance_cache(cls):
@@ -844,67 +1559,70 @@ class AbstractFileSystem(metaclass=_Cached):
since the instances refcount will not drop to zero until
``clear_instance_cache`` is called.
"""
- pass
+ cls._cache.clear()
def created(self, path):
"""Return the created timestamp of a file as a datetime.datetime"""
- pass
+ raise NotImplementedError
def modified(self, path):
"""Return the modified timestamp of a file as a datetime.datetime"""
- pass
+ raise NotImplementedError
+
+ # ------------------------------------------------------------------------
+ # Aliases
def read_bytes(self, path, start=None, end=None, **kwargs):
"""Alias of `AbstractFileSystem.cat_file`."""
- pass
+ return self.cat_file(path, start=start, end=end, **kwargs)
def write_bytes(self, path, value, **kwargs):
"""Alias of `AbstractFileSystem.pipe_file`."""
- pass
+ self.pipe_file(path, value, **kwargs)
def makedir(self, path, create_parents=True, **kwargs):
"""Alias of `AbstractFileSystem.mkdir`."""
- pass
+ return self.mkdir(path, create_parents=create_parents, **kwargs)
def mkdirs(self, path, exist_ok=False):
"""Alias of `AbstractFileSystem.makedirs`."""
- pass
+ return self.makedirs(path, exist_ok=exist_ok)
def listdir(self, path, detail=True, **kwargs):
"""Alias of `AbstractFileSystem.ls`."""
- pass
+ return self.ls(path, detail=detail, **kwargs)
def cp(self, path1, path2, **kwargs):
"""Alias of `AbstractFileSystem.copy`."""
- pass
+ return self.copy(path1, path2, **kwargs)
def move(self, path1, path2, **kwargs):
"""Alias of `AbstractFileSystem.mv`."""
- pass
+ return self.mv(path1, path2, **kwargs)
def stat(self, path, **kwargs):
"""Alias of `AbstractFileSystem.info`."""
- pass
+ return self.info(path, **kwargs)
def disk_usage(self, path, total=True, maxdepth=None, **kwargs):
"""Alias of `AbstractFileSystem.du`."""
- pass
+ return self.du(path, total=total, maxdepth=maxdepth, **kwargs)
def rename(self, path1, path2, **kwargs):
"""Alias of `AbstractFileSystem.mv`."""
- pass
+ return self.mv(path1, path2, **kwargs)
def delete(self, path, recursive=False, maxdepth=None):
"""Alias of `AbstractFileSystem.rm`."""
- pass
+ return self.rm(path, recursive=recursive, maxdepth=maxdepth)
def upload(self, lpath, rpath, recursive=False, **kwargs):
"""Alias of `AbstractFileSystem.put`."""
- pass
+ return self.put(lpath, rpath, recursive=recursive, **kwargs)
def download(self, rpath, lpath, recursive=False, **kwargs):
"""Alias of `AbstractFileSystem.get`."""
- pass
+ return self.get(rpath, lpath, recursive=recursive, **kwargs)
def sign(self, path, expiration=100, **kwargs):
"""Create a signed URL representing the given path
@@ -928,7 +1646,14 @@ class AbstractFileSystem(metaclass=_Cached):
------
NotImplementedError : if method is not implemented for a filesystem
"""
- pass
+ raise NotImplementedError("Sign is not implemented for this filesystem")
+
+ def _isfilestore(self):
+ # Originally inherited from pyarrow DaskFileSystem. Keeping this
+ # here for backwards compatibility as long as pyarrow uses its
+ # legacy fsspec-compatible filesystems and thus accepts fsspec
+ # filesystems as well
+ return False
class AbstractBufferedFile(io.IOBase):
@@ -939,12 +1664,22 @@ class AbstractBufferedFile(io.IOBase):
methods that need to be overridden are ``_upload_chunk``,
``_initiate_upload`` and ``_fetch_range``.
"""
- DEFAULT_BLOCK_SIZE = 5 * 2 ** 20
+
+ DEFAULT_BLOCK_SIZE = 5 * 2**20
_details = None
- def __init__(self, fs, path, mode='rb', block_size='default',
- autocommit=True, cache_type='readahead', cache_options=None, size=
- None, **kwargs):
+ def __init__(
+ self,
+ fs,
+ path,
+ mode="rb",
+ block_size="default",
+ autocommit=True,
+ cache_type="readahead",
+ cache_options=None,
+ size=None,
+ **kwargs,
+ ):
"""
Template for files with buffered reading and writing
@@ -972,41 +1707,75 @@ class AbstractBufferedFile(io.IOBase):
Gets stored as self.kwargs
"""
from .core import caches
+
self.path = path
self.fs = fs
self.mode = mode
- self.blocksize = self.DEFAULT_BLOCK_SIZE if block_size in ['default',
- None] else block_size
+ self.blocksize = (
+ self.DEFAULT_BLOCK_SIZE if block_size in ["default", None] else block_size
+ )
self.loc = 0
self.autocommit = autocommit
self.end = None
self.start = None
self.closed = False
+
if cache_options is None:
cache_options = {}
- if 'trim' in kwargs:
+
+ if "trim" in kwargs:
warnings.warn(
- "Passing 'trim' to control the cache behavior has been deprecated. Specify it within the 'cache_options' argument instead."
- , FutureWarning)
- cache_options['trim'] = kwargs.pop('trim')
+ "Passing 'trim' to control the cache behavior has been deprecated. "
+ "Specify it within the 'cache_options' argument instead.",
+ FutureWarning,
+ )
+ cache_options["trim"] = kwargs.pop("trim")
+
self.kwargs = kwargs
- if mode not in {'ab', 'rb', 'wb'}:
- raise NotImplementedError('File mode not supported')
- if mode == 'rb':
+
+ if mode not in {"ab", "rb", "wb"}:
+ raise NotImplementedError("File mode not supported")
+ if mode == "rb":
if size is not None:
self.size = size
else:
- self.size = self.details['size']
- self.cache = caches[cache_type](self.blocksize, self.
- _fetch_range, self.size, **cache_options)
+ self.size = self.details["size"]
+ self.cache = caches[cache_type](
+ self.blocksize, self._fetch_range, self.size, **cache_options
+ )
else:
self.buffer = io.BytesIO()
self.offset = None
self.forced = False
self.location = None
+ @property
+ def details(self):
+ if self._details is None:
+ self._details = self.fs.info(self.path)
+ return self._details
+
+ @details.setter
+ def details(self, value):
+ self._details = value
+ self.size = value["size"]
+
+ @property
+ def full_name(self):
+ return _unstrip_protocol(self.path, self.fs)
+
+ @property
+ def closed(self):
+ # get around this attr being read-only in IOBase
+ # use getattr here, since this can be called during del
+ return getattr(self, "_closed", True)
+
+ @closed.setter
+ def closed(self, c):
+ self._closed = c
+
def __hash__(self):
- if 'w' in self.mode:
+ if "w" in self.mode:
return id(self)
else:
return int(tokenize(self.details), 16)
@@ -1015,25 +1784,29 @@ class AbstractBufferedFile(io.IOBase):
"""Files are equal if they have the same checksum, only in read mode"""
if self is other:
return True
- return isinstance(other, type(self)
- ) and self.mode == 'rb' and other.mode == 'rb' and hash(self
- ) == hash(other)
+ return (
+ isinstance(other, type(self))
+ and self.mode == "rb"
+ and other.mode == "rb"
+ and hash(self) == hash(other)
+ )
def commit(self):
"""Move from temp to final destination"""
- pass
def discard(self):
"""Throw away temporary file"""
- pass
def info(self):
"""File information about this path"""
- pass
+ if "r" in self.mode:
+ return self.details
+ else:
+ raise ValueError("Info not available while writing")
def tell(self):
"""Current file location"""
- pass
+ return self.loc
def seek(self, loc, whence=0):
"""Set current file location
@@ -1045,7 +1818,21 @@ class AbstractBufferedFile(io.IOBase):
whence: {0, 1, 2}
from start of file, current location or end of file, resp.
"""
- pass
+ loc = int(loc)
+ if not self.mode == "rb":
+ raise OSError(ESPIPE, "Seek only available in read mode")
+ if whence == 0:
+ nloc = loc
+ elif whence == 1:
+ nloc = self.loc + loc
+ elif whence == 2:
+ nloc = self.size + loc
+ else:
+ raise ValueError(f"invalid whence ({whence}, should be 0, 1 or 2)")
+ if nloc < 0:
+ raise ValueError("Seek before start of file")
+ self.loc = nloc
+ return self.loc
def write(self, data):
"""
@@ -1059,7 +1846,17 @@ class AbstractBufferedFile(io.IOBase):
data: bytes
Set of bytes to be written.
"""
- pass
+ if self.mode not in {"wb", "ab"}:
+ raise ValueError("File not in write mode")
+ if self.closed:
+ raise ValueError("I/O operation on closed file.")
+ if self.forced:
+ raise ValueError("This file has been force-flushed, can only close")
+ out = self.buffer.write(data)
+ self.loc += out
+ if self.buffer.tell() >= self.blocksize:
+ self.flush()
+ return out
def flush(self, force=False):
"""
@@ -1074,7 +1871,34 @@ class AbstractBufferedFile(io.IOBase):
When closing, write the last block even if it is smaller than
blocks are allowed to be. Disallows further writing to this file.
"""
- pass
+
+ if self.closed:
+ raise ValueError("Flush on closed file")
+ if force and self.forced:
+ raise ValueError("Force flush cannot be called more than once")
+ if force:
+ self.forced = True
+
+ if self.mode not in {"wb", "ab"}:
+ # no-op to flush on read-mode
+ return
+
+ if not force and self.buffer.tell() < self.blocksize:
+ # Defer write on small block
+ return
+
+ if self.offset is None:
+ # Initialize a multipart upload
+ self.offset = 0
+ try:
+ self._initiate_upload()
+ except: # noqa: E722
+ self.closed = True
+ raise
+
+ if self._upload_chunk(final=force) is not False:
+ self.offset += self.buffer.seek(0, 2)
+ self.buffer = io.BytesIO()
def _upload_chunk(self, final=False):
"""Write one part of a multi-block file upload
@@ -1085,7 +1909,7 @@ class AbstractBufferedFile(io.IOBase):
This is the last block, so should complete file, if
self.autocommit is True.
"""
- pass
+        # may not yet have been initialized, may need to call _initiate_upload
def _initiate_upload(self):
"""Create remote file/upload"""
@@ -1093,7 +1917,7 @@ class AbstractBufferedFile(io.IOBase):
def _fetch_range(self, start, end):
"""Get the specified set of bytes from remote"""
- pass
+ raise NotImplementedError
def read(self, length=-1):
"""
@@ -1104,16 +1928,39 @@ class AbstractBufferedFile(io.IOBase):
length: int (-1)
Number of bytes to read; if <0, all remaining bytes.
"""
- pass
+ length = -1 if length is None else int(length)
+ if self.mode != "rb":
+ raise ValueError("File not in read mode")
+ if length < 0:
+ length = self.size - self.loc
+ if self.closed:
+ raise ValueError("I/O operation on closed file.")
+ if length == 0:
+ # don't even bother calling fetch
+ return b""
+ out = self.cache._fetch(self.loc, self.loc + length)
+
+ logger.debug(
+ "%s read: %i - %i %s",
+ self,
+ self.loc,
+ self.loc + length,
+ self.cache._log_stats(),
+ )
+ self.loc += len(out)
+ return out
def readinto(self, b):
"""mirrors builtin file's readinto method
https://docs.python.org/3/library/io.html#io.RawIOBase.readinto
"""
- pass
+ out = memoryview(b).cast("B")
+ data = self.read(out.nbytes)
+ out[: len(data)] = data
+ return len(data)
- def readuntil(self, char=b'\n', blocks=None):
+ def readuntil(self, char=b"\n", blocks=None):
"""Return data between current position and first occurrence of char
char is included in the output, except if the end of the tile is
@@ -1127,7 +1974,19 @@ class AbstractBufferedFile(io.IOBase):
How much to read in each go. Defaults to file blocksize - which may
mean a new read on every call.
"""
- pass
+ out = []
+ while True:
+ start = self.tell()
+ part = self.read(blocks or self.blocksize)
+ if len(part) == 0:
+ break
+ found = part.find(char)
+ if found > -1:
+ out.append(part[: found + len(char)])
+ self.seek(start + found + len(char))
+ break
+ out.append(part)
+ return b"".join(out)
def readline(self):
"""Read until first occurrence of newline character
@@ -1135,7 +1994,7 @@ class AbstractBufferedFile(io.IOBase):
Note that, because of character encoding, this is not necessarily a
true line ending.
"""
- pass
+ return self.readuntil(b"\n")
def __next__(self):
out = self.readline()
@@ -1148,33 +2007,58 @@ class AbstractBufferedFile(io.IOBase):
def readlines(self):
"""Return all data, split by the newline character"""
- pass
+ data = self.read()
+ lines = data.split(b"\n")
+ out = [l + b"\n" for l in lines[:-1]]
+ if data.endswith(b"\n"):
+ return out
+ else:
+ return out + [lines[-1]]
+ # return list(self) ???
+
+ def readinto1(self, b):
+ return self.readinto(b)
def close(self):
"""Close file
Finalizes writes, discards cache
"""
- pass
+ if getattr(self, "_unclosable", False):
+ return
+ if self.closed:
+ return
+ if self.mode == "rb":
+ self.cache = None
+ else:
+ if not self.forced:
+ self.flush(force=True)
+
+ if self.fs is not None:
+ self.fs.invalidate_cache(self.path)
+ self.fs.invalidate_cache(self.fs._parent(self.path))
+
+ self.closed = True
def readable(self):
"""Whether opened for reading"""
- pass
+ return self.mode == "rb" and not self.closed
def seekable(self):
"""Whether is seekable (only in read mode)"""
- pass
+ return self.readable()
def writable(self):
"""Whether opened for writing"""
- pass
+ return self.mode in {"wb", "ab"} and not self.closed
def __del__(self):
if not self.closed:
self.close()
def __str__(self):
- return f'<File-like object {type(self.fs).__name__}, {self.path}>'
+ return f"<File-like object {type(self.fs).__name__}, {self.path}>"
+
__repr__ = __str__
def __enter__(self):
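
To exercise the `AbstractBufferedFile` machinery restored above, a read-only backend only needs `info()` plus a file class with `_fetch_range()`. `BLOBS`, `BlobFile`, and `BlobFileSystem` below are hypothetical names used only for this sketch:

```python
from fsspec.spec import AbstractBufferedFile, AbstractFileSystem

BLOBS = {"/data.bin": b"hello world" * 1000}  # fixed in-memory content

class BlobFile(AbstractBufferedFile):
    def _fetch_range(self, start, end):
        # Serve the requested byte range from the in-memory blob
        return BLOBS[self.path][start:end]

class BlobFileSystem(AbstractFileSystem):
    protocol = "blob"

    def info(self, path, **kwargs):
        path = self._strip_protocol(path)
        if path not in BLOBS:
            raise FileNotFoundError(path)
        return {"name": path, "size": len(BLOBS[path]), "type": "file"}

    def _open(self, path, mode="rb", block_size=None, autocommit=True,
              cache_options=None, **kwargs):
        return BlobFile(self, path, mode, block_size, autocommit,
                        cache_options=cache_options, **kwargs)

fs = BlobFileSystem()
with fs.open("blob:///data.bin", block_size=1024) as f:
    print(f.read(5))   # b'hello'
    f.seek(-5, 2)
    print(f.read())    # b'world'
```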
diff --git a/fsspec/transaction.py b/fsspec/transaction.py
index 9a060ac..77293f6 100644
--- a/fsspec/transaction.py
+++ b/fsspec/transaction.py
@@ -24,6 +24,7 @@ class Transaction:
def __exit__(self, exc_type, exc_val, exc_tb):
"""End transaction and commit, if exit is not due to exception"""
+ # only commit if there was no exception
self.complete(commit=exc_type is None)
if self.fs:
self.fs._intrans = False
@@ -32,21 +33,41 @@ class Transaction:
def start(self):
"""Start a transaction on this FileSystem"""
- pass
+ self.files = deque() # clean up after previous failed completions
+ self.fs._intrans = True
def complete(self, commit=True):
"""Finish transaction: commit or discard all deferred files"""
- pass
+ while self.files:
+ f = self.files.popleft()
+ if commit:
+ f.commit()
+ else:
+ f.discard()
+ self.fs._intrans = False
+ self.fs._transaction = None
+ self.fs = None
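
A sketch of the intended transaction behaviour with the local filesystem, which defers writes to a temporary file until commit (paths invented):

```python
import os
import tempfile

import fsspec

fs = fsspec.filesystem("file")
target = os.path.join(tempfile.mkdtemp(), "out.txt")

# Files opened for writing inside the transaction are only finalized
# when the block exits without an exception.
with fs.transaction:
    with fs.open(target, "wb") as f:
        f.write(b"deferred")
    assert not fs.exists(target)  # not committed yet

assert fs.cat_file(target) == b"deferred"
```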
class FileActor:
-
def __init__(self):
self.files = []
+ def commit(self):
+ for f in self.files:
+ f.commit()
+ self.files.clear()
-class DaskTransaction(Transaction):
+ def discard(self):
+ for f in self.files:
+ f.discard()
+ self.files.clear()
+
+ def append(self, f):
+ self.files.append(f)
+
+class DaskTransaction(Transaction):
def __init__(self, fs):
"""
Parameters
@@ -54,10 +75,16 @@ class DaskTransaction(Transaction):
fs: FileSystem instance
"""
import distributed
+
super().__init__(fs)
client = distributed.default_client()
self.files = client.submit(FileActor, actor=True).result()
def complete(self, commit=True):
"""Finish transaction: commit or discard all deferred files"""
- pass
+ if commit:
+ self.files.commit().result()
+ else:
+ self.files.discard().result()
+ self.fs._intrans = False
+ self.fs = None
diff --git a/fsspec/utils.py b/fsspec/utils.py
index 7257878..703d55f 100644
--- a/fsspec/utils.py
+++ b/fsspec/utils.py
@@ -1,4 +1,5 @@
from __future__ import annotations
+
import contextlib
import logging
import math
@@ -10,17 +11,32 @@ import tempfile
from functools import partial
from hashlib import md5
from importlib.metadata import version
-from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, TypeVar
+from typing import (
+ IO,
+ TYPE_CHECKING,
+ Any,
+ Callable,
+ Iterable,
+ Iterator,
+ Sequence,
+ TypeVar,
+)
from urllib.parse import urlsplit
+
if TYPE_CHECKING:
from typing_extensions import TypeGuard
+
from fsspec.spec import AbstractFileSystem
-DEFAULT_BLOCK_SIZE = 5 * 2 ** 20
-T = TypeVar('T')
-def infer_storage_options(urlpath: str, inherit_storage_options: (dict[str,
- Any] | None)=None) ->dict[str, Any]:
+DEFAULT_BLOCK_SIZE = 5 * 2**20
+
+T = TypeVar("T")
+
+
+def infer_storage_options(
+ urlpath: str, inherit_storage_options: dict[str, Any] | None = None
+) -> dict[str, Any]:
"""Infer storage options from URL path and merge it with existing storage
options.
@@ -48,23 +64,94 @@ def infer_storage_options(urlpath: str, inherit_storage_options: (dict[str,
"host": "node", "port": 123, "path": "/mnt/datasets/test.csv",
"url_query": "q=1", "extra": "value"}
"""
- pass
-
-
+ # Handle Windows paths including disk name in this special case
+ if (
+ re.match(r"^[a-zA-Z]:[\\/]", urlpath)
+ or re.match(r"^[a-zA-Z0-9]+://", urlpath) is None
+ ):
+ return {"protocol": "file", "path": urlpath}
+
+ parsed_path = urlsplit(urlpath)
+ protocol = parsed_path.scheme or "file"
+ if parsed_path.fragment:
+ path = "#".join([parsed_path.path, parsed_path.fragment])
+ else:
+ path = parsed_path.path
+ if protocol == "file":
+ # Special case parsing file protocol URL on Windows according to:
+ # https://msdn.microsoft.com/en-us/library/jj710207.aspx
+ windows_path = re.match(r"^/([a-zA-Z])[:|]([\\/].*)$", path)
+ if windows_path:
+ path = "%s:%s" % windows_path.groups()
+
+ if protocol in ["http", "https"]:
+ # for HTTP, we don't want to parse, as requests will anyway
+ return {"protocol": protocol, "path": urlpath}
+
+ options: dict[str, Any] = {"protocol": protocol, "path": path}
+
+ if parsed_path.netloc:
+ # Parse `hostname` from netloc manually because `parsed_path.hostname`
+ # lowercases the hostname which is not always desirable (e.g. in S3):
+ # https://github.com/dask/dask/issues/1417
+ options["host"] = parsed_path.netloc.rsplit("@", 1)[-1].rsplit(":", 1)[0]
+
+ if protocol in ("s3", "s3a", "gcs", "gs"):
+ options["path"] = options["host"] + options["path"]
+ else:
+ options["host"] = options["host"]
+ if parsed_path.port:
+ options["port"] = parsed_path.port
+ if parsed_path.username:
+ options["username"] = parsed_path.username
+ if parsed_path.password:
+ options["password"] = parsed_path.password
+
+ if parsed_path.query:
+ options["url_query"] = parsed_path.query
+ if parsed_path.fragment:
+ options["url_fragment"] = parsed_path.fragment
+
+ if inherit_storage_options:
+ update_storage_options(options, inherit_storage_options)
+
+ return options
+
+
+def update_storage_options(
+ options: dict[str, Any], inherited: dict[str, Any] | None = None
+) -> None:
+ if not inherited:
+ inherited = {}
+ collisions = set(options) & set(inherited)
+ if collisions:
+ for collision in collisions:
+ if options.get(collision) != inherited.get(collision):
+ raise KeyError(
+ f"Collision between inferred and specified storage "
+ f"option:\n{collision}"
+ )
+ options.update(inherited)
+
+
+# Compression extensions registered via fsspec.compression.register_compression
compressions: dict[str, str] = {}
-def infer_compression(filename: str) ->(str | None):
+def infer_compression(filename: str) -> str | None:
"""Infer compression, if available, from filename.
Infer a named compression type, if registered and available, from filename
extension. This includes builtin (gz, bz2, zip) compressions, as well as
optional compressions. See fsspec.compression.register_compression.
"""
- pass
+ extension = os.path.splitext(filename)[-1].strip(".").lower()
+ if extension in compressions:
+ return compressions[extension]
+ return None
-def build_name_function(max_int: float) ->Callable[[int], str]:
+def build_name_function(max_int: float) -> Callable[[int], str]:
"""Returns a function that receives a single integer
and returns it as a string padded by enough zero characters
to align with maximum possible integer
@@ -82,11 +169,19 @@ def build_name_function(max_int: float) ->Callable[[int], str]:
>>> build_name_function(0)(0)
'0'
"""
- pass
+    # handle corner cases where max_int is 0 or an exact power of 10
+ max_int += 1e-8
+
+ pad_length = int(math.ceil(math.log10(max_int)))
+
+ def name_function(i: int) -> str:
+ return str(i).zfill(pad_length)
+ return name_function
-def seek_delimiter(file: IO[bytes], delimiter: bytes, blocksize: int) ->bool:
- """Seek current file to file start, file end, or byte after delimiter seq.
+
+def seek_delimiter(file: IO[bytes], delimiter: bytes, blocksize: int) -> bool:
+ r"""Seek current file to file start, file end, or byte after delimiter seq.
Seeks file to next chunk delimiter, where chunks are defined on file start,
a delimiting sequence, and file end. Use file.tell() to see location afterwards.
@@ -97,7 +192,7 @@ def seek_delimiter(file: IO[bytes], delimiter: bytes, blocksize: int) ->bool:
----------
file: a file
delimiter: bytes
- a delimiter like ``b'\\n'`` or message sentinel, matching file .read() type
+ a delimiter like ``b'\n'`` or message sentinel, matching file .read() type
blocksize: int
Number of bytes to read from the file at once.
@@ -107,11 +202,40 @@ def seek_delimiter(file: IO[bytes], delimiter: bytes, blocksize: int) ->bool:
Returns True if a delimiter was found, False if at file start or end.
"""
- pass
-
-def read_block(f: IO[bytes], offset: int, length: (int | None), delimiter:
- (bytes | None)=None, split_before: bool=False) ->bytes:
+ if file.tell() == 0:
+ # beginning-of-file, return without seek
+ return False
+
+ # Interface is for binary IO, with delimiter as bytes, but initialize last
+ # with result of file.read to preserve compatibility with text IO.
+ last: bytes | None = None
+ while True:
+ current = file.read(blocksize)
+ if not current:
+ # end-of-file without delimiter
+ return False
+ full = last + current if last else current
+ try:
+ if delimiter in full:
+ i = full.index(delimiter)
+ file.seek(file.tell() - (len(full) - i) + len(delimiter))
+ return True
+ elif len(current) < blocksize:
+ # end-of-file without delimiter
+ return False
+ except (OSError, ValueError):
+ pass
+ last = full[-len(delimiter) :]
+
+
+def read_block(
+ f: IO[bytes],
+ offset: int,
+ length: int | None,
+ delimiter: bytes | None = None,
+ split_before: bool = False,
+) -> bytes:
"""Read a block of bytes from a file
Parameters
@@ -148,10 +272,38 @@ def read_block(f: IO[bytes], offset: int, length: (int | None), delimiter:
>>> read_block(f, 10, 10, delimiter=b'\\n') # doctest: +SKIP
b'Bob, 200\\nCharlie, 300'
"""
- pass
+ if delimiter:
+ f.seek(offset)
+ found_start_delim = seek_delimiter(f, delimiter, 2**16)
+ if length is None:
+ return f.read()
+ start = f.tell()
+ length -= start - offset
+
+ f.seek(start + length)
+ found_end_delim = seek_delimiter(f, delimiter, 2**16)
+ end = f.tell()
+
+ # Adjust split location to before delimiter if seek found the
+ # delimiter sequence, not start or end of file.
+ if found_start_delim and split_before:
+ start -= len(delimiter)
+
+ if found_end_delim and split_before:
+ end -= len(delimiter)
+ offset = start
+ length = end - start
-def tokenize(*args: Any, **kwargs: Any) ->str:
+ f.seek(offset)
+
+ # TODO: allow length to be None and read to the end of the file?
+ assert length is not None
+ b = f.read(length)
+ return b
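+
+# Illustrative behaviour of read_block with delimiter=b"\n": unless offset is
+# 0, the block starts just after the first newline at/after `offset`, and it
+# ends just after the first newline at/after offset+length (split_before=True
+# moves both boundaries to just before the delimiter instead), so adjacent
+# (offset, length) windows tile a file into whole lines without overlap.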
+
+
+def tokenize(*args: Any, **kwargs: Any) -> str:
"""Deterministic token
(modified from dask.base)
@@ -162,10 +314,17 @@ def tokenize(*args: Any, **kwargs: Any) ->str:
>>> tokenize('Hello') == tokenize('Hello')
True
"""
- pass
+ if kwargs:
+ args += (kwargs,)
+ try:
+ h = md5(str(args).encode())
+ except ValueError:
+ # FIPS systems: https://github.com/fsspec/filesystem_spec/issues/380
+ h = md5(str(args).encode(), usedforsecurity=False)
+ return h.hexdigest()
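+
+# Illustrative property of tokenize: equal inputs give equal tokens across
+# calls, e.g. tokenize("a", x=1) == tokenize("a", x=1), while changing any
+# argument or keyword yields a different hex digest.
+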
-def stringify_path(filepath: (str | os.PathLike[str] | pathlib.Path)) ->str:
+def stringify_path(filepath: str | os.PathLike[str] | pathlib.Path) -> str:
"""Attempt to convert a path-like object to a string.
Parameters
@@ -187,16 +346,43 @@ def stringify_path(filepath: (str | os.PathLike[str] | pathlib.Path)) ->str:
Any other object is passed through unchanged, which includes bytes,
strings, buffers, or anything else that's not even path-like.
"""
- pass
+ if isinstance(filepath, str):
+ return filepath
+ elif hasattr(filepath, "__fspath__"):
+ return filepath.__fspath__()
+ elif hasattr(filepath, "path"):
+ return filepath.path
+ else:
+ return filepath # type: ignore[return-value]
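+
+# Illustrative behaviour of stringify_path: a pathlib.Path is converted via
+# __fspath__ (PosixPath("/tmp/x.csv") -> "/tmp/x.csv"), objects exposing only
+# a ``.path`` attribute fall back to that attribute, and anything else
+# (bytes, buffers, ...) is returned unchanged.
+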
-def common_prefix(paths: Iterable[str]) ->str:
- """For a list of paths, find the shortest prefix common to all"""
- pass
+def make_instance(
+ cls: Callable[..., T], args: Sequence[Any], kwargs: dict[str, Any]
+) -> T:
+ inst = cls(*args, **kwargs)
+ inst._determine_worker() # type: ignore[attr-defined]
+ return inst
-def other_paths(paths: list[str], path2: (str | list[str]), exists: bool=
- False, flatten: bool=False) ->list[str]:
+def common_prefix(paths: Iterable[str]) -> str:
+ """For a list of paths, find the shortest prefix common to all"""
+ parts = [p.split("/") for p in paths]
+ lmax = min(len(p) for p in parts)
+ end = 0
+ for i in range(lmax):
+ end = all(p[i] == parts[0][i] for p in parts)
+ if not end:
+ break
+ i += end
+ return "/".join(parts[0][:i])
+
+
+def other_paths(
+ paths: list[str],
+ path2: str | list[str],
+ exists: bool = False,
+ flatten: bool = False,
+) -> list[str]:
"""In bulk file operations, construct a new file tree from a list of files
Parameters
@@ -217,15 +403,56 @@ def other_paths(paths: list[str], path2: (str | list[str]), exists: bool=
-------
list of str
"""
- pass
+
+ if isinstance(path2, str):
+ path2 = path2.rstrip("/")
+
+ if flatten:
+ path2 = ["/".join((path2, p.split("/")[-1])) for p in paths]
+ else:
+ cp = common_prefix(paths)
+ if exists:
+ cp = cp.rsplit("/", 1)[0]
+ if not cp and all(not s.startswith("/") for s in paths):
+ path2 = ["/".join([path2, p]) for p in paths]
+ else:
+ path2 = [p.replace(cp, path2, 1) for p in paths]
+ else:
+ assert len(paths) == len(path2)
+ return path2
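+
+# Illustrative example for other_paths (hypothetical paths):
+#   other_paths(["/src/a/1.txt", "/src/a/2.txt"], "dest")
+#   -> ["dest/1.txt", "dest/2.txt"]
+# With flatten=True only basenames are appended to path2, so nested sources
+# with the same filename would collide on the destination side.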
+
+
+def is_exception(obj: Any) -> bool:
+ return isinstance(obj, BaseException)
+
+
+def isfilelike(f: Any) -> TypeGuard[IO[bytes]]:
+ for attr in ["read", "close", "tell"]:
+ if not hasattr(f, attr):
+ return False
+ return True
+
+
+def get_protocol(url: str) -> str:
+ url = stringify_path(url)
+ parts = re.split(r"(\:\:|\://)", url, maxsplit=1)
+ if len(parts) > 1:
+ return parts[0]
+ return "file"
-def can_be_local(path: str) ->bool:
+def can_be_local(path: str) -> bool:
"""Can the given URL be used with open_local?"""
- pass
+ from fsspec import get_filesystem_class
+ try:
+ return getattr(get_filesystem_class(get_protocol(path)), "local_file", False)
+ except (ValueError, ImportError):
+ # not in registry or import failed
+ return False
-def get_package_version_without_import(name: str) ->(str | None):
+
+def get_package_version_without_import(name: str) -> str | None:
"""For given package name, try to find the version without importing it
Import and package.__version__ is still the backup here, so an import
@@ -234,20 +461,81 @@ def get_package_version_without_import(name: str) ->(str | None):
Returns either the version string, or None if the package
or the version was not readily found.
"""
- pass
-
-
-def mirror_from(origin_name: str, methods: Iterable[str]) ->Callable[[type[
- T]], type[T]]:
+ if name in sys.modules:
+ mod = sys.modules[name]
+ if hasattr(mod, "__version__"):
+ return mod.__version__
+ try:
+ return version(name)
+ except: # noqa: E722
+ pass
+ try:
+ import importlib
+
+ mod = importlib.import_module(name)
+ return mod.__version__
+ except (ImportError, AttributeError):
+ return None
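+
+# Illustrative behaviour of get_package_version_without_import: an already
+# imported module's __version__ is used first, then importlib.metadata's
+# version(); importing the package is only the last resort, and None is
+# returned when all of these fail.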
+
+
+def setup_logging(
+ logger: logging.Logger | None = None,
+ logger_name: str | None = None,
+ level: str = "DEBUG",
+ clear: bool = True,
+) -> logging.Logger:
+ if logger is None and logger_name is None:
+ raise ValueError("Provide either logger object or logger name")
+ logger = logger or logging.getLogger(logger_name)
+ handle = logging.StreamHandler()
+ formatter = logging.Formatter(
+ "%(asctime)s - %(name)s - %(levelname)s - %(funcName)s -- %(message)s"
+ )
+ handle.setFormatter(formatter)
+ if clear:
+ logger.handlers.clear()
+ logger.addHandler(handle)
+ logger.setLevel(level)
+ return logger
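+
+# Typical use (illustrative): setup_logging(logger_name="fsspec") attaches one
+# StreamHandler with a timestamped format and sets the level (DEBUG by
+# default); clear=True first drops any handlers already on the logger.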
+
+
+def _unstrip_protocol(name: str, fs: AbstractFileSystem) -> str:
+ return fs.unstrip_protocol(name)
+
+
+def mirror_from(
+ origin_name: str, methods: Iterable[str]
+) -> Callable[[type[T]], type[T]]:
"""Mirror attributes and methods from the given
origin_name attribute of the instance to the
decorated class"""
- pass
+ def origin_getter(method: str, self: Any) -> Any:
+ origin = getattr(self, origin_name)
+ return getattr(origin, method)
+
+ def wrapper(cls: type[T]) -> type[T]:
+ for method in methods:
+ wrapped_method = partial(origin_getter, method)
+ setattr(cls, method, property(wrapped_method))
+ return cls
+
+ return wrapper
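+
+# Illustrative sketch for mirror_from, with a hypothetical wrapper class:
+#   @mirror_from("fs", ["ls", "info"])
+#   class Wrapped:
+#       def __init__(self, fs):
+#           self.fs = fs
+# Each mirrored name becomes a read-only property, so Wrapped(fs).ls resolves
+# to fs.ls at attribute-access time.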
-def merge_offset_ranges(paths: list[str], starts: (list[int] | int), ends:
- (list[int] | int), max_gap: int=0, max_block: (int | None)=None, sort:
- bool=True) ->tuple[list[str], list[int], list[int]]:
+
+@contextlib.contextmanager
+def nullcontext(obj: T) -> Iterator[T]:
+ yield obj
+
+
+def merge_offset_ranges(
+ paths: list[str],
+ starts: list[int] | int,
+ ends: list[int] | int,
+ max_gap: int = 0,
+ max_block: int | None = None,
+ sort: bool = True,
+) -> tuple[list[str], list[int], list[int]]:
"""Merge adjacent byte-offset ranges when the inter-range
gap is <= `max_gap`, and when the merged byte range does not
exceed `max_block` (if specified). By default, this function
@@ -255,24 +543,198 @@ def merge_offset_ranges(paths: list[str], starts: (list[int] | int), ends:
order. If the user can guarantee that the inputs are already
sorted, passing `sort=False` will skip the re-ordering.
"""
- pass
-
-
-def file_size(filelike: IO[bytes]) ->int:
+ # Check input
+ if not isinstance(paths, list):
+ raise TypeError
+ if not isinstance(starts, list):
+ starts = [starts] * len(paths)
+ if not isinstance(ends, list):
+ ends = [ends] * len(paths)
+ if len(starts) != len(paths) or len(ends) != len(paths):
+ raise ValueError
+
+ # Early Return
+ if len(starts) <= 1:
+ return paths, starts, ends
+
+ starts = [s or 0 for s in starts]
+ # Sort by paths and then ranges if `sort=True`
+ if sort:
+ paths, starts, ends = (
+ list(v)
+ for v in zip(
+ *sorted(
+ zip(paths, starts, ends),
+ )
+ )
+ )
+
+ if paths:
+ # Loop through the coupled `paths`, `starts`, and
+ # `ends`, and merge adjacent blocks when appropriate
+ new_paths = paths[:1]
+ new_starts = starts[:1]
+ new_ends = ends[:1]
+ for i in range(1, len(paths)):
+ if paths[i] == paths[i - 1] and new_ends[-1] is None:
+ continue
+ elif (
+ paths[i] != paths[i - 1]
+ or ((starts[i] - new_ends[-1]) > max_gap)
+ or (max_block is not None and (ends[i] - new_starts[-1]) > max_block)
+ ):
+ # Cannot merge with previous block.
+ # Add new `paths`, `starts`, and `ends` elements
+ new_paths.append(paths[i])
+ new_starts.append(starts[i])
+ new_ends.append(ends[i])
+ else:
+ # Merge with previous block by updating the
+ # last element of `ends`
+ new_ends[-1] = ends[i]
+ return new_paths, new_starts, new_ends
+
+ # `paths` is empty. Just return input lists
+ return paths, starts, ends
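+
+# Illustrative example for merge_offset_ranges (max_gap=0, max_block=None):
+#   merge_offset_ranges(["f", "f", "g"], [0, 10, 0], [10, 20, 5])
+#   -> (["f", "g"], [0, 0], [20, 5])
+# The touching ranges on "f" merge; a gap larger than max_gap, a different
+# path, or a merged block exceeding max_block keeps ranges separate.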
+
+
+def file_size(filelike: IO[bytes]) -> int:
"""Find length of any open read-mode file-like"""
- pass
+ pos = filelike.tell()
+ try:
+ return filelike.seek(0, 2)
+ finally:
+ filelike.seek(pos)
@contextlib.contextmanager
-def atomic_write(path: str, mode: str='wb'):
+def atomic_write(path: str, mode: str = "wb"):
"""
A context manager that opens a temporary file next to `path` and, on exit,
replaces `path` with the temporary file, thereby updating `path`
atomically.
"""
- pass
+ fd, fn = tempfile.mkstemp(
+ dir=os.path.dirname(path), prefix=os.path.basename(path) + "-"
+ )
+ try:
+ with open(fd, mode) as fp:
+ yield fp
+ except BaseException:
+ with contextlib.suppress(FileNotFoundError):
+ os.unlink(fn)
+ raise
+ else:
+ os.replace(fn, path)
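+
+# Illustrative usage of atomic_write (hypothetical filename): data is written
+# to a sibling temporary file and the target is only os.replace()d on clean
+# exit, so a crash mid-write never leaves a truncated file at `path`:
+#   with atomic_write("settings.json") as f:
+#       f.write(b"{}")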
+
+
+def _translate(pat, STAR, QUESTION_MARK):
+ # Copied from: https://github.com/python/cpython/pull/106703.
+ res: list[str] = []
+ add = res.append
+ i, n = 0, len(pat)
+ while i < n:
+ c = pat[i]
+ i = i + 1
+ if c == "*":
+ # compress consecutive `*` into one
+ if (not res) or res[-1] is not STAR:
+ add(STAR)
+ elif c == "?":
+ add(QUESTION_MARK)
+ elif c == "[":
+ j = i
+ if j < n and pat[j] == "!":
+ j = j + 1
+ if j < n and pat[j] == "]":
+ j = j + 1
+ while j < n and pat[j] != "]":
+ j = j + 1
+ if j >= n:
+ add("\\[")
+ else:
+ stuff = pat[i:j]
+ if "-" not in stuff:
+ stuff = stuff.replace("\\", r"\\")
+ else:
+ chunks = []
+ k = i + 2 if pat[i] == "!" else i + 1
+ while True:
+ k = pat.find("-", k, j)
+ if k < 0:
+ break
+ chunks.append(pat[i:k])
+ i = k + 1
+ k = k + 3
+ chunk = pat[i:j]
+ if chunk:
+ chunks.append(chunk)
+ else:
+ chunks[-1] += "-"
+ # Remove empty ranges -- invalid in RE.
+ for k in range(len(chunks) - 1, 0, -1):
+ if chunks[k - 1][-1] > chunks[k][0]:
+ chunks[k - 1] = chunks[k - 1][:-1] + chunks[k][1:]
+ del chunks[k]
+ # Escape backslashes and hyphens for set difference (--).
+ # Hyphens that create ranges shouldn't be escaped.
+ stuff = "-".join(
+ s.replace("\\", r"\\").replace("-", r"\-") for s in chunks
+ )
+ # Escape set operations (&&, ~~ and ||).
+ stuff = re.sub(r"([&~|])", r"\\\1", stuff)
+ i = j + 1
+ if not stuff:
+ # Empty range: never match.
+ add("(?!)")
+ elif stuff == "!":
+ # Negated empty range: match any character.
+ add(".")
+ else:
+ if stuff[0] == "!":
+ stuff = "^" + stuff[1:]
+ elif stuff[0] in ("^", "["):
+ stuff = "\\" + stuff
+ add(f"[{stuff}]")
+ else:
+ add(re.escape(c))
+ assert i == n
+ return res
def glob_translate(pat):
+ # Copied from: https://github.com/python/cpython/pull/106703.
+ # The keyword parameters' values are fixed to:
+ # recursive=True, include_hidden=True, seps=None
"""Translate a pathname with shell wildcards to a regular expression."""
- pass
+ if os.path.altsep:
+ seps = os.path.sep + os.path.altsep
+ else:
+ seps = os.path.sep
+ escaped_seps = "".join(map(re.escape, seps))
+ any_sep = f"[{escaped_seps}]" if len(seps) > 1 else escaped_seps
+ not_sep = f"[^{escaped_seps}]"
+ one_last_segment = f"{not_sep}+"
+ one_segment = f"{one_last_segment}{any_sep}"
+ any_segments = f"(?:.+{any_sep})?"
+ any_last_segments = ".*"
+ results = []
+ parts = re.split(any_sep, pat)
+ last_part_idx = len(parts) - 1
+ for idx, part in enumerate(parts):
+ if part == "*":
+ results.append(one_segment if idx < last_part_idx else one_last_segment)
+ continue
+ if part == "**":
+ results.append(any_segments if idx < last_part_idx else any_last_segments)
+ continue
+ elif "**" in part:
+ raise ValueError(
+ "Invalid pattern: '**' can only be an entire path component"
+ )
+ if part:
+ results.extend(_translate(part, f"{not_sep}*", not_sep))
+ if idx < last_part_idx:
+ results.append(any_sep)
+ res = "".join(results)
+ return rf"(?s:{res})\Z"