From 2c6b917fc53ef2cba2000349458160e3d2c87706 Mon Sep 17 00:00:00 2001 From: skshetry <18718008+skshetry@users.noreply.github.com> Date: Thu, 9 Jan 2025 21:12:34 +0545 Subject: [PATCH] ls-url: add support for `--tree`/`--level` (#10664) --- dvc/commands/ls/__init__.py | 5 +-- dvc/commands/ls_url.py | 40 +++++++++++++++++-- dvc/repo/ls.py | 31 +++++++++------ dvc/repo/ls_url.py | 21 +++++++--- dvc/testing/workspace_tests.py | 33 ++++++++++++++++ tests/func/test_ls.py | 64 ++++++++++++++++++++++++++++++- tests/unit/command/test_ls_url.py | 22 +++++++++-- 7 files changed, 188 insertions(+), 28 deletions(-) diff --git a/dvc/commands/ls/__init__.py b/dvc/commands/ls/__init__.py index 12a6e1a217..550b5820b0 100644 --- a/dvc/commands/ls/__init__.py +++ b/dvc/commands/ls/__init__.py @@ -77,10 +77,7 @@ def _build_tree_structure( num_entries = len(entries) for i, (name, entry) in enumerate(entries.items()): - # show full path for root, otherwise only show the name - if _depth > 0: - entry["path"] = name - + entry["path"] = name is_last = i >= num_entries - 1 tree_part = "" if _depth > 0: diff --git a/dvc/commands/ls_url.py b/dvc/commands/ls_url.py index 7bd2db8944..7ba0fbd49c 100644 --- a/dvc/commands/ls_url.py +++ b/dvc/commands/ls_url.py @@ -3,26 +3,45 @@ from dvc.cli.utils import DictAction, append_doc_link from dvc.log import logger -from .ls import show_entries +from .ls import show_entries, show_tree logger = logger.getChild(__name__) class CmdListUrl(CmdBaseNoRepo): - def run(self): - from dvc.config import Config + def _show_tree(self, config): + from dvc.fs import parse_external_url + from dvc.repo.ls import _ls_tree + + fs, fs_path = parse_external_url( + self.args.url, fs_config=self.args.fs_config, config=config + ) + entries = _ls_tree(fs, fs_path, maxdepth=self.args.level) + show_tree(entries, with_color=True, with_size=self.args.size) + return 0 + + def _show_list(self, config): from dvc.repo import Repo entries = Repo.ls_url( self.args.url, recursive=self.args.recursive, + maxdepth=self.args.level, fs_config=self.args.fs_config, - config=Config.from_cwd(), + config=config, ) if entries: show_entries(entries, with_color=True, with_size=self.args.size) return 0 + def run(self): + from dvc.config import Config + + config = Config.from_cwd() + if self.args.tree: + return self._show_tree(config=config) + return self._show_list(config=config) + def add_parser(subparsers, parent_parser): LS_HELP = "List directory contents from URL." @@ -40,6 +59,19 @@ def add_parser(subparsers, parent_parser): lsurl_parser.add_argument( "-R", "--recursive", action="store_true", help="Recursively list files." ) + lsurl_parser.add_argument( + "-T", + "--tree", + action="store_true", + help="Recurse into directories as a tree.", + ) + lsurl_parser.add_argument( + "-L", + "--level", + metavar="depth", + type=int, + help="Limit the depth of recursion.", + ) lsurl_parser.add_argument("--size", action="store_true", help="Show sizes.") lsurl_parser.add_argument( "--fs-config", diff --git a/dvc/repo/ls.py b/dvc/repo/ls.py index 57e799d9e5..030128253f 100644 --- a/dvc/repo/ls.py +++ b/dvc/repo/ls.py @@ -99,7 +99,9 @@ def ls_tree( path = path or "" fs: DVCFileSystem = repo.dvcfs fs_path = fs.from_os_path(path) - return _ls_tree(fs, fs_path, dvc_only, maxdepth) + return _ls_tree( + fs, fs_path, maxdepth=maxdepth, dvc_only=dvc_only, dvcfiles=True + ) def _ls( @@ -145,27 +147,34 @@ def _ls( return ret_list -def _ls_tree( - fs, path, dvc_only: bool = False, maxdepth: Optional[int] = None, _info=None -): - ret = {} +def _ls_tree(fs, path, maxdepth=None, _info=None, **fs_kwargs): info = _info or fs.info(path) + if _info is None: + # preserve the original path name + name = path + if not name: + name = os.curdir if fs.protocol == "local" else fs.root_marker + path = info["name"] + else: + name = path.rsplit(fs.sep, 1)[-1] - path = info["name"].rstrip(fs.sep) or os.curdir - name = path.rsplit("/", 1)[-1] + ret = {} ls_info = _adapt_info(info) ls_info["path"] = path recurse = maxdepth is None or maxdepth > 0 if recurse and info["type"] == "directory": - infos = fs.ls(path, dvcfiles=True, dvc_only=dvc_only, detail=True) + try: + infos = fs.ls(path, detail=True, **fs_kwargs) + except FileNotFoundError: + # broken symlink? + infos = [] + infos.sort(key=lambda f: f["name"]) maxdepth = maxdepth - 1 if maxdepth is not None else None contents = {} for info in infos: - d = _ls_tree( - fs, info["name"], dvc_only=dvc_only, maxdepth=maxdepth, _info=info - ) + d = _ls_tree(fs, info["name"], maxdepth=maxdepth, _info=info, **fs_kwargs) contents.update(d) ls_info["contents"] = contents diff --git a/dvc/repo/ls_url.py b/dvc/repo/ls_url.py index f51177a707..479e5b5cb0 100644 --- a/dvc/repo/ls_url.py +++ b/dvc/repo/ls_url.py @@ -1,19 +1,30 @@ +from fsspec.implementations.local import LocalFileSystem as _LocalFileSystem + from dvc.exceptions import URLMissingError -from dvc.fs import parse_external_url +from dvc.fs import LocalFileSystem, parse_external_url -def ls_url(url, *, fs_config=None, recursive=False, config=None): +def ls_url(url, *, fs_config=None, recursive=False, maxdepth=None, config=None): fs, fs_path = parse_external_url(url, fs_config=fs_config, config=config) try: info = fs.info(fs_path) except FileNotFoundError as exc: raise URLMissingError(url) from exc - if info["type"] != "directory": + if maxdepth == 0 or info["type"] != "directory": return [{"path": info["name"], "isdir": False}] + if isinstance(fs, LocalFileSystem): + # dvc's LocalFileSystem does not support maxdepth yet + walk = _LocalFileSystem().walk + else: + walk = fs.walk + ret = [] - for _, dirs, files in fs.walk(fs_path, detail=True): - if not recursive: + for root, dirs, files in walk(fs_path, detail=True, maxdepth=maxdepth): + parts = fs.relparts(root, fs_path) + if parts == (".",): + parts = () + if not recursive or (maxdepth and len(parts) >= maxdepth - 1): files.update(dirs) for info in files.values(): diff --git a/dvc/testing/workspace_tests.py b/dvc/testing/workspace_tests.py index 22232fe71e..ba72f7cefb 100644 --- a/dvc/testing/workspace_tests.py +++ b/dvc/testing/workspace_tests.py @@ -225,6 +225,39 @@ def test_recursive(self, cloud): ], ) + result = ls_url( + str(cloud / "dir"), fs_config=cloud.config, recursive=True, maxdepth=0 + ) + match_files( + fs, + result, + [{"path": (cloud / "dir").fs_path, "isdir": False}], + ) + + result = ls_url( + str(cloud / "dir"), fs_config=cloud.config, recursive=True, maxdepth=1 + ) + match_files( + fs, + result, + [ + {"path": "foo", "isdir": False}, + {"path": "subdir", "isdir": True}, + ], + ) + + result = ls_url( + str(cloud / "dir"), fs_config=cloud.config, recursive=True, maxdepth=2 + ) + match_files( + fs, + result, + [ + {"path": "foo", "isdir": False}, + {"path": "subdir/bar", "isdir": False}, + ], + ) + def test_nonexistent(self, cloud): with pytest.raises(URLMissingError): ls_url(str(cloud / "dir"), fs_config=cloud.config) diff --git a/tests/func/test_ls.py b/tests/func/test_ls.py index dcf8c209bb..9b42582688 100644 --- a/tests/func/test_ls.py +++ b/tests/func/test_ls.py @@ -6,8 +6,9 @@ import pytest +from dvc.fs import MemoryFileSystem from dvc.repo import Repo -from dvc.repo.ls import ls_tree +from dvc.repo.ls import _ls_tree, ls_tree from dvc.scm import CloneError FS_STRUCTURE = { @@ -998,3 +999,64 @@ def test_ls_tree_maxdepth(M, tmp_dir, scm, dvc): "structure.xml.dvc": None, } } + + +def test_fs_ls_tree(): + fs = MemoryFileSystem(global_store=False) + fs.pipe({f: content.encode() for f, content in FS_STRUCTURE.items()}) + root = fs.root_marker + + files = _ls_tree(fs, "README.md") + assert _simplify_tree(files) == {"README.md": None} + files = _ls_tree(fs, root) + expected = { + root: { + ".gitignore": None, + "README.md": None, + "model": { + "script.py": None, + "train.py": None, + }, + } + } + assert _simplify_tree(files) == expected + + files = _ls_tree(fs, "model") + assert _simplify_tree(files) == { + "model": { + "script.py": None, + "train.py": None, + } + } + + +def test_fs_ls_tree_maxdepth(): + fs = MemoryFileSystem(global_store=False) + fs.pipe({f: content.encode() for f, content in FS_STRUCTURE.items()}) + + files = _ls_tree(fs, "/", maxdepth=0) + assert _simplify_tree(files) == {"/": None} + + files = _ls_tree(fs, "/", maxdepth=1) + assert _simplify_tree(files) == { + "/": { + ".gitignore": None, + "README.md": None, + "model": None, + } + } + + files = _ls_tree(fs, "/", maxdepth=2) + assert _simplify_tree(files) == { + "/": { + ".gitignore": None, + "README.md": None, + "model": { + "script.py": None, + "train.py": None, + }, + } + } + + files = _ls_tree(fs, "README.md", maxdepth=3) + assert _simplify_tree(files) == {"README.md": None} diff --git a/tests/unit/command/test_ls_url.py b/tests/unit/command/test_ls_url.py index 252c0061e8..33dde6f068 100644 --- a/tests/unit/command/test_ls_url.py +++ b/tests/unit/command/test_ls_url.py @@ -1,6 +1,7 @@ from dvc.cli import parse_args from dvc.commands.ls_url import CmdListUrl from dvc.config import Config +from dvc.fs import LocalFileSystem def test_ls_url(mocker, M): @@ -12,12 +13,16 @@ def test_ls_url(mocker, M): assert cmd.run() == 0 m.assert_called_once_with( - "src", recursive=False, fs_config=None, config=M.instance_of(Config) + "src", + recursive=False, + maxdepth=None, + fs_config=None, + config=M.instance_of(Config), ) def test_recursive(mocker, M): - cli_args = parse_args(["ls-url", "-R", "src"]) + cli_args = parse_args(["ls-url", "-R", "-L", "2", "src"]) assert cli_args.func == CmdListUrl cmd = cli_args.func(cli_args) m = mocker.patch("dvc.repo.Repo.ls_url", autospec=True) @@ -25,5 +30,16 @@ def test_recursive(mocker, M): assert cmd.run() == 0 m.assert_called_once_with( - "src", recursive=True, fs_config=None, config=M.instance_of(Config) + "src", recursive=True, maxdepth=2, fs_config=None, config=M.instance_of(Config) ) + + +def test_tree(mocker, M): + cli_args = parse_args(["ls-url", "--tree", "--level", "2", "src"]) + assert cli_args.func == CmdListUrl + cmd = cli_args.func(cli_args) + m = mocker.patch("dvc.repo.ls._ls_tree", autospec=True) + + assert cmd.run() == 0 + + m.assert_called_once_with(M.instance_of(LocalFileSystem), "src", maxdepth=2)