# Source code for hfutils.entry.tree

"""
This module provides functionality for listing and displaying files from a HuggingFace repository in a tree-like structure.

It includes functions for parsing repository paths, retrieving file information, and formatting the output as a tree.
The module also defines a CLI command for easy interaction with the tree functionality.

Key components:

- TreeItem: A dataclass representing an item (file or folder) in the tree structure.
- _get_tree: Function to retrieve the tree structure of files in a HuggingFace repository.
- _add_tree_subcommand: Function to add the 'tree' subcommand to a Click CLI group.

Usage:
This module is typically used as part of a larger CLI application for interacting with HuggingFace repositories.
The 'tree' command can be used to visualize the structure of files in a repository.
"""

import dataclasses
import os
import re
from typing import Optional, List, Union

import click
from hbutils.string import format_tree
from huggingface_hub import configure_http_backend
from huggingface_hub.hf_api import RepoFile
from natsort import natsorted

from .base import CONTEXT_SETTINGS
from ..operate.base import REPO_TYPES, list_files_in_repository, RepoTypeTyping, get_hf_client
from ..utils import get_requests_session, hf_normpath, get_file_type, hf_fs_path, FileItemType


@dataclasses.dataclass
class TreeItem:
    """
    Represents an item (file or folder) in the tree structure.

    :param name: The name of the item.
    :type name: str
    :param type_: The type of the item (file or folder).
    :type type_: FileItemType
    :param children: List of child items if this is a folder.
    :type children: Optional[List[TreeItem]]
    :param exist: Whether the item exists in the repository.
    :type exist: bool

    :ivar name: The name of the item.
    :ivar type_: The type of the item.
    :ivar children: List of child items.
    :ivar exist: Existence status of the item.
    """
    name: str
    type_: FileItemType
    children: Optional[List['TreeItem']]
    exist: bool = True

    def get_name(self) -> str:
        """
        Get the formatted name of the item for display.

        Existing items are colored with ``type_.render_color``. Non-existing
        items are rendered without color, with strike-through, and are
        followed by a ``<NOT EXIST>`` marker.

        :return: Formatted name string with color and strike-through if applicable.
        :rtype: str
        """
        styled_name = click.style(
            self.name,
            fg=self.type_.render_color if self.exist else None,
            strikethrough=not self.exist,
        )
        return styled_name + ('' if self.exist else ' <NOT EXIST>')

    def get_children(self) -> List['TreeItem']:
        """
        Get the children of this item if it's a folder.

        :return: List of child items if folder, empty list otherwise.
        :rtype: List[TreeItem]
        """
        if self.type_ != FileItemType.FOLDER:
            return []
        # ``children`` is declared Optional; never hand ``None`` to callers
        # that iterate over the result.
        return self.children if self.children is not None else []
def _get_tree(repo_id: str, repo_type: RepoTypeTyping, dir_in_repo: str,
              revision: Optional[str] = None, show_all: bool = False) -> TreeItem:
    """
    Retrieve the tree structure of files in a HuggingFace repository.

    The files listed under ``dir_in_repo`` are folded into a nested mapping
    (folder name -> sub-mapping, file name -> file type), which is then
    converted into :class:`TreeItem` objects with naturally sorted children.

    When no file is listed, the path itself is looked up with
    ``get_paths_info``: no result marks the root as non-existing, and a single
    :class:`RepoFile` result turns the root into a file item. Any other single
    result leaves the root as an empty folder.

    :param repo_id: The ID of the repository.
    :type repo_id: str
    :param repo_type: The type of the repository.
    :type repo_type: RepoTypeTyping
    :param dir_in_repo: The directory in the repository to start from.
    :type dir_in_repo: str
    :param revision: The revision of the repository to use.
    :type revision: Optional[str]
    :param show_all: Whether to show hidden files.
    :type show_all: bool

    :return: The root TreeItem representing the directory structure.
    :rtype: TreeItem
    :raises AssertionError: If more than one path info entry is returned for ``dir_in_repo``.
    """
    root: Union[dict, FileItemType] = {}
    for filepath in list_files_in_repository(
            repo_id=repo_id,
            repo_type=repo_type,
            subdir=dir_in_repo,
            revision=revision,
            ignore_patterns=[],
    ):
        filename = hf_normpath(os.path.relpath(filepath, dir_in_repo))
        segments = re.split(r'[\\/]+', filename)
        # Skip anything located in, or named as, a dot-prefixed entry unless
        # hidden files were explicitly requested.
        if not show_all and any(segment.startswith('.') and segment != '.' for segment in segments):
            continue

        current_node = root
        last_index = len(segments) - 1
        for i, segment in enumerate(segments):
            if segment not in current_node:
                # The last segment is the file itself; the others are folders.
                current_node[segment] = get_file_type(segment) if i == last_index else {}
            current_node = current_node[segment]

    root_name = hf_fs_path(
        repo_id=repo_id,
        repo_type=repo_type,
        filename=dir_in_repo,
        revision=revision,
    )

    def _recursion(cur_node: Union[dict, FileItemType], parent_name: str, is_exist: bool = False):
        # A dict node is a folder; anything else is the file type of a leaf.
        if isinstance(cur_node, dict):
            return TreeItem(
                name=parent_name,
                type_=FileItemType.FOLDER,
                children=[
                    _recursion(cur_node=value, parent_name=name, is_exist=is_exist)
                    for name, value in natsorted(cur_node.items())
                ],
                exist=is_exist,
            )
        else:
            return TreeItem(
                name=parent_name,
                type_=cur_node,
                children=[],
                exist=is_exist,
            )

    exist = True
    if not root:
        # Nothing was listed: find out whether the path is missing or is a file.
        hf_client = get_hf_client()
        paths = hf_client.get_paths_info(
            repo_id=repo_id,
            repo_type=repo_type,
            revision=revision,
            paths=[dir_in_repo],
        )
        if not paths:
            exist = False
        elif len(paths) == 1:
            pathobj = paths[0]
            if isinstance(pathobj, RepoFile):  # the subdir is a file
                root = get_file_type(dir_in_repo)
        else:
            # Raised explicitly (instead of via ``assert``) so that the check
            # is not stripped when Python runs with ``-O``.
            raise AssertionError(
                f'Multiple path {dir_in_repo!r} found in repo {root_name!r}, '
                f'this must be caused by HuggingFace API.'
            )  # pragma: no cover

    return _recursion(
        cur_node=root,
        parent_name=root_name,
        is_exist=exist,
    )


def _add_tree_subcommand(cli: click.Group) -> click.Group:
    """
    Add the 'tree' subcommand to a Click CLI group.

    This function defines a new 'tree' command that lists files from a HuggingFace repository
    in a tree-like structure.

    :param cli: The Click CLI group to add the command to.
    :type cli: click.Group

    :return: The modified CLI group with the 'tree' command added.
    :rtype: click.Group

    Usage:
        This function is typically called when setting up a CLI application:

        cli = click.Group()
        cli = _add_tree_subcommand(cli)
    """

    @cli.command('tree', help='List files as a tree from HuggingFace repository.\n\n'
                              'Set environment $HF_TOKEN to use your own access token.',
                 context_settings=CONTEXT_SETTINGS)
    @click.option('-r', '--repository', 'repo_id', type=str, required=True,
                  help='Repository to list files from.')
    @click.option('-t', '--type', 'repo_type', type=click.Choice(REPO_TYPES), default='dataset',
                  help='Type of the HuggingFace repository.', show_default=True)
    @click.option('-d', '--directory', 'dir_in_repo', type=str, default=None,
                  help='Directory in repository to use as the root of the tree.')
    @click.option('-R', '--revision', 'revision', type=str, default='main',
                  help='Revision of repository.', show_default=True)
    @click.option('-a', '--all', 'show_all', is_flag=True, type=bool, default=False,
                  help='Show all files, including hidden files.', show_default=True)
    def tree(repo_id: str, repo_type: RepoTypeTyping, dir_in_repo, revision: str, show_all: bool):
        """
        List files as a tree from a HuggingFace repository in a tree-like structure.

        :param repo_id: The ID of the repository.
        :type repo_id: str
        :param repo_type: The type of the repository.
        :type repo_type: RepoTypeTyping
        :param dir_in_repo: The directory in the repository to start from.
        :type dir_in_repo: str
        :param revision: The revision of the repository to use.
        :type revision: str
        :param show_all: Whether to show hidden files.
        :type show_all: bool
        """
        configure_http_backend(get_requests_session)

        _tree = _get_tree(
            repo_id=repo_id,
            repo_type=repo_type,
            # No directory given means the repository root.
            dir_in_repo=dir_in_repo or '.',
            revision=revision,
            show_all=show_all,
        )
        print(format_tree(
            _tree,
            format_node=TreeItem.get_name,
            get_children=TreeItem.get_children,
        ))

    return cli