"""
This module provides utilities for working with Hugging Face filesystem paths.
It includes functions for normalizing paths, constructing Hugging Face filesystem paths,
and parsing those paths into structured components. Additionally, it manages
irregular repositories through caching and provides a data class for representing
Hugging Face filesystem paths.
"""
import json
import os
import posixpath
import re
from dataclasses import dataclass
from functools import lru_cache
from typing import Optional, Dict, Set, Literal
# The repository kinds accepted by the helpers in this module.
RepoTypeTyping = Literal['dataset', 'model', 'space']
def hf_normpath(path) -> str:
    """
    Normalize a path into a relative, ``/``-separated form.

    Forward slashes and backslashes are both treated as separators, on every
    operating system. Redundant separators and ``.`` segments are removed,
    ``..`` segments are resolved, and any ``..`` that would climb above the
    root is dropped, so the result never starts with ``/`` or ``..``.
    An empty path (or one that resolves to the root) yields ``'.'``.

    :param path: The path to normalize.
    :type path: Union[str, os.PathLike]

    :return: The normalized path.
    :rtype: str
    """
    # Unify the separators *before* normalizing. Doing it afterwards (as a
    # final substitution) leaves backslash-separated '..' segments unresolved
    # on POSIX, where a backslash is an ordinary filename character.
    unified = re.sub(r'[\\/]+', '/', os.fspath(path))
    # Anchor at the root so that normpath() clamps leading '..' segments, then
    # strip the anchor again. posixpath keeps the result platform-independent.
    anchored = posixpath.normpath(posixpath.join('/', unified))
    return posixpath.relpath(anchored, '/')
def hf_fs_path(repo_id: str, filename: str,
               repo_type: RepoTypeTyping = 'dataset', revision: Optional[str] = None) -> str:
    """
    Build the Hugging Face filesystem path of a file in a repository.

    The result has the shape ``<prefix><repo_id>[@<revision>]/<filename>``, where
    ``<prefix>`` is ``datasets/`` for datasets, ``spaces/`` for spaces and empty
    for any other repository type. The filename is normalized with
    :func:`hf_normpath` before it is appended.

    :param repo_id: The repository ID.
    :type repo_id: str
    :param filename: The filename inside the repository.
    :type filename: str
    :param repo_type: The type of repository. (default: 'dataset')
    :type repo_type: RepoTypeTyping
    :param revision: The revision of the repository; omitted from the path
        when ``None``. (default: None)
    :type revision: Optional[str]

    :return: The huggingface filesystem path.
    :rtype: str
    """
    normalized_name = hf_normpath(filename)
    type_prefix = (
        'datasets/' if repo_type == 'dataset'
        else 'spaces/' if repo_type == 'space'
        else ''
    )
    revision_suffix = '' if revision is None else f'@{revision}'
    return f'{type_prefix}{repo_id}{revision_suffix}/{normalized_name}'
@lru_cache()
def _irregular_repos() -> Dict[RepoTypeTyping, Set[str]]:
    """
    Load the table of irregular repositories.

    The table is read from ``irregular_repo.json``, located in the same
    directory as this module. The file must provide the keys ``models``,
    ``datasets`` and ``spaces``, each holding a list of repository IDs.

    The result is cached, so the file is read at most once and every call
    returns the same dictionary object; callers should not mutate it.

    :return: A mapping from repository type to its set of irregular repository IDs.
    :rtype: Dict[RepoTypeTyping, Set[str]]
    """
    json_file = os.path.join(os.path.dirname(__file__), 'irregular_repo.json')
    # JSON text is UTF-8; be explicit instead of relying on the locale's
    # default encoding, which differs between platforms.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    return {
        'model': set(data['models']),
        'dataset': set(data['datasets']),
        'space': set(data['spaces']),
    }
# A revision is either one of the multi-segment special refs -- a pull request
# ref such as ``refs/pr/12`` or a conversion ref such as ``refs/convert/parquet``
# -- or a single segment containing neither ``@`` nor ``/``. The special refs
# are listed first so that they win over the single-segment alternative.
_REVISION_PATTERN = r'(?P<revision>refs/pr/\d+|refs/convert/\w+|[^@/]+)'
# Optional trailing ``/<filename>``; the filename may contain any character.
_FILENAME_PATTERN = r'(/(?P<filename>[\s\S]+))?'

# Irregular repositories: the repository ID is a single segment (no namespace).
_RE_IR_PATH = re.compile(
    rf'^(?P<repo_id>[^@/]+)(@{_REVISION_PATTERN})?{_FILENAME_PATTERN}$')
# Regular repositories: the repository ID has the form ``<namespace>/<name>``.
_RE_PATH = re.compile(
    rf'^(?P<repo_id>[^@/]+/[^@/]+)(@{_REVISION_PATTERN})?{_FILENAME_PATTERN}$')
@dataclass
class HfFileSystemPath:
    """
    Structured form of a Hugging Face filesystem path.

    Instances hold the individual components of a path: which repository it
    points into, what kind of repository that is, the revision (if one was
    given) and the file inside the repository. :func:`parse_hf_fs_path`
    returns objects of this class.

    :param repo_id: The repository ID.
    :type repo_id: str
    :param filename: The filename inside the repository. Paths parsed by
        :func:`parse_hf_fs_path` use ``'.'`` when no filename was given.
    :type filename: str
    :param repo_type: The type of repository.
    :type repo_type: RepoTypeTyping
    :param revision: The revision of the repository, or ``None`` when unspecified.
    :type revision: Optional[str]
    """
    repo_id: str
    filename: str
    repo_type: RepoTypeTyping
    revision: Optional[str]
def parse_hf_fs_path(path: str) -> HfFileSystemPath:
    """
    Split a Hugging Face filesystem path into its components.

    A leading ``datasets/`` or ``spaces/`` selects the repository type; without
    either prefix the path is treated as a model path. The remainder is first
    tried as an irregular (single-segment) repository ID, which is accepted only
    if that ID is listed for the repository type in :func:`_irregular_repos`;
    otherwise it is parsed as a regular ``<namespace>/<name>`` repository ID.
    The filename is normalized with :func:`hf_normpath` and is ``'.'`` when the
    path names no file.

    :param path: The path to parse.
    :type path: str

    :return: The parsed huggingface filesystem path.
    :rtype: HfFileSystemPath
    :raises ValueError: If this path is invalid.
    """
    repo_type: RepoTypeTyping = 'model'
    remainder = path
    for type_prefix, prefixed_type in (('datasets/', 'dataset'), ('spaces/', 'space')):
        if path.startswith(type_prefix):
            repo_type = prefixed_type
            remainder = path[len(type_prefix):]
            break

    # The irregular form only counts when the repository is a known irregular one;
    # in every other case fall back to the regular form.
    matched = _RE_IR_PATH.fullmatch(remainder)
    if matched is None or matched.group('repo_id') not in _irregular_repos()[repo_type]:
        matched = _RE_PATH.fullmatch(remainder)

    if matched is None:
        raise ValueError(f'Invalid huggingface filesystem path - {path!r}.')

    return HfFileSystemPath(
        repo_id=matched.group('repo_id'),
        filename=hf_normpath(matched.group('filename') or '.'),
        repo_type=repo_type,
        revision=matched.group('revision') or None,
    )