Source code for hfutils.index.validate

from typing import Optional

from huggingface_hub.hf_api import RepoFile
from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError

from .fetch import hf_tar_get_index
from ..operate.base import RepoTypeTyping, get_hf_client


[docs]def hf_tar_item_validate(file_item: RepoFile, size: int, hash_: Optional[str] = None, hash_lfs: Optional[str] = None): """ Validate a file item in a tar archive. This function checks if the file item matches the expected size and hash. :param file_item: The file item from the Hugging Face repository. :type file_item: RepoFile :param size: The expected size of the file. :type size: int :param hash_: The expected SHA-1 hash of the file. :type hash_: str, optional :param hash_lfs: The expected SHA-256 hash of the file if stored in LFS. :type hash_lfs: str, optional :return: True if the file item is valid, False otherwise. :rtype: bool """ # size not match if (file_item.lfs and size != file_item.lfs.size) or \ (not file_item.lfs and size != file_item.size): return False # compare tar file hash item_hashes = [file_item.blob_id] if file_item.lfs: item_hashes.append(file_item.lfs.sha256) item_hashes = set(filter(bool, item_hashes)) cmp_hashes = [hash_, hash_lfs] cmp_hashes = set(filter(bool, cmp_hashes)) return bool(cmp_hashes & item_hashes)
[docs]def hf_tar_validate(repo_id: str, archive_in_repo: str, repo_type: RepoTypeTyping = 'dataset', revision: str = 'main', idx_repo_id: Optional[str] = None, idx_file_in_repo: Optional[str] = None, idx_repo_type: Optional[RepoTypeTyping] = None, idx_revision: Optional[str] = None, hf_token: Optional[str] = None): """ Validate a tar archive in a Hugging Face repository. This function validates if the tar archive in the Hugging Face repository matches the expected size and hash. :param repo_id: The ID of the Hugging Face repository. :type repo_id: str :param archive_in_repo: The path to the tar archive in the repository. :type archive_in_repo: str :param repo_type: The type of the Hugging Face repository, defaults to 'dataset'. :type repo_type: RepoTypeTyping, optional :param revision: The revision of the repository, defaults to 'main'. :type revision: str, optional :param idx_repo_id: The ID of the repository where the index file is stored. :type idx_repo_id: Optional[str], optional :param idx_file_in_repo: The path to the index file in the repository. :type idx_file_in_repo: Optional[str], optional :param idx_repo_type: The type of the repository where the index file is stored. :type idx_repo_type: Optional[RepoTypeTyping], optional :param idx_revision: The revision of the repository where the index file is stored. :type idx_revision: Optional[str], optional :param hf_token: The Hugging Face token for authentication, defaults to None. :type hf_token: Optional[str], optional :raises EntryNotFoundError: If the specified entry is not found in the repository. :raises IsADirectoryError: If the specified entry is a directory. :return: True if the tar archive is valid, False otherwise. :rtype: bool """ hf_client = get_hf_client(hf_token) items = list(hf_client.get_paths_info( repo_id=repo_id, repo_type=repo_type, paths=[archive_in_repo], revision=revision, )) if len(items) == 0: raise EntryNotFoundError(f'Entry {repo_type}s/{repo_id}/{archive_in_repo} not found.') elif not isinstance(items[0], RepoFile): raise IsADirectoryError(f'Entry {repo_type}s/{repo_id}/{archive_in_repo} is a directory, not a file.') else: item = items[0] try: index = hf_tar_get_index( repo_id=repo_id, archive_in_repo=archive_in_repo, repo_type=repo_type, revision=revision, idx_repo_id=idx_repo_id, idx_file_in_repo=idx_file_in_repo, idx_repo_type=idx_repo_type, idx_revision=idx_revision, hf_token=hf_token, ) except (EntryNotFoundError, RepositoryNotFoundError): return False return hf_tar_item_validate( file_item=item, size=index['filesize'], hash_=index.get('hash'), hash_lfs=index.get('hash_lfs'), )