Source code for hfutils.utils.session

"""
This module provides functionality for creating and managing HTTP sessions with customizable retry logic,
timeout settings, and user-agent rotation using random user-agent generation. It is designed to help with
robust web scraping and API consumption by handling common HTTP errors and timeouts gracefully.

Main Features:

- Automatic retries on specified HTTP response status codes.
- Configurable request timeout.
- Rotating user-agent for each session to mimic different browsers and operating systems.
- Optional SSL verification.
"""

from functools import lru_cache
from typing import Optional, Dict

import requests
from random_user_agent.params import SoftwareName, OperatingSystem
from random_user_agent.user_agent import UserAgent
from requests.adapters import HTTPAdapter, Retry

DEFAULT_TIMEOUT = 15  # seconds


class TimeoutHTTPAdapter(HTTPAdapter):
    """
    HTTPAdapter subclass that supplies a fallback timeout for requests sent without one.

    :param args: Positional arguments forwarded unchanged to ``HTTPAdapter``.
    :param kwargs: Keyword arguments forwarded to ``HTTPAdapter``. An optional ``timeout``
        entry is consumed here and stored as the adapter's fallback timeout; when it is
        absent, ``DEFAULT_TIMEOUT`` is used.
    """

    def __init__(self, *args, **kwargs):
        # ``timeout`` is removed from kwargs before delegating, so the parent
        # constructor never receives it.
        self.timeout = kwargs.pop("timeout", DEFAULT_TIMEOUT)
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        """
        Send the request, filling in the adapter's timeout when the caller gave none.

        :param request: The prepared request to send.
        :type request: requests.PreparedRequest
        :param kwargs: Keyword arguments for the parent ``send``. A missing or ``None``
            ``timeout`` is replaced with this adapter's stored timeout.
        :return: The response returned by the parent adapter.
        """
        # An explicit non-None timeout from the caller always wins.
        if kwargs.get("timeout") is None:
            kwargs["timeout"] = self.timeout
        return super().send(request, **kwargs)


def get_requests_session(max_retries: int = 5, timeout: int = DEFAULT_TIMEOUT, verify: bool = True,
                         headers: Optional[Dict[str, str]] = None,
                         session: Optional[requests.Session] = None) -> requests.Session:
    """
    Build (or reconfigure) a requests session with retries, a default timeout and a random user agent.

    :param max_retries: Value passed as ``total`` to the retry policy.
    :type max_retries: int
    :param timeout: Fallback timeout, in seconds, handed to the mounted adapter.
    :type timeout: int
    :param verify: When falsy, ``session.verify`` is set to ``False``; otherwise the
        session's existing ``verify`` setting is left untouched.
    :type verify: bool
    :param headers: Extra headers merged into the session headers. They are applied after
        the generated ``User-Agent`` and therefore override it when they define one.
    :type headers: Optional[Dict[str, str]]
    :param session: Existing session to configure in place. A new one is created when
        this is not provided.
    :type session: Optional[requests.Session]

    :return: The configured session (the same object as ``session`` when one was given).
    :rtype: requests.Session
    """
    if not session:
        session = requests.session()

    retry_policy = Retry(
        total=max_retries,
        backoff_factor=1,
        status_forcelist=[408, 429, 500, 501, 502, 503, 504, 505, 506, 507, 509, 510, 511],
        allowed_methods=["HEAD", "GET", "POST", "PUT", "DELETE", "OPTIONS", "TRACE"],
    )
    timeout_adapter = TimeoutHTTPAdapter(
        max_retries=retry_policy,
        timeout=timeout,
        pool_connections=32,
        pool_maxsize=32,
    )
    # One adapter instance is shared by both URL schemes.
    for scheme in ('http://', 'https://'):
        session.mount(scheme, timeout_adapter)

    # Seed with the random user agent first so caller-supplied headers take precedence.
    merged_headers = {"User-Agent": get_random_ua()}
    merged_headers.update(headers or {})
    session.headers.update(merged_headers)

    if not verify:
        session.verify = False

    return session


@lru_cache()
def _ua_pool():
    """
    Build the shared UserAgent rotator; the result is cached so it is constructed only once.

    The rotator is restricted to Chrome, Firefox and Edge on Windows and macOS, with
    ``limit=1000`` passed to the ``UserAgent`` constructor.

    :return: The cached UserAgent rotator instance.
    :rtype: UserAgent
    """
    return UserAgent(
        software_names=[
            SoftwareName.CHROME.value,
            SoftwareName.FIREFOX.value,
            SoftwareName.EDGE.value,
        ],
        operating_systems=[
            OperatingSystem.WINDOWS.value,
            OperatingSystem.MACOS.value,
        ],
        limit=1000,
    )


def get_random_ua():
    """
    Pick a random user agent string from the cached UserAgent rotator.

    :return: A user agent string chosen by the rotator.
    :rtype: str
    """
    rotator = _ua_pool()
    return rotator.get_random_user_agent()