repo_utils¶

`src.scrapers.repo_utils` ¶

`check_url_cached(url: str, ttl: int = CACHE_TTL_URL) -> bool` ¶

Check if a URL exists, with disk caching.

Returns True/False. Positive results are cached for ttl seconds; negative results are cached for CACHE_TTL_URL_NEG (a shorter TTL), so they are re-checked periodically without hammering the remote host on every run.

Source code in src/scrapers/repo_utils.py

def check_url_cached(url: str, ttl: int = CACHE_TTL_URL) -> bool:
    """Report whether ``url`` answers a HEAD request with a 2xx status.

    Results are memoised on disk under the ``url_exists`` namespace.  A
    cached ``True`` is honoured for ``ttl`` seconds, while a cached
    ``False`` is only honoured for ``CACHE_TTL_URL_NEG`` seconds (a shorter
    window), so dead links get re-probed from time to time without being
    requested on every run.

    Non-HTTP(S) URLs and URLs whose host is in the known-dead host list are
    reported as ``False`` without touching the cache or the network.
    """
    # Only http(s) URLs can be probed at all.
    is_http = url.startswith(("http://", "https://"))
    if not is_http:
        return False

    # Known-dead hosts are rejected up front to avoid expensive DNS/timeout
    # retries.  This lookup is best-effort: any failure just means the host
    # is not treated as dead.
    try:
        hostname = urlparse(url).hostname
        known_dead = bool(hostname) and hostname.lower() in _load_known_dead_hosts()
    except Exception:
        known_dead = False
    if known_dead:
        return False

    # Positive hit within the caller's TTL: trust it.
    if _read_cache(CACHE_DIR, url, ttl=ttl, namespace="url_exists") is True:
        return True
    # Negative hit within the shorter TTL: recently confirmed non-existent.
    if _read_cache(CACHE_DIR, url, ttl=CACHE_TTL_URL_NEG, namespace="url_exists") is False:
        return False

    try:
        response = _session.head(url, allow_redirects=True, timeout=10)
        if response.status_code == 429:
            # Rate limited: pause once, then retry a single time.
            time.sleep(10)
            response = _session.head(url, allow_redirects=True, timeout=10)
    except requests.RequestException as exc:
        logger.error(f"  Request error for {url}: {exc}")
        if isinstance(exc, requests.exceptions.ConnectionError):
            # DNS failures and connection refused are effectively permanent
            # for old artifact URLs, so remember them as negative.  Other
            # request errors are deliberately left uncached.
            _write_cache(CACHE_DIR, url, False, namespace="url_exists")
        return False
    else:
        status = response.status_code
        found = 200 <= status < 300
        _write_cache(CACHE_DIR, url, found, namespace="url_exists")
        return found

`cached_github_stats(url: str, ttl: int = CACHE_TTL_STATS) -> dict[str, Any]` ¶

Fetch GitHub repo stats with caching, ETags, and rate-limit handling.

Uses conditional requests (If-None-Match) so that 304 responses do NOT count against the GitHub API rate limit. This effectively makes re-runs free for repos whose data hasn't changed.

Source code in src/scrapers/repo_utils.py

def cached_github_stats(url: str, ttl: int = CACHE_TTL_STATS) -> dict[str, Any]:
    """Fetch GitHub repo stats with caching, ETags, and rate-limit handling.

    Uses conditional requests (If-None-Match) so that 304 responses do NOT
    count against the GitHub API rate limit.  This effectively makes re-runs
    free for repos whose data hasn't changed.

    Args:
        url: Any URL containing ``github.com/<owner>/<repo>``; deeper paths
            (``/tree/...``, ``/issues/...``), a ``.git`` suffix, query
            strings and fragments are ignored.
        ttl: Maximum age in seconds of a cached entry that is returned
            without contacting the API.

    Returns:
        A dict with the keys ``github_forks``, ``github_stars``,
        ``updated_at``, ``created_at``, ``pushed_at``, ``name``,
        ``description``, ``language``, ``license`` and ``topics``; or
        ``None`` when the URL cannot be parsed or the stats could not be
        collected.  Lookup failures are cached as ``None`` as well.
    """
    cached = _read_cache(CACHE_DIR, url, ttl=ttl, namespace="github_stats")
    if cached is not _MISSING:
        return cached  # dict or None — still fresh

    # Derive "owner/repo" from the first two path segments after the host.
    # Cutting the path at markers such as "/wiki" or "/issues" would also
    # truncate repositories whose *name* starts with those words
    # (e.g. "owner/wiki-tools"), so only the segment position is used.
    _, marker, tail = url.partition("github.com/")
    path = tail.split("?", 1)[0].split("#", 1)[0]  # drop query / fragment
    parts = path.strip("/").split("/")
    if not marker or len(parts) < 2 or not all(parts[:2]):
        # Not a repository URL: nothing to query, and parsing is cheap
        # enough that the outcome does not need to be cached.
        logger.warning(f"  Could not parse GitHub URL {url}")
        return None
    repo = "/".join(parts[:2]).removesuffix(".git")

    headers = _github_headers()

    # Use stored ETag for conditional request (304 = free, no rate cost)
    entry = _read_cache_entry(CACHE_DIR, url, namespace="github_stats")
    if entry and entry.get("etag"):
        headers["If-None-Match"] = entry["etag"]

    try:
        resp = _session.get(f"https://api.github.com/repos/{repo}", headers=headers, timeout=_session.default_timeout)
        if resp.status_code == 403 and "rate limit" in resp.text.lower():
            # Sleep until the advertised reset time (plus a small margin),
            # then retry exactly once.
            reset_time = int(resp.headers.get("X-RateLimit-Reset", 0))
            wait = max(reset_time - int(time.time()), 0) + 5
            logger.info(f"  Rate limited. Waiting {wait}s for reset...")
            time.sleep(wait)
            resp = _session.get(
                f"https://api.github.com/repos/{repo}", headers=headers, timeout=_session.default_timeout
            )

        if resp.status_code == 304 and entry:
            # Data unchanged — refresh timestamp, return cached data (free!)
            _refresh_cache_ts(CACHE_DIR, url, namespace="github_stats")
            return entry.get("body")
        if resp.status_code == 200:
            d = resp.json()
            result = {
                "github_forks": d.get("forks_count", 0),
                "github_stars": d.get("stargazers_count", 0),
                "updated_at": d.get("updated_at", "NA"),
                "created_at": d.get("created_at", "NA"),
                "pushed_at": d.get("pushed_at", "NA"),
                "name": d.get("full_name", "NA"),
                # These fields may be present but null in the API response,
                # in which case a plain .get() default would not apply.
                "description": d.get("description") or "",
                "language": d.get("language") or "",
                "license": (d.get("license") or {}).get("spdx_id", ""),
                "topics": d.get("topics", []),
            }
            etag = resp.headers.get("ETag")
            _write_cache(CACHE_DIR, url, result, namespace="github_stats", etag=etag)
            return result
        logger.warning(f"  Could not collect GitHub stats for {url} (HTTP {resp.status_code})")
    except requests.RequestException as e:
        logger.warning(f"  GitHub request error for {url}: {e}")
    # Any failure past this point is cached as "no stats available".
    result = None
    _write_cache(CACHE_DIR, url, result, namespace="github_stats")
    return result

`cached_zenodo_stats(url: str, ttl: int = CACHE_TTL_STATS) -> dict[str, Any]` ¶

Fetch Zenodo record stats with caching and 429 retry.

Source code in src/scrapers/repo_utils.py

def cached_zenodo_stats(url: str, ttl: int = CACHE_TTL_STATS) -> dict[str, Any]:
    """Fetch Zenodo record stats with caching and 429 retry.

    Args:
        url: Zenodo URL; the record ID is obtained via
            ``_resolve_zenodo_record_id``.
        ttl: Maximum age in seconds of a cached entry that is returned
            without contacting Zenodo.

    Returns:
        A dict with the keys ``zenodo_views``, ``zenodo_downloads``,
        ``updated_at``, ``created_at`` and ``linked_github_urls``; or
        ``None`` when the URL cannot be parsed or the record could not be
        fetched.  Fetch failures are cached as ``None``; an unparseable URL
        is not cached.
    """
    cached = _read_cache(CACHE_DIR, url, ttl=ttl, namespace="zenodo_stats")
    if cached is not _MISSING:
        # Stale entries cached before linked_github_urls extraction was
        # added lack the key entirely.  Force a re-fetch so we discover
        # GitHub repos linked from Zenodo metadata.
        if isinstance(cached, dict) and "linked_github_urls" not in cached:
            pass  # fall through to re-fetch
        else:
            return cached

    rec = _resolve_zenodo_record_id(url)
    if rec is None:
        logger.info(f"  Could not parse Zenodo URL {url}")
        return None

    result = None
    try:
        # At most 4 requests in total; 429 retries and DOI re-resolution
        # both draw from this budget.
        for attempt in range(4):
            resp = _session.get(f"https://zenodo.org/api/records/{rec}", timeout=_session.default_timeout)
            if resp.status_code == 200:
                record = resp.json()
                # "stats" may be absent or null; treat both as "no numbers".
                stats = record.get("stats") or {}
                result = {
                    "zenodo_views": stats.get("unique_views", 0),
                    "zenodo_downloads": stats.get("unique_downloads", 0),
                    "updated_at": record.get("updated", ""),
                    "created_at": record.get("created", ""),
                }
                # Extract linked GitHub URLs from related_identifiers.
                # Always store the key (even empty) so the cache can
                # distinguish "checked, no links" from "never checked".
                result["linked_github_urls"] = _extract_github_urls_from_zenodo(record)
                break
            if resp.status_code == 429:
                # Retry-After may legally be an HTTP-date (or be malformed)
                # rather than a number of seconds; in that case rely on the
                # exponential backoff alone instead of raising ValueError.
                try:
                    retry_after = int(resp.headers.get("Retry-After", 0))
                except (TypeError, ValueError):
                    retry_after = 0
                wait = max(retry_after, 2 ** (attempt + 1))  # exponential backoff: 2, 4, 8, 16s
                logger.info(f"  Zenodo 429 for {url}, waiting {wait}s (attempt {attempt + 1}/4)")
                time.sleep(wait)
            elif resp.status_code in (404, 410):
                # Concept DOI or superseded version — resolve via redirect
                resolved = _resolve_zenodo_doi(url)
                if resolved and resolved != rec:
                    logger.info(f"  Zenodo {resp.status_code} for record {rec}, resolved DOI to {resolved}")
                    rec = resolved
                    continue  # retry with the resolved ID
                logger.info(f"  Could not collect Zenodo stats for {url} (HTTP {resp.status_code})")
                break
            else:
                logger.info(f"  Could not collect Zenodo stats for {url} (HTTP {resp.status_code})")
                break
    except requests.RequestException as e:
        logger.error(f"  Zenodo request error for {url}: {e}")

    # Success and failure (None) are both cached.
    _write_cache(CACHE_DIR, url, result, namespace="zenodo_stats")
    return result

`cached_figshare_stats(url, ttl=CACHE_TTL_STATS)` ¶

Fetch Figshare article stats with caching.

Source code in src/scrapers/repo_utils.py

def cached_figshare_stats(url, ttl=CACHE_TTL_STATS):
    """Fetch Figshare article stats with caching.

    Args:
        url: Figshare reference ending in ``figshare.<article_id>``,
            optionally followed by a version suffix such as ``.v2`` or
            ``.v12``.
        ttl: Maximum age in seconds of a cached entry that is returned
            without contacting Figshare.

    Returns:
        A dict with the keys ``figshare_views``, ``figshare_downloads``,
        ``updated_at`` and ``created_at``.  Counters that could not be
        fetched are ``-1`` and dates that could not be fetched are ``"NA"``.
        ``linked_github_urls`` is included only when at least one linked
        GitHub URL was found.  The result is cached even when some or all
        lookups failed.
    """
    cached = _read_cache(CACHE_DIR, url, ttl=ttl, namespace="figshare_stats")
    if cached is not _MISSING:
        return cached

    # Strip a trailing version suffix of any length (".v3", ".v10", ...);
    # the stats and article endpoints are queried with the bare article ID.
    clean = url
    base, sep, version = url.rpartition(".v")
    if sep and version.isascii() and version.isdigit():
        clean = base
    article_id = clean.split("figshare.")[-1]

    # Placeholders reported for anything that cannot be fetched.
    views = downloads = -1
    updated = created = "NA"
    linked: list[str] = []
    try:
        r = _session.get(
            f"https://stats.figshare.com/total/views/article/{article_id}", timeout=_session.default_timeout
        )
        if r.status_code == 200:
            views = r.json().get("totals", -1)
        r = _session.get(
            f"https://stats.figshare.com/total/downloads/article/{article_id}", timeout=_session.default_timeout
        )
        if r.status_code == 200:
            downloads = r.json().get("totals", -1)
        r = _session.get(f"https://api.figshare.com/v2/articles/{article_id}", timeout=_session.default_timeout)
        if r.status_code == 200:
            d = r.json()
            updated = d.get("modified_date", "NA")
            created = d.get("created_date", "NA")
            # Extract linked GitHub URLs from references/related_materials
            linked = _extract_github_urls_from_figshare(d)
    except requests.RequestException as e:
        # A request error aborts the remaining lookups; values gathered so
        # far are kept and the rest stay at their placeholders.
        logger.error(f"  Figshare request error for {url}: {e}")

    result = {
        "figshare_views": views,
        "figshare_downloads": downloads,
        "updated_at": updated,
        "created_at": created,
    }
    if linked:
        result["linked_github_urls"] = linked
    _write_cache(CACHE_DIR, url, result, namespace="figshare_stats")
    return result

repo_utils¶

src.scrapers.repo_utils ¶

check_url_cached(url: str, ttl: int = CACHE_TTL_URL) -> bool ¶

cached_github_stats(url: str, ttl: int = CACHE_TTL_STATS) -> dict[str, Any] ¶

cached_zenodo_stats(url: str, ttl: int = CACHE_TTL_STATS) -> dict[str, Any] ¶

cached_figshare_stats(url, ttl=CACHE_TTL_STATS) ¶

`src.scrapers.repo_utils` ¶

`check_url_cached(url: str, ttl: int = CACHE_TTL_URL) -> bool` ¶

`cached_github_stats(url: str, ttl: int = CACHE_TTL_STATS) -> dict[str, Any]` ¶

`cached_zenodo_stats(url: str, ttl: int = CACHE_TTL_STATS) -> dict[str, Any]` ¶

`cached_figshare_stats(url, ttl=CACHE_TTL_STATS)` ¶