sys_sec_scrape

src.scrapers.sys_sec_scrape

check_url_cached(url: str, ttl: int = CACHE_TTL_URL) -> bool

Check if a URL exists, with disk caching.

Returns True or False. Positive results are cached for ttl seconds; negative results are cached for CACHE_TTL_URL_NEG (shorter), so non-existent URLs are re-checked periodically without hammering the target host on every run.

Source code in src/scrapers/sys_sec_scrape.py
def check_url_cached(url: str, ttl: int = CACHE_TTL_URL) -> bool:
    """Check if a URL exists, with disk caching.

    Returns True/False.  Positive results are cached for ``ttl`` seconds;
    negative results are cached for ``CACHE_TTL_URL_NEG`` (shorter) so they
    are re-checked periodically without hammering the target host on every run.
    """
    cached = _read_cache(CACHE_DIR, url, ttl=ttl, namespace="url_exists")
    if cached is True:
        return True  # positive hit – trust it
    # Check negative cache (shorter TTL)
    cached_neg = _read_cache(CACHE_DIR, url, ttl=CACHE_TTL_URL_NEG, namespace="url_exists")
    if cached_neg is False:
        return False  # recently confirmed non-existent

    try:
        resp = _session.head(url, allow_redirects=True, timeout=10)
        if resp.status_code == 429:
            time.sleep(10)
            resp = _session.head(url, allow_redirects=True, timeout=10)
        exists = 200 <= resp.status_code < 300
    except requests.RequestException as e:
        logger.error(f"  Request error for {url}: {e}")
        # Network errors (timeouts, DNS failures, connection refused) are
        # transient — do NOT cache them as negative results.
        return False

    _write_cache(CACHE_DIR, url, exists, namespace="url_exists")
    return exists

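The cache helpers used above (_read_cache, _write_cache, and the _MISSING sentinel) are module internals not shown on this page. A minimal sketch of a compatible implementation, assuming JSON files keyed by a hash of the URL under one subdirectory per namespace; the names and on-disk layout here are illustrative, not the module's actual code:

import hashlib
import json
import time
from pathlib import Path
from typing import Any

_MISSING = object()  # sentinel: distinguishes "no entry" from a cached None/False

def _cache_path(cache_dir: Path, key: str, namespace: str) -> Path:
    digest = hashlib.sha256(key.encode()).hexdigest()
    return Path(cache_dir) / namespace / f"{digest}.json"

def _read_cache(cache_dir: Path, key: str, ttl: int, namespace: str) -> Any:
    path = _cache_path(cache_dir, key, namespace)
    if not path.exists():
        return _MISSING
    entry = json.loads(path.read_text())
    if time.time() - entry["ts"] > ttl:
        return _MISSING  # entry exists but is stale for this TTL
    return entry["body"]

def _write_cache(cache_dir: Path, key: str, body: Any, namespace: str, etag: str | None = None) -> None:
    path = _cache_path(cache_dir, key, namespace)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps({"ts": time.time(), "body": body, "etag": etag}))

Under this layout, _read_cache_entry would return the whole decoded entry (body, etag, timestamp) and _refresh_cache_ts would rewrite only the ts field, which is what the 304 path in cached_github_stats below relies on.
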
cached_github_stats(url: str, ttl: int = CACHE_TTL_STATS) -> dict[str, Any] | None

Fetch GitHub repo stats with caching, ETags, and rate-limit handling.

Uses conditional requests (If-None-Match) so that 304 responses do NOT count against the GitHub API rate limit. This effectively makes re-runs free for repos whose data hasn't changed.

Source code in src/scrapers/sys_sec_scrape.py
def cached_github_stats(url: str, ttl: int = CACHE_TTL_STATS) -> dict[str, Any] | None:
    """Fetch GitHub repo stats with caching, ETags, and rate-limit handling.

    Uses conditional requests (If-None-Match) so that 304 responses do NOT
    count against the GitHub API rate limit.  This effectively makes re-runs
    free for repos whose data hasn't changed.
    """
    cached = _read_cache(CACHE_DIR, url, ttl=ttl, namespace="github_stats")
    if cached is not _MISSING:
        return cached  # dict or None — still fresh

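    # Normalize deep links, e.g. "https://github.com/owner/repo/tree/main/docs"
    # or ".../owner/repo/issues/42", down to the bare "owner/repo" slug.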
    repo = url.split("github.com/")[1]
    for suffix in ("/tree/", "/blob/", "/pkgs/", "/releases/", "/wiki", "/issues", "/pull/", "/commit/"):
        if suffix in repo:
            repo = repo.split(suffix)[0]
    # Keep only owner/repo (first two path segments)
    parts = repo.strip("/").split("/")
    repo = "/".join(parts[:2]).removesuffix(".git")

    headers = _github_headers()

    # Use stored ETag for conditional request (304 = free, no rate cost)
    entry = _read_cache_entry(CACHE_DIR, url, namespace="github_stats")
    if entry and entry.get("etag"):
        headers["If-None-Match"] = entry["etag"]

    try:
        resp = _session.get(f"https://api.github.com/repos/{repo}", headers=headers, timeout=_session.default_timeout)
        if resp.status_code == 403 and "rate limit" in resp.text.lower():
            reset_time = int(resp.headers.get("X-RateLimit-Reset", 0))
            wait = max(reset_time - int(time.time()), 0) + 5
            logger.info(f"  Rate limited. Waiting {wait}s for reset...")
            time.sleep(wait)
            resp = _session.get(
                f"https://api.github.com/repos/{repo}", headers=headers, timeout=_session.default_timeout
            )

        if resp.status_code == 304 and entry:
            # Data unchanged — refresh timestamp, return cached data (free!)
            _refresh_cache_ts(CACHE_DIR, url, namespace="github_stats")
            return entry.get("body")
        if resp.status_code == 200:
            d = resp.json()
            result = {
                "github_forks": d.get("forks_count", 0),
                "github_stars": d.get("stargazers_count", 0),
                "updated_at": d.get("updated_at", "NA"),
                "created_at": d.get("created_at", "NA"),
                "pushed_at": d.get("pushed_at", "NA"),
                "name": d.get("full_name", "NA"),
                "description": d.get("description", ""),
                "language": d.get("language", ""),
                "license": (d.get("license") or {}).get("spdx_id", ""),
                "topics": d.get("topics", []),
            }
            etag = resp.headers.get("ETag")
            _write_cache(CACHE_DIR, url, result, namespace="github_stats", etag=etag)
            return result
        logger.warning(f"  Could not collect GitHub stats for {url} (HTTP {resp.status_code})")
    except requests.RequestException as e:
        logger.warning(f"  GitHub request error for {url}: {e}")
    result = None
    _write_cache(CACHE_DIR, url, result, namespace="github_stats")
    return result

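The rate-limit claim in the docstring can be checked in isolation. A standalone sketch using plain requests (python/cpython is just an example repo); per GitHub's documentation, a conditional request answered with 304 is not charged against the rate limit:

import requests

url = "https://api.github.com/repos/python/cpython"

# First request: full 200 response, costs one rate-limit unit.
r1 = requests.get(url, timeout=10)
etag = r1.headers["ETag"]

# Replay with If-None-Match: 304 Not Modified when nothing changed,
# and X-RateLimit-Remaining should not have decreased.
r2 = requests.get(url, headers={"If-None-Match": etag}, timeout=10)
print(r2.status_code)  # 304 if the repo metadata is unchanged
print(r1.headers["X-RateLimit-Remaining"], r2.headers["X-RateLimit-Remaining"])
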
cached_zenodo_stats(url: str, ttl: int = CACHE_TTL_STATS) -> dict[str, Any] | None

Fetch Zenodo record stats with caching and 429 retry.

Source code in src/scrapers/sys_sec_scrape.py
def cached_zenodo_stats(url: str, ttl: int = CACHE_TTL_STATS) -> dict[str, Any] | None:
    """Fetch Zenodo record stats with caching and 429 retry."""
    cached = _read_cache(CACHE_DIR, url, ttl=ttl, namespace="zenodo_stats")
    if cached is not _MISSING:
        return cached

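    # Extract the numeric record ID from any of the supported shapes:
    # "https://zenodo.org/records/1234567", the legacy "/record/" form,
    # or a DOI such as "10.5281/zenodo.1234567".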
    if "/records/" in url:
        rec = url.split("/records/")[-1]
    elif "/record/" in url:
        rec = url.split("/record/")[-1]
    elif "zenodo." in url:
        rec = url.split("zenodo.")[-1]
    else:
        logger.info(f"  Could not parse Zenodo URL {url}")
        return None

    # Strip fragments (#...) and query strings (?...)
    rec = rec.split("#")[0].split("?")[0].strip("/")

    result = None
    try:
        for attempt in range(4):  # up to 3 retries on 429
            resp = _session.get(f"https://zenodo.org/api/records/{rec}", timeout=_session.default_timeout)
            if resp.status_code == 200:
                record = resp.json()
                stats = record.get("stats", {})
                result = {
                    "zenodo_views": stats.get("unique_views", 0),
                    "zenodo_downloads": stats.get("unique_downloads", 0),
                    "updated_at": record.get("updated", ""),
                    "created_at": record.get("created", ""),
                }
                break
            if resp.status_code == 429:
                if attempt == 3:
                    break  # out of retries; sleeping again would be wasted
                retry_after = int(resp.headers.get("Retry-After", 0))
                wait = max(retry_after, 2 ** (attempt + 1))  # exponential backoff: 2, 4, 8s
                logger.info(f"  Zenodo 429 for {url}, waiting {wait}s (attempt {attempt + 1}/4)")
                time.sleep(wait)
            else:
                logger.info(f"  Could not collect Zenodo stats for {url} (HTTP {resp.status_code})")
                break
    except requests.RequestException as e:
        logger.error(f"  Zenodo request error for {url}: {e}")

    _write_cache(CACHE_DIR, url, result, namespace="zenodo_stats")
    return result

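Usage is the same for every supported URL shape; the record ID below is illustrative, not a real record:

for url in (
    "https://zenodo.org/records/1234567",
    "https://zenodo.org/record/1234567",
    "https://doi.org/10.5281/zenodo.1234567",
):
    stats = cached_zenodo_stats(url)
    if stats is not None:
        print(stats["zenodo_views"], stats["zenodo_downloads"])
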
cached_figshare_stats(url: str, ttl: int = CACHE_TTL_STATS) -> dict[str, Any]

Fetch Figshare article stats with caching.

Source code in src/scrapers/sys_sec_scrape.py
def cached_figshare_stats(url: str, ttl: int = CACHE_TTL_STATS) -> dict[str, Any]:
    """Fetch Figshare article stats with caching."""
    cached = _read_cache(CACHE_DIR, url, ttl=ttl, namespace="figshare_stats")
    if cached is not _MISSING:
        return cached

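    # The stats endpoints want a bare article ID, so drop a trailing
    # version suffix (".v1" .. ".v9") before extracting it.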
    clean = url
    if clean.endswith((".v1", ".v2", ".v3", ".v4", ".v5", ".v6", ".v7", ".v8", ".v9")):
        clean = clean[:-3]
    article_id = clean.split("figshare.")[-1]

    views = downloads = -1
    updated = created = "NA"
    try:
        r = _session.get(
            f"https://stats.figshare.com/total/views/article/{article_id}", timeout=_session.default_timeout
        )
        if r.status_code == 200:
            views = r.json().get("totals", -1)
        r = _session.get(
            f"https://stats.figshare.com/total/downloads/article/{article_id}", timeout=_session.default_timeout
        )
        if r.status_code == 200:
            downloads = r.json().get("totals", -1)
        r = _session.get(f"https://api.figshare.com/v2/articles/{article_id}", timeout=_session.default_timeout)
        if r.status_code == 200:
            d = r.json()
            updated = d.get("modified_date", "NA")
            created = d.get("created_date", "NA")
    except requests.RequestException as e:
        logger.error(f"  Figshare request error for {url}: {e}")

    result = {
        "figshare_views": views,
        "figshare_downloads": downloads,
        "updated_at": updated,
        "created_at": created,
    }
    _write_cache(CACHE_DIR, url, result, namespace="figshare_stats")
    return result
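
A usage sketch; the DOI is illustrative, not a real article:

stats = cached_figshare_stats("10.6084/m9.figshare.12345678.v2")
print(stats["figshare_views"], stats["figshare_downloads"])
# On request failure the fields hold the sentinels -1 / "NA" rather than None,
# so callers can treat the return value as always being a dict.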