download_dblp¶

`src.utils.apis.download_dblp` ¶

Download the DBLP XML database (~3 GB compressed) for author matching.

Usage::

python -m src.utils.apis.download_dblp            # interactive
python -m src.utils.apis.download_dblp --auto     # non-interactive (CI)

Output: data/dblp/dblp.xml.gz

This replaces the former scripts/download_dblp.sh and removes the dependency on curl.

`download_dblp(*, auto: bool = False) -> bool` ¶

Download or update the DBLP XML file.

Parameters¶

auto : bool If True run non-interactively (skip prompts, download only when missing or outdated).

Returns¶

bool True if the file is present and valid at exit.

Source code in src/utils/apis/download_dblp.py

def download_dblp(*, auto: bool = False) -> bool:
    """Download or update the DBLP XML file.

    An existing file is kept as-is when it is up to date, when the user
    declines the re-download prompt, or (in ``auto`` mode) when the remote
    date could not be checked.  Otherwise the file is replaced by a fresh
    download.  The old copy is removed only after dblp.org has been
    confirmed reachable, so a network failure never destroys it.

    Parameters
    ----------
    auto : bool
        If ``True`` run non-interactively (skip prompts, download only
        when missing or outdated).

    Returns
    -------
    bool
        ``True`` if the file is present and valid at exit.  ``False`` if
        dblp.org could not be reached, or if the downloaded file is
        missing or smaller than ``MIN_SIZE_MB``.
    """
    DBLP_DIR.mkdir(parents=True, exist_ok=True)

    # Set when an existing file has to be removed before downloading.
    replace_existing = False

    if DBLP_FILE.is_file():
        size_mb = DBLP_FILE.stat().st_size >> 20  # bytes -> MiB
        # Tri-state: True (fresh), False (outdated), anything else
        # (remote date could not be determined).
        freshness = _is_up_to_date(DBLP_FILE)

        if freshness is True:
            logger.info("DBLP file is up to date (%d MB)", size_mb)
            return True

        if freshness is False:
            logger.warning("DBLP file is outdated (%d MB)", size_mb)
        else:
            logger.warning("DBLP file exists (%d MB), could not check remote date", size_mb)
            if auto:
                # Unknown freshness: keep what we have rather than
                # re-downloading blindly in non-interactive mode.
                return True

        if not auto:
            try:
                answer = input("Re-download? (y/N): ").strip().lower()
            except EOFError:
                # No stdin available: treat as the default answer "no".
                answer = "n"
            if answer != "y":
                return True

        replace_existing = True

    # Connectivity check -- done *before* deleting the old file so that an
    # unreachable server leaves the existing copy untouched.
    try:
        headers = {"User-Agent": "ReproDB-Pipeline/1.0 (+https://github.com/ReproDB/reprodb-pipeline)"}
        requests.head(DBLP_URL, timeout=10, headers=headers)
    except (requests.ConnectionError, requests.Timeout):
        # requests.Timeout also covers read timeouts, which are not
        # ConnectionError subclasses and would otherwise propagate.
        proxy = os.environ.get("https_proxy", "")
        logger.error("Cannot connect to dblp.org (proxy: %s)", proxy)
        return False

    if replace_existing:
        DBLP_FILE.unlink()

    _download(DBLP_FILE)

    try:
        size_mb = DBLP_FILE.stat().st_size >> 20
    except FileNotFoundError:
        # NOTE(review): _download is defined elsewhere; this guards the case
        # where it returns without having created the file.
        logger.error("Download failed: %s was not created", DBLP_FILE)
        return False

    if size_mb < MIN_SIZE_MB:
        logger.error("File too small (%d MB, expected >= %d MB) — download may be truncated", size_mb, MIN_SIZE_MB)
        return False

    logger.info("Download complete (%d MB)", size_mb)
    return True

download_dblp¶

src.utils.apis.download_dblp ¶

download_dblp(*, auto: bool = False) -> bool ¶

Parameters¶

Returns¶

`src.utils.apis.download_dblp` ¶

`download_dblp(*, auto: bool = False) -> bool` ¶