generate_paper_citations_doi¶

`src.generators.citations.generate_paper_citations_doi` ¶

Generate paper citation counts via OpenAlex and Semantic Scholar (DOI-based).

Reads

assets/data/artifacts.json — paper DOIs (paper_url field)

Writes

_build/paper_citations.json — per-paper citation data

_build/citation_history.json — time-series history (append-only)

Usage::

python -m src.generators.citations.generate_paper_citations_doi --data_dir ../reprodb.github.io/src

`generate(data_dir: str) -> list[dict] | None` ¶

Collect paper citation counts and write results + history.

Source code in src/generators/citations/generate_paper_citations_doi.py

def generate(data_dir: str) -> list[dict] | None:
    """Collect paper citation counts and write results + history.

    Loads ``assets/data/artifacts.json`` under *data_dir*, keeps the first
    artifact seen for each distinct normalized title, and produces one entry
    per paper. An entry is taken from the cache when present; otherwise it is
    built from an OpenAlex lookup (by DOI, or by title when no DOI could be
    extracted from ``paper_url``) plus, for DOI lookups only, a Semantic
    Scholar lookup.

    Semantic Scholar is skipped when the ``DISABLE_SEMANTIC_SCHOLAR``
    environment variable is ``"1"``, and is switched off for the rest of the
    run once ``_S2_MAX_TIMEOUT_FAILURES`` lookups have returned ``None``.

    Args:
        data_dir: Directory containing ``assets/data/artifacts.json``. Output
            is written to its ``_build`` subdirectory.

    Returns:
        The entries written to ``_build/paper_citations.json``, or ``None``
        when ``artifacts.json`` does not exist (nothing is written and no
        directory is created in that case).
    """
    data_path = Path(data_dir)
    artifacts_path = data_path / "assets" / "data" / "artifacts.json"

    # Validate the input before touching the filesystem, so a wrong data_dir
    # does not leave a stray "_build" directory tree behind.
    if not artifacts_path.exists():
        logger.error("artifacts.json not found at %s", artifacts_path)
        return None

    build_dir = data_path / "_build"
    build_dir.mkdir(parents=True, exist_ok=True)
    out_path = build_dir / "paper_citations.json"

    artifacts: list[dict] = load_json(artifacts_path)
    logger.info("Loaded %d artifacts", len(artifacts))

    # Deduplicate by normalized title; the first artifact with a given
    # non-empty normalized title wins, and artifacts whose title normalizes
    # to an empty value are dropped.
    seen_titles: set[str] = set()
    unique: list[dict] = []
    for a in artifacts:
        norm = normalize_title(a.get("title", ""))
        if norm and norm not in seen_titles:
            seen_titles.add(norm)
            unique.append(a)
    total = len(unique)
    logger.info("%d unique papers to process", total)

    session = create_session()

    s2_disabled = os.environ.get("DISABLE_SEMANTIC_SCHOLAR", "").strip() == "1"
    s2_timeout_failures = 0

    entries: list[dict] = []
    cached_count = 0
    fetched_count = 0

    for done, artifact in enumerate(unique, start=1):
        title = artifact.get("title", "")
        norm = normalize_title(title)
        doi = _extract_paper_doi(artifact.get("paper_url"))

        # Try cache first (keyed by DOI or normalized title)
        cache_k = _cache_key(doi) if doi else _cache_key(norm)
        cached = read_cache(CACHE_DIR, cache_k, CACHE_TTL, CACHE_NS)
        if cached is not _MISSING:
            entries.append(cached)
            cached_count += 1
            # Report progress for cache hits too; otherwise a mostly-cached
            # run would never emit a progress line.
            _log_progress(done, total, cached_count, fetched_count)
            continue

        # Lookup via APIs
        openalex_count: int | None = None
        s2_count: int | None = None
        source = ""
        openalex_id = ""

        if doi:
            time.sleep(_OPENALEX_DELAY)
            oa = _openalex_lookup(doi, session)
            if oa:
                openalex_count = oa["cited_by_count"]
                openalex_id = oa["openalex_id"]
                source = "openalex_doi"

            if not s2_disabled:
                time.sleep(_S2_DELAY)
                s2_count = _s2_lookup(doi, session)
                # Every None result counts towards the failure budget.
                # NOTE(review): presumably None can also mean "paper not
                # found" rather than a timeout — confirm against _s2_lookup.
                if s2_count is None and s2_timeout_failures < _S2_MAX_TIMEOUT_FAILURES:
                    s2_timeout_failures += 1
                    if s2_timeout_failures >= _S2_MAX_TIMEOUT_FAILURES:
                        logger.warning("Disabling Semantic Scholar after %d timeout failures", s2_timeout_failures)
                        s2_disabled = True
        else:
            # No DOI — fall back to OpenAlex title search
            time.sleep(_OPENALEX_DELAY)
            oa = _openalex_title_search(title, session)
            if oa:
                openalex_count = oa["cited_by_count"]
                openalex_id = oa["openalex_id"]
                source = "openalex_title"

        cited_by = best_citation_count(openalex_count, s2_count)

        entry = {
            "title": title,
            "conference": artifact.get("conference", ""),
            "year": artifact.get("year", 0),
            "category": artifact.get("category", ""),
            "ae_paper": True,
            "paper_doi": doi,
            "openalex_id": openalex_id,
            "cited_by_count": cited_by,
            "citations_openalex": openalex_count,
            "citations_semantic_scholar": s2_count,
            "source": source,
        }
        entries.append(entry)
        # The entry is cached even when neither API returned a count.
        write_cache(CACHE_DIR, cache_k, entry, CACHE_NS)
        fetched_count += 1
        _log_progress(done, total, cached_count, fetched_count)

    logger.info(
        "Done: %d entries (cached=%d, fetched=%d, with_citations=%d)",
        len(entries),
        cached_count,
        fetched_count,
        sum(1 for e in entries if isinstance(e.get("cited_by_count"), int) and e["cited_by_count"] > 0),
    )

    save_json(out_path, entries)
    logger.info("Wrote %s", out_path)

    # ── History tracking ─────────────────────────────────────────────────
    _update_history(entries, build_dir)

    return entries


def _log_progress(done: int, total: int, cached_count: int, fetched_count: int) -> None:
    """Log a progress line after every 100th processed paper."""
    if done % 100 == 0:
        logger.info("Progress: %d/%d (cached=%d, fetched=%d)", done, total, cached_count, fetched_count)

generate_paper_citations_doi¶

src.generators.citations.generate_paper_citations_doi ¶

generate(data_dir: str) -> list[dict] | None ¶

`src.generators.citations.generate_paper_citations_doi` ¶

`generate(data_dir: str) -> list[dict] | None` ¶