Skip to content

generate_ranking_history

src.generators.generate_ranking_history

Maintain ranking history snapshots for authors and institutions.

Each pipeline run appends a timestamped snapshot to
  • assets/data/ranking_history.json (author rankings over time)
  • assets/data/institution_ranking_history.json (institution rankings over time)
The history files are arrays of snapshot objects:

[ { "date": "2026-03", "entries": { "Author Name": {"rank": 1, "score": 60, "as": 0, "aes": 60}, ... } }, ... ]

Only authors/institutions that appear in the current rankings are tracked. The "date" is year-month (YYYY-MM) to give monthly granularity.

Usage

python -m src.generators.generate_ranking_history --data_dir ../reprodb.github.io

generate_ranking_history(data_dir: str, force: bool = False) -> None

Generate/update author and institution ranking history.

Source code in src/generators/generate_ranking_history.py
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
def generate_ranking_history(data_dir: str, force: bool = False) -> None:
    """Generate/update author and institution ranking history.

    Appends a snapshot keyed by the current year-month to

      * ``assets/data/ranking_history.json`` (author rankings)
      * ``assets/data/institution_ranking_history.json`` (institution rankings)

    under ``data_dir``. If a snapshot for the current month already exists
    it is left untouched unless ``force`` is True.

    Args:
        data_dir: Root directory of the site repository (the directory that
            contains ``assets/data``).
        force: Overwrite this month's snapshot even if one already exists.
    """
    date = _snapshot_date()

    # ── Author rankings ──────────────────────────────────────────────────
    cr_path = os.path.join(data_dir, "assets/data/combined_rankings.json")
    author_hist_path = os.path.join(data_dir, "assets/data/ranking_history.json")

    rankings = _load_json(cr_path)
    author_history: list = _load_json(author_hist_path)  # type: ignore[assignment]

    if _has_snapshot(author_history, date) and not force:
        logger.warning(
            f"  Author ranking history: snapshot for {date} already exists, skipping (use --force to overwrite)"
        )
    else:
        author_entries = _build_author_entries(rankings)
        author_history = _update_history(author_history, author_entries, date)
        _write_history(author_hist_path, author_history)
        logger.info(
            f"  Author ranking history: {len(author_history)} snapshots, {len(author_entries)} entries for {date}"
        )
        logger.info(f"  Wrote {author_hist_path} ({os.path.getsize(author_hist_path) / 1024:.0f}KB)")

    # ── Institution rankings ─────────────────────────────────────────────
    ir_path = os.path.join(data_dir, "assets/data/institution_rankings.json")
    inst_hist_path = os.path.join(data_dir, "assets/data/institution_ranking_history.json")

    inst_rankings = _load_json(ir_path)
    inst_history: list = _load_json(inst_hist_path)  # type: ignore[assignment]

    if _has_snapshot(inst_history, date) and not force:
        logger.warning(
            f"  Institution ranking history: snapshot for {date} already exists, skipping (use --force to overwrite)"
        )
    else:
        inst_entries = _build_institution_entries(inst_rankings)
        inst_history = _update_history(inst_history, inst_entries, date)
        _write_history(inst_hist_path, inst_history)
        logger.info(
            f"  Institution ranking history: {len(inst_history)} snapshots, {len(inst_entries)} entries for {date}"
        )
        logger.info(f"  Wrote {inst_hist_path} ({os.path.getsize(inst_hist_path) / 1024:.0f}KB)")


def _build_author_entries(rankings: list) -> dict:
    """Map author name -> compact per-snapshot stats; unnamed rows are skipped.

    Stats are read straight from the combined-rankings rows (defaulting to 0),
    including the author's precomputed rank and repro rate.
    """
    entries: dict = {}
    for r in rankings:
        name = r.get("name", "")
        if not name:
            continue
        entries[name] = {
            "rank": r.get("rank", 0),
            "score": r.get("combined_score", 0),
            "as": r.get("artifact_score", 0),
            "aes": r.get("ae_score", 0),
            "tp": r.get("total_papers", 0),
            "ta": r.get("artifacts", 0),
            "ar": r.get("artifact_rate", 0),
            "rr": r.get("repro_rate", 0),
        }
    return entries


def _build_institution_entries(inst_rankings: list) -> dict:
    """Map affiliation -> compact per-snapshot stats; unnamed rows are skipped.

    Unlike authors, institutions carry no precomputed rank or repro rate:
    rank is the 1-based position in the (already sorted) rankings list, and
    repro rate is derived from reproducible badges over total artifacts
    (0 when the institution has no artifacts, avoiding division by zero).
    """
    entries: dict = {}
    for idx, r in enumerate(inst_rankings):
        name = r.get("affiliation", "")
        if not name:
            continue
        inst_rr = 0
        if r.get("artifacts", 0) > 0:
            inst_rr = round((r.get("badges_reproducible", 0) / r["artifacts"]) * 100, 1)
        entries[name] = {
            "rank": idx + 1,
            "score": r.get("combined_score", 0),
            "as": r.get("artifact_score", 0),
            "aes": r.get("ae_score", 0),
            "tp": r.get("total_papers", 0),
            "ta": r.get("artifacts", 0),
            "ar": r.get("artifact_rate", 0),
            "rr": inst_rr,
            "r": r.get("num_authors", 0),
        }
    return entries


def _write_history(path: str, history: list) -> None:
    """Write *history* as compact JSON.

    ``encoding="utf-8"`` is required: ``ensure_ascii=False`` emits raw
    non-ASCII names, which would fail (or be mangled) under a non-UTF-8
    locale default encoding.
    """
    with open(path, "w", encoding="utf-8") as f:
        json.dump(history, f, ensure_ascii=False, separators=(",", ":"))