Skip to content

generate_ranking_history

src.generators.generate_ranking_history

Maintain ranking history snapshots for authors and institutions.

Each pipeline run appends a timestamped snapshot to
  • assets/data/ranking_history.json (author rankings over time)
  • assets/data/institution_ranking_history.json (institution rankings over time)
The history files are arrays of snapshot objects:

[ { "date": "2026-03", "entries": { "Author Name": {"rank": 1, "score": 60, "as": 0, "aes": 60}, ... } }, ... ]

Only authors/institutions that appear in the current rankings are tracked. The "date" is year-month (YYYY-MM) to give monthly granularity.

Usage

python -m src.generators.generate_ranking_history --data_dir ../reprodb.github.io

generate_ranking_history(data_dir: str, force: bool = False) -> None

Generate/update author and institution ranking history.

Source code in src/generators/generate_ranking_history.py
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
def generate_ranking_history(data_dir: str, force: bool = False) -> None:
    """Generate/update author and institution ranking history.

    Appends a snapshot keyed by the current year-month to

      * ``assets/data/ranking_history.json`` (author rankings)
      * ``assets/data/institution_ranking_history.json`` (institution rankings)

    under ``data_dir``. If a snapshot for the current month already exists
    it is left untouched unless ``force`` is True.

    Args:
        data_dir: Root directory of the site repository (the directory that
            contains ``assets/data``).
        force: Overwrite this month's snapshot even if one already exists.
    """
    date = _snapshot_date()

    # ── Author rankings ──────────────────────────────────────────────────
    cr_path = os.path.join(data_dir, "assets/data/combined_rankings.json")
    author_hist_path = os.path.join(data_dir, "assets/data/ranking_history.json")

    rankings = _load_json(cr_path)
    author_history: list = _load_json(author_hist_path)  # type: ignore[assignment]

    if _has_snapshot(author_history, date) and not force:
        logger.warning(
            f"  Author ranking history: snapshot for {date} already exists, skipping (use --force to overwrite)"
        )
    else:
        author_entries = _build_author_entries(rankings)
        author_history = _update_history(author_history, author_entries, date)
        _write_history(author_hist_path, author_history)
        logger.info(
            f"  Author ranking history: {len(author_history)} snapshots, {len(author_entries)} entries for {date}"
        )
        logger.info(f"  Wrote {author_hist_path} ({os.path.getsize(author_hist_path) / 1024:.0f}KB)")

    # ── Institution rankings ─────────────────────────────────────────────
    ir_path = os.path.join(data_dir, "assets/data/institution_rankings.json")
    inst_hist_path = os.path.join(data_dir, "assets/data/institution_ranking_history.json")

    inst_rankings = _load_json(ir_path)
    inst_history: list = _load_json(inst_hist_path)  # type: ignore[assignment]

    if _has_snapshot(inst_history, date) and not force:
        logger.warning(
            f"  Institution ranking history: snapshot for {date} already exists, skipping (use --force to overwrite)"
        )
    else:
        inst_entries = _build_institution_entries(inst_rankings)
        inst_history = _update_history(inst_history, inst_entries, date)
        _write_history(inst_hist_path, inst_history)
        logger.info(
            f"  Institution ranking history: {len(inst_history)} snapshots, {len(inst_entries)} entries for {date}"
        )
        logger.info(f"  Wrote {inst_hist_path} ({os.path.getsize(inst_hist_path) / 1024:.0f}KB)")


def _build_author_entries(rankings: list) -> dict:
    """Map author name -> compact per-snapshot stats; unnamed rows are skipped.

    Stats are read straight from the combined-rankings rows (defaulting to 0),
    including the author's precomputed rank and repro rate.
    """
    entries: dict = {}
    for r in rankings:
        name = r.get("name", "")
        if not name:
            continue
        entries[name] = {
            "rank": r.get("rank", 0),
            "score": r.get("combined_score", 0),
            "as": r.get("artifact_score", 0),
            "aes": r.get("ae_score", 0),
            "tp": r.get("total_papers", 0),
            "ta": r.get("artifacts", 0),
            "ar": r.get("artifact_rate", 0),
            "rr": r.get("repro_rate", 0),
        }
    return entries


def _build_institution_entries(inst_rankings: list) -> dict:
    """Map affiliation -> compact per-snapshot stats; unnamed rows are skipped.

    Unlike authors, institutions carry no precomputed rank or repro rate:
    rank is the 1-based position in the (already sorted) rankings list, and
    repro rate is derived from reproducible badges over total artifacts
    (0 when the institution has no artifacts, avoiding division by zero).
    """
    entries: dict = {}
    for idx, r in enumerate(inst_rankings):
        name = r.get("affiliation", "")
        if not name:
            continue
        inst_rr = 0
        if r.get("artifacts", 0) > 0:
            inst_rr = round((r.get("badges_reproducible", 0) / r["artifacts"]) * 100, 1)
        entries[name] = {
            "rank": idx + 1,
            "score": r.get("combined_score", 0),
            "as": r.get("artifact_score", 0),
            "aes": r.get("ae_score", 0),
            "tp": r.get("total_papers", 0),
            "ta": r.get("artifacts", 0),
            "ar": r.get("artifact_rate", 0),
            "rr": inst_rr,
            "r": r.get("num_authors", 0),
        }
    return entries


def _write_history(path: str, history: list) -> None:
    """Write *history* as compact JSON.

    ``encoding="utf-8"`` is required: ``ensure_ascii=False`` emits raw
    non-ASCII names, which would fail (or be mangled) under a non-UTF-8
    locale default encoding.
    """
    with open(path, "w", encoding="utf-8") as f:
        json.dump(history, f, ensure_ascii=False, separators=(",", ":"))