generate_institution_rankings¶

`src.generators.rankings.generate_institution_rankings` ¶

Generate institution rankings by aggregating combined ranking data by affiliation. Creates JSON files for overall, systems, and security institution rankings.

`load_combined_ranking(path)` ¶

Load combined ranking JSON.

Source code in src/generators/rankings/generate_institution_rankings.py

def load_combined_ranking(path):
    """Load the combined ranking JSON found at ``path``.

    Thin wrapper that delegates to ``load_json`` and hands back whatever
    that helper returns.
    """
    combined = load_json(path)
    return combined

`aggregate_by_institution(combined_data)` ¶

Aggregate individual rankings by institution affiliation.

Source code in src/generators/rankings/generate_institution_rankings.py

def aggregate_by_institution(combined_data):
    """Aggregate individual rankings by institution affiliation.

    Every person record is folded into a per-institution bucket keyed by its
    normalized affiliation: numeric score/count fields are summed, conference
    names are unioned, and per-year counts are added together.

    Args:
        combined_data: Iterable of per-person dicts. Missing numeric fields
            count as 0. A missing, empty, ``None`` or underscore-prefixed
            affiliation is bucketed under "Unknown".

    Returns:
        A list of institution dicts sorted by ``combined_score`` descending.
        Only buckets with ``combined_score >= 3`` whose name is not one of
        the placeholder names ("Univ", "University", "Unknown", "_") are
        included. ``ae_ratio`` is ``None`` for artifact-only institutions.

    Raises:
        ValueError: If any aggregated bucket (including buckets that would be
            filtered out of the result) has more artifacts than papers, or
            more functional/reproducible badges than artifacts.
    """
    # Per-person numeric fields that are summed as-is into the institution bucket.
    summed_fields = (
        "combined_score",
        "artifact_score",
        "artifact_citations",
        "citation_score",
        "ae_score",
        "artifact_count",
        "badges_functional",
        "badges_reproducible",
        "ae_memberships",
        "chair_count",
        "total_papers",
    )
    # Incomplete/placeholder names that must never appear in the output.
    excluded_names = ("Univ", "University", "Unknown", "_")

    def _new_bucket():
        bucket = {field: 0 for field in summed_fields}
        bucket.update(
            affiliation="",
            author_count=0,
            conferences=set(),
            years=defaultdict(int),
        )
        return bucket

    inst_data = defaultdict(_new_bucket)

    for person in combined_data:
        # ``or ""`` also covers an explicit null affiliation: ``.get(key, "")``
        # only applies its default when the key is absent, so a stored None
        # would otherwise crash on ``.strip()``.
        raw_affiliation = person.get("affiliation") or ""
        affiliation = _normalize_affiliation(raw_affiliation.strip())

        # Entries without a usable affiliation are not dropped here; they are
        # pooled under "Unknown", which is filtered out when building the result.
        if not affiliation or affiliation.startswith("_"):
            affiliation = "Unknown"

        inst = inst_data[affiliation]
        inst["affiliation"] = affiliation
        for field in summed_fields:
            inst[field] += person.get(field, 0)
        inst["author_count"] += 1

        # Union of conferences seen across all authors of the institution.
        inst["conferences"].update(person.get("conferences") or ())

        # Per-year counts are added key by key.
        for year, count in (person.get("years") or {}).items():
            inst["years"][year] += count

    institutions = []
    for affiliation, data in inst_data.items():
        # Sanity checks on the aggregated totals; these run for every bucket,
        # even ones excluded from the output below.
        if data["artifact_count"] > data["total_papers"]:
            raise ValueError(
                f"Invariant violation for institution '{affiliation}': artifact_count ({data['artifact_count']}) > total_papers ({data['total_papers']})"
            )
        if data["badges_reproducible"] > data["artifact_count"]:
            raise ValueError(
                f"Invariant violation for institution '{affiliation}': reproduced_badges ({data['badges_reproducible']}) > artifact_count ({data['artifact_count']})"
            )
        if data["badges_functional"] > data["artifact_count"]:
            raise ValueError(
                f"Invariant violation for institution '{affiliation}': functional_badges ({data['badges_functional']}) > artifact_count ({data['artifact_count']})"
            )

        # Only include institutions with meaningful contributions, excluding
        # incomplete affiliations.
        if data["combined_score"] < 3 or affiliation.strip() in excluded_names:
            continue

        # Artifact rate: percentage of papers that have an artifact.
        artifact_pct = 0
        if data["total_papers"] > 0:
            artifact_pct = round((data["artifact_count"] / data["total_papers"]) * 100, 1)

        # A:E ratio: artifact score relative to AE (evaluation service) score.
        if data["ae_score"] > 0:
            ae_ratio = round(data["artifact_score"] / data["ae_score"], 2)
        elif data["artifact_score"] > 0:
            ae_ratio = None  # Artifact-only, will display as ∞
        else:
            ae_ratio = 0.0  # Neither artifacts nor AE service

        # Role from the A:E ratio: artifact-only or ratio > 2 is a producer,
        # ratio < 0.5 (which includes 0.0) is a consumer, otherwise balanced.
        if ae_ratio is None or ae_ratio > 2.0:
            role = "Producer"
        elif ae_ratio < 0.5:
            role = "Consumer"
        else:
            role = "Balanced"

        institutions.append(
            {
                "affiliation": data["affiliation"],
                "combined_score": data["combined_score"],
                "artifact_score": data["artifact_score"],
                "artifact_citations": data["artifact_citations"],
                "citation_score": data["citation_score"],
                "ae_score": data["ae_score"],
                "ae_ratio": ae_ratio,
                "role": role,
                "artifact_count": data["artifact_count"],
                "badges_functional": data["badges_functional"],
                "badges_reproducible": data["badges_reproducible"],
                "ae_memberships": data["ae_memberships"],
                "chair_count": data["chair_count"],
                "total_papers": data["total_papers"],
                "artifact_pct": artifact_pct,
                "author_count": data["author_count"],
                "conferences": sorted(data["conferences"]),
                "years": {str(k): v for k, v in data["years"].items()},
                "top_authors": [],
            }
        )

    # Highest combined score first; ties keep their first-seen order.
    institutions.sort(key=lambda x: x["combined_score"], reverse=True)

    return institutions

`main()` ¶

Generate institution ranking JSON files.

Source code in src/generators/rankings/generate_institution_rankings.py

def main():
    """Generate institution ranking JSON files.

    Resolves the website data directory (from ``--data_dir`` or a default
    path relative to this file), then writes an institution ranking file for
    the overall combined ranking and for each per-area combined ranking whose
    input file exists. Missing inputs are logged and skipped.
    """
    cli = argparse.ArgumentParser(description="Generate institution rankings")
    cli.add_argument("--data_dir", type=str, default=None, help="Path to website root (reprodb.github.io)")
    options = cli.parse_args()

    if not options.data_dir:
        # Default: sibling checkout of the website repository.
        here = Path(__file__).parent
        site_root = here.parent.parent.parent.parent / "reprodb.github.io" / "src"
    else:
        site_root = Path(options.data_dir)
    data_dir = site_root / "assets" / "data"

    # The classifier is built a single time and shared by every enrichment pass.
    logger.info("Building institution classifier...")
    prefix_tree, name_index = _build_classifier()

    def _enrich_with_country(institutions: list[dict]) -> None:
        """Attach ``country`` and ``country_code`` to every institution dict in place."""
        matched = 0
        for record in institutions:
            country, code = _classify_country(record["affiliation"], prefix_tree, name_index)
            record["country"] = country
            record["country_code"] = code
            matched += 1 if code else 0
        logger.info(f"    Country classification: {matched}/{len(institutions)} matched")

    # Overall combined ranking.
    logger.info("Processing overall combined ranking...")
    combined_path = data_dir / "combined_rankings.json"
    if not combined_path.exists():
        logger.info(f"  ✗ {combined_path} not found")
    else:
        institutions = aggregate_by_institution(load_combined_ranking(combined_path))
        _enrich_with_country(institutions)

        output_path = data_dir / "institution_rankings.json"
        save_validated_json(output_path, institutions, InstitutionRanking)
        logger.info(f"  ✓ Generated {output_path} ({len(institutions)} institutions)")

    # Per-area combined rankings, each written to its own file with the
    # country fields already attached.
    for area in ("systems", "security"):
        area_combined_path = data_dir / f"{area}_combined_rankings.json"
        if not area_combined_path.exists():
            logger.info(f"  ✗ {area_combined_path} not found")
            continue

        logger.info(f"Processing {area} institution rankings...")
        area_institutions = aggregate_by_institution(load_combined_ranking(area_combined_path))
        _enrich_with_country(area_institutions)

        area_output = data_dir / f"{area}_institution_rankings.json"
        save_validated_json(area_output, area_institutions, InstitutionRanking)
        logger.info(f"  ✓ Generated {area_output} ({len(area_institutions)} institutions)")

generate_institution_rankings¶

src.generators.rankings.generate_institution_rankings ¶

load_combined_ranking(path) ¶

aggregate_by_institution(combined_data) ¶

main() ¶

`src.generators.rankings.generate_institution_rankings` ¶

`load_combined_ranking(path)` ¶

`aggregate_by_institution(combined_data)` ¶

`main()` ¶