generate_combined_rankings¶

`src.generators.rankings.generate_combined_rankings` ¶

Generate combined rankings that merge artifact authorship with AE committee service. Reads the per-area author JSON and AE member JSON produced by earlier pipeline stages and writes combined JSON files for the Jekyll site.
Outputs
assets/data/combined_rankings.json, assets/data/systems_combined_rankings.json, assets/data/security_combined_rankings.json, assets/data/{conf}_combined_rankings.json (one per discovered {conf}_conf_authors.json file), _data/combined_summary.yml
Usage
python generate_combined_rankings.py --data_dir ../reprodb.github.io/src
`generate_combined_rankings(data_dir: str) -> None` ¶

Read author + AE data, write combined ranking files.
Source code in src/generators/rankings/generate_combined_rankings.py
def generate_combined_rankings(data_dir: str) -> None:
    """Read author + AE data, write combined ranking files.

    Steps, in order:

    1. Load the per-area author and AE-member JSON files from
       ``<data_dir>/assets/data`` (a missing file is logged and treated as
       an empty list).
    2. Annotate every author record with ``artifact_citations`` looked up
       by normalized name in ``cited_artifacts_by_author.json``.
    3. Build the systems and security rankings with ``_merge_rankings`` and
       derive the "all" ranking as their union, summing the metrics of
       anyone present in both.
    4. Drop entries whose ``combined_score`` is below 3, re-rank, optionally
       attach ``author_id`` values, and write the three area JSON files.
    5. Repeat the merge/filter/rank/write cycle for every
       ``*_conf_authors.json`` file found in ``assets/_build`` (falling back
       to ``assets/data``).
    6. Write ``_data/combined_summary.yml`` with headline counts.

    Args:
        data_dir: Root directory containing ``assets/data`` and ``_data``.

    Raises:
        ValueError: If, after merging a person's systems and security
            records, their reproducible or functional badge count exceeds
            their artifact count.
    """
    # Filter: only include people with combined_score >= 3
    # With additive scoring (each badge level=+1, max 3 per artifact,
    # AE membership=3, AE chair=+2), a score of 3 means at least one
    # reproducible artifact, or one AE membership, or meaningful contribution.
    min_combined_score = 3

    assets_data = Path(data_dir) / "assets" / "data"
    yaml_dir = Path(data_dir) / "_data"

    # ── Local helpers ─────────────────────────────────────────────────────
    def _load_json(name):
        """Load ``assets/data/<name>``; warn and return ``[]`` when absent."""
        path = assets_data / name
        if not path.exists():
            logger.warning(f"  Warning: {name} not found, skipping")
            return []
        return load_json(path)

    def _add_citations_to_authors(authors_list):
        """Add artifact_citations field to each author from citation data."""
        for author in authors_list:
            # Same normalization as used when building citation_by_norm.
            norm = _normalize_name(author.get("name", ""))
            author["artifact_citations"] = citation_by_norm.get(norm, 0)

    def _above_threshold(entries):
        """Return the entries whose combined_score meets the cut-off."""
        return [c for c in entries if c["combined_score"] >= min_combined_score]

    def _assign_ranks(entries):
        """Set ``rank`` in place; equal scores share a rank (1, 1, 3, ...).

        Expects ``entries`` to already be ordered by descending
        ``combined_score``; only adjacent entries are compared.
        """
        rank = 1
        for i, c in enumerate(entries):
            if i > 0 and c["combined_score"] < entries[i - 1]["combined_score"]:
                rank = i + 1
            c["rank"] = rank

    def _inject_author_ids(entries, index):
        """Set ``author_id`` on each entry whose raw name is in ``index``."""
        for entry in entries:
            aid = index.get(entry["name"])
            if aid is not None:
                entry["author_id"] = aid

    def _merge_person(existing, person):
        """Fold a security record into the same person's systems record."""
        # Sum all contribution metrics. Systems and security track different
        # conferences, so artifacts, papers, and AE memberships are additive.
        existing["artifact_count"] += person["artifact_count"]
        existing["artifact_score"] += person["artifact_score"]
        # The fields below are read with .get() on BOTH sides so that a
        # record lacking one of them does not abort the whole run.
        # NOTE(review): artifact_citations is looked up per author name from
        # one citation file, so both area records of a person may carry the
        # same total and this sum could double-count -- confirm against
        # _merge_rankings before changing.
        for optional_key in (
            "artifact_citations",
            "citation_score",
            "badges_available",
            "badges_functional",
            "badges_reproducible",
        ):
            existing[optional_key] = existing.get(optional_key, 0) + person.get(optional_key, 0)
        existing["ae_memberships"] += person["ae_memberships"]
        existing["chair_count"] += person["chair_count"]
        existing["ae_score"] += person["ae_score"]
        existing["combined_score"] += person["combined_score"]
        existing["total_papers"] += person["total_papers"]

        # Merge conferences and years (union)
        merged_confs = set(existing.get("conferences", [])) | set(person.get("conferences", []))
        existing["conferences"] = sorted(merged_confs)

        # Copy before updating: `existing` is only a shallow copy of the
        # systems entry, so its "years" dict is still shared with it.
        merged_years = existing.get("years", {}).copy()
        for yr, cnt in person.get("years", {}).items():
            # For years, sum the activity counts
            merged_years[yr] = merged_years.get(yr, 0) + cnt
        existing["years"] = merged_years

        # Update year range (keys are compared as stored in the data).
        all_years = list(merged_years.keys())
        if all_years:
            existing["first_year"] = min(all_years)
            existing["last_year"] = max(all_years)

        if existing["artifact_count"] > existing["total_papers"]:
            logger.info(
                f"  ⚠ DBLP undercount after merge for '{existing['name']}': "
                f"artifact_count ({existing['artifact_count']}) > total_papers ({existing['total_papers']}), clamping"
            )
            existing["total_papers"] = existing["artifact_count"]
        if existing["badges_reproducible"] > existing["artifact_count"]:
            raise ValueError(
                f"Invariant violation after systems+security merge for '{existing['name']}': reproduced_badges ({existing['badges_reproducible']}) > artifact_count ({existing['artifact_count']})"
            )
        if existing["badges_functional"] > existing["artifact_count"]:
            raise ValueError(
                f"Invariant violation after systems+security merge for '{existing['name']}': functional_badges ({existing['badges_functional']}) > artifact_count ({existing['artifact_count']})"
            )

        # Recalculate rates based on summed totals
        if existing["total_papers"] > 0:
            existing["artifact_pct"] = int(round((existing["artifact_count"] / existing["total_papers"]) * 100))
        if existing["artifact_count"] > 0:
            existing["repro_pct"] = int(round((existing["badges_reproducible"] / existing["artifact_count"]) * 100))
        # Recalculate ae_ratio based on merged scores
        if existing["ae_score"] > 0:
            existing["ae_ratio"] = round(existing["artifact_score"] / existing["ae_score"], 2)
        else:
            existing["ae_ratio"] = None

    def _conf_role(e):
        """Role of one AE conference entry (list or dict form)."""
        return e[2] if isinstance(e, list) else e.get("role", "member")

    def _conf_year(e):
        """Year of one AE conference entry (list or dict form)."""
        return e[1] if isinstance(e, list) else e.get("year")

    # ── Load inputs ───────────────────────────────────────────────────────
    # NOTE(review): all_authors is annotated with citations below but is not
    # consumed afterwards in this function; it is still loaded so a missing
    # authors.json keeps producing its warning.
    all_authors = _load_json("authors.json")
    sys_authors = _load_json("systems_authors.json")
    sec_authors = _load_json("security_authors.json")
    all_ae_members = _load_json("ae_members.json")
    sys_members = _load_json("systems_ae_members.json")
    sec_members = _load_json("security_ae_members.json")

    # Load citation data and merge into authors
    cited_by_author = _load_json("cited_artifacts_by_author.json")

    # Map normalized author names to total citation counts.
    citation_by_norm = {}
    if cited_by_author:
        for author_name, author_data in cited_by_author.items():
            # author_data is either the old format (list) or new format (dict with cited_artifacts)
            if isinstance(author_data, dict):
                total_citations = author_data.get("total_citations", 0)
            elif isinstance(author_data, list):
                # Old format: just a list of artifacts
                total_citations = sum(int(a.get("citations", 0)) for a in author_data)
            else:
                total_citations = 0
            citation_by_norm[_normalize_name(author_name)] = total_citations

    for authors_list in (all_authors, sys_authors, sec_authors):
        _add_citations_to_authors(authors_list)

    # ── Area-level rankings ───────────────────────────────────────────────
    combined_sys = _merge_rankings(sys_authors, sys_members)
    combined_sec = _merge_rankings(sec_authors, sec_members)

    # Create combined_all as the union of systems and security to enforce
    # monotonic totals (all >= systems, all >= security)
    logger.info("Merging systems and security rankings into combined all...")
    # Keyed by raw name (preserves DBLP suffix like '0017') so that
    # distinct people who share the same base name are not merged.
    combined_all_dict = {person["name"]: person.copy() for person in combined_sys}

    for person in combined_sec:
        key = person["name"]
        if key in combined_all_dict:
            # Person is in both - merge their data by SUMMING contributions
            _merge_person(combined_all_dict[key], person)
        else:
            # Person only in security - add them
            combined_all_dict[key] = person.copy()

    # Convert back to list and sort by combined_score descending
    combined_all = sorted(combined_all_dict.values(), key=lambda x: x["combined_score"], reverse=True)

    combined_all = _above_threshold(combined_all)
    combined_sys = _above_threshold(combined_sys)
    combined_sec = _above_threshold(combined_sec)

    # Re-rank after filtering
    for lst in (combined_all, combined_sys, combined_sec):
        _assign_ranks(lst)

    # Inject author_id from the canonical index. name_to_id is bound up
    # front so the per-conference loop below can test it directly even when
    # the optional module is unavailable.
    name_to_id = {}
    try:
        from src.utils.normalization.author_index import build_name_to_id

        name_to_id = build_name_to_id(data_dir)
        if name_to_id:
            for lst in (combined_all, combined_sys, combined_sec):
                _inject_author_ids(lst, name_to_id)
            logger.info("  Author IDs injected from index")
    except ImportError:
        logger.debug("Optional module not available, skipping enrichment")

    # Write JSON
    assets_data.mkdir(parents=True, exist_ok=True)
    for fname, data in [
        ("combined_rankings.json", combined_all),
        ("systems_combined_rankings.json", combined_sys),
        ("security_combined_rankings.json", combined_sec),
    ]:
        path = assets_data / fname
        save_validated_json(path, data, AuthorRanking, indent=None)
        logger.info(f"  Wrote {path} ({len(data)} entries)")

    # ── Per-conference combined rankings ──────────────────────────────────
    # Discover conferences from {conf}_conf_authors.json files in _build/
    import glob

    build_dir = assets_data.parent / "_build"
    conf_author_files = glob.glob(str(build_dir / "*_conf_authors.json"))
    # Fall back to legacy location (assets/data/) for backward compatibility
    if not conf_author_files:
        conf_author_files = glob.glob(str(assets_data / "*_conf_authors.json"))
    for conf_author_path in sorted(conf_author_files):
        conf_lower = Path(conf_author_path).name.replace("_conf_authors.json", "")
        conf_upper = conf_lower.upper()

        # Load per-conference authors (already filtered & scored for this conf)
        conf_authors_data = load_json(conf_author_path)

        # Add citation data
        _add_citations_to_authors(conf_authors_data)

        # Filter AE members to this conference, recompute per-conf AE stats
        conf_ae_members = []
        for m in all_ae_members:
            entries = [
                c
                for c in (m.get("conferences") or [])
                if (isinstance(c, list) and c[0] == conf_upper)
                or (isinstance(c, dict) and c.get("conference") == conf_upper)
            ]
            if not entries:
                continue

            conf_m = {
                "name": m["name"],
                "display_name": m.get("display_name", m["name"]),
                "affiliation": m.get("affiliation", ""),
                "total_memberships": len(entries),
                "chair_count": sum(1 for e in entries if _conf_role(e) == "chair"),
                "conferences": entries,
                "years": {},
            }
            for e in entries:
                yr = str(_conf_year(e))
                conf_m["years"][yr] = conf_m["years"].get(yr, 0) + 1
            conf_ae_members.append(conf_m)

        # Merge using the same logic as area-level rankings
        conf_combined = _above_threshold(_merge_rankings(conf_authors_data, conf_ae_members))

        # Re-rank
        _assign_ranks(conf_combined)

        # Inject author_id (skipped when the index is empty or unavailable)
        if name_to_id:
            _inject_author_ids(conf_combined, name_to_id)

        fname = f"{conf_lower}_combined_rankings.json"
        path = assets_data / fname
        save_validated_json(path, conf_combined, AuthorRanking, indent=None)
        logger.info(f"  Wrote {path} ({len(conf_combined)} entries)")

    # ── Summary YAML ──────────────────────────────────────────────────────
    def _count_both(entries):
        """Count people who have both artifacts AND AE service."""
        return sum(1 for c in entries if c["artifact_count"] > 0 and c["ae_memberships"] > 0)

    both_all = _count_both(combined_all)
    both_sys = _count_both(combined_sys)
    both_sec = _count_both(combined_sec)

    summary = {
        "combined_total": len(combined_all),
        "combined_systems": len(combined_sys),
        "combined_security": len(combined_sec),
        "both_artifacts_and_ae": both_all,
        "both_artifacts_and_ae_systems": both_sys,
        "both_artifacts_and_ae_security": both_sec,
        # combined_all is sorted descending, so the first entry is the top.
        "top_combined_score": combined_all[0]["combined_score"] if combined_all else 0,
    }
    yml_path = yaml_dir / "combined_summary.yml"
    save_yaml(yml_path, summary)
    logger.info(f"  Wrote {yml_path}")

    logger.info(
        f"  Combined rankings: {len(combined_all)} total, {len(combined_sys)} systems, {len(combined_sec)} security"
    )
    logger.info(f"  People with both artifacts and AE service: {both_all}")
generate_combined_rankings¶

src.generators.rankings.generate_combined_rankings ¶

generate_combined_rankings(data_dir: str) -> None ¶

`src.generators.rankings.generate_combined_rankings` ¶

`generate_combined_rankings(data_dir: str) -> None` ¶