generate_combined_rankings¶

`src.generators.generate_combined_rankings` ¶

Generate combined rankings that merge artifact authorship with AE committee service. Reads the per-area author JSON and AE member JSON produced by earlier pipeline stages and writes combined JSON files for the Jekyll site.
Outputs
assets/data/combined_rankings.json, assets/data/systems_combined_rankings.json, assets/data/security_combined_rankings.json, assets/data/{conf}_combined_rankings.json (one per `{conf}_conf_authors.json` input found), and _data/combined_summary.yml
Usage
python generate_combined_rankings.py --data_dir ../reprodb.github.io
`generate_combined_rankings(data_dir: str) -> None` ¶

Read author + AE data, write combined ranking files.
Source code in src/generators/generate_combined_rankings.py
def generate_combined_rankings(data_dir: str) -> None:
    """Read author + AE data, write combined ranking files.

    Inputs are read from ``<data_dir>/assets/data``: the overall, systems and
    security author lists, the matching AE-member lists, the per-author
    citation file, and every ``*_conf_authors.json`` file found there.

    Outputs:
        * ``assets/data/combined_rankings.json`` (union of systems + security)
        * ``assets/data/systems_combined_rankings.json``
        * ``assets/data/security_combined_rankings.json``
        * ``assets/data/<conf>_combined_rankings.json`` for each conference
        * ``_data/combined_summary.yml``

    Args:
        data_dir: Root directory of the site checkout that holds the
            ``assets/data`` and ``_data`` folders.

    Raises:
        ValueError: If, after merging a person's systems and security
            entries, a badge count exceeds the merged artifact count.
    """
    import glob

    assets_data = os.path.join(data_dir, "assets", "data")
    yaml_dir = os.path.join(data_dir, "_data")

    # Entries scoring below this are dropped from every output list.
    # With additive scoring (each badge level=+1, max 3 per artifact,
    # AE membership=3, AE chair=+2), a score of 3 means at least one
    # reproducible artifact, or one AE membership, or meaningful contribution.
    min_combined_score = 3

    def _load_json(name):
        """Load ``name`` from assets/data; return ``[]`` (with a warning) if absent."""
        path = os.path.join(assets_data, name)
        if not os.path.exists(path):
            logger.warning(f"  Warning: {name} not found, skipping")
            return []
        # Explicit UTF-8: the files are written with ensure_ascii=False, so the
        # locale default encoding must not be relied on.
        with open(path, encoding="utf-8") as f:
            return json.load(f)

    def _write_json(path, data):
        """Dump ``data`` to ``path`` as UTF-8 JSON and log the entry count."""
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False)
        logger.info(f"  Wrote {path} ({len(data)} entries)")

    def _normalize_for_citation(name):
        """Match names to citation data (uses same normalization as our name matching)."""
        return _normalize_name(name)

    def _assign_ranks(entries):
        """Set ``rank`` on each entry of a score-descending list; ties share a rank."""
        rank = 1
        for i, entry in enumerate(entries):
            if i > 0 and entry["combined_score"] < entries[i - 1]["combined_score"]:
                rank = i + 1
            entry["rank"] = rank

    def _inject_author_ids(entries, id_by_name):
        """Copy the id from ``id_by_name`` onto every entry whose name is indexed."""
        if not id_by_name:
            return
        for entry in entries:
            aid = id_by_name.get(entry["name"])
            if aid is not None:
                entry["author_id"] = aid

    # Load author and AE data
    all_authors = _load_json("authors.json")
    sys_authors = _load_json("systems_authors.json")
    sec_authors = _load_json("security_authors.json")
    all_ae_members = _load_json("ae_members.json")
    sys_members = _load_json("systems_ae_members.json")
    sec_members = _load_json("security_ae_members.json")

    # Load citation data and build a normalized-name -> total-citations map
    cited_by_author = _load_json("cited_artifacts_by_author.json")

    citation_by_norm = {}
    if isinstance(cited_by_author, dict):
        for author_name, author_data in cited_by_author.items():
            # author_data is either the old format (list) or new format (dict with cited_artifacts)
            if isinstance(author_data, dict):
                total_citations = author_data.get("total_citations", 0)
            elif isinstance(author_data, list):
                # Old format: just a list of artifacts
                total_citations = sum(int(a.get("citations", 0)) for a in author_data)
            else:
                total_citations = 0
            citation_by_norm[_normalize_for_citation(author_name)] = total_citations
    elif cited_by_author:
        # A non-empty non-mapping payload cannot be keyed by author name.
        logger.warning("  Warning: cited_artifacts_by_author.json is not a mapping, ignoring citations")

    def _add_citations_to_authors(authors_list):
        """Add artifact_citations field to each author from citation data."""
        for author in authors_list:
            norm = _normalize_for_citation(author.get("name", ""))
            author["artifact_citations"] = citation_by_norm.get(norm, 0)

    _add_citations_to_authors(all_authors)
    _add_citations_to_authors(sys_authors)
    _add_citations_to_authors(sec_authors)

    # Generate combined rankings for systems and security
    combined_sys = _merge_rankings(sys_authors, sys_members)
    combined_sec = _merge_rankings(sec_authors, sec_members)

    # Create combined_all as the union of systems and security to enforce
    # monotonic totals (all >= systems, all >= security)
    logger.info("Merging systems and security rankings into combined all...")

    # Keyed by raw name (preserves DBLP suffix like '0017') so that
    # distinct people who share the same base name are not merged.
    combined_all_dict = {person["name"]: person.copy() for person in combined_sys}

    # Metrics that must be present on both sides vs. ones that may be missing
    # on the incoming (security) entry.
    required_metrics = (
        "artifacts",
        "artifact_score",
        "ae_memberships",
        "chair_count",
        "ae_score",
        "combined_score",
        "total_papers",
    )
    optional_metrics = (
        "artifact_citations",
        "citation_score",
        "badges_available",
        "badges_functional",
        "badges_reproducible",
    )

    for person in combined_sec:
        key = person["name"]
        existing = combined_all_dict.get(key)
        if existing is None:
            # Person only in security - add them
            combined_all_dict[key] = person.copy()
            continue

        # Person is in both - merge their data by SUMMING contributions.
        # Systems and security track different conferences, so artifacts,
        # papers, and AE memberships should be additive.
        # NOTE(review): artifact_citations is looked up by author name from a
        # single citation file, so the systems and security entries for one
        # person presumably carry the same total and summing may double-count
        # it — confirm against _merge_rankings before changing.
        for metric in required_metrics:
            existing[metric] += person[metric]
        for metric in optional_metrics:
            existing[metric] += person.get(metric, 0)

        # Merge conferences (union)
        existing["conferences"] = sorted(set(existing.get("conferences", [])) | set(person.get("conferences", [])))

        # Merge years by summing the per-year activity counts
        merged_years = existing.get("years", {}).copy()
        for yr, cnt in person.get("years", {}).items():
            merged_years[yr] = merged_years.get(yr, 0) + cnt
        existing["years"] = merged_years

        # Update year range
        if merged_years:
            existing["first_year"] = min(merged_years)
            existing["last_year"] = max(merged_years)

        if existing["artifacts"] > existing["total_papers"]:
            logger.info(
                f"  ⚠ DBLP undercount after merge for '{existing['name']}': "
                f"artifacts ({existing['artifacts']}) > total_papers ({existing['total_papers']}), clamping"
            )
            existing["total_papers"] = existing["artifacts"]
        if existing["badges_reproducible"] > existing["artifacts"]:
            raise ValueError(
                f"Invariant violation after systems+security merge for '{existing['name']}': reproduced_badges ({existing['badges_reproducible']}) > artifacts ({existing['artifacts']})"
            )
        if existing["badges_functional"] > existing["artifacts"]:
            raise ValueError(
                f"Invariant violation after systems+security merge for '{existing['name']}': functional_badges ({existing['badges_functional']}) > artifacts ({existing['artifacts']})"
            )

        # Recalculate rates based on summed totals
        if existing["total_papers"] > 0:
            existing["artifact_rate"] = int(round((existing["artifacts"] / existing["total_papers"]) * 100))
        if existing["artifacts"] > 0:
            existing["repro_rate"] = int(round((existing["badges_reproducible"] / existing["artifacts"]) * 100))
        # Recalculate ae_ratio based on merged scores
        if existing["ae_score"] > 0:
            existing["ae_ratio"] = round(existing["artifact_score"] / existing["ae_score"], 2)
        else:
            existing["ae_ratio"] = None

    # Convert back to list and sort by combined_score descending
    combined_all = sorted(combined_all_dict.values(), key=lambda x: x["combined_score"], reverse=True)

    # Filter out low scores, then re-rank what is left
    combined_all = [c for c in combined_all if c["combined_score"] >= min_combined_score]
    combined_sys = [c for c in combined_sys if c["combined_score"] >= min_combined_score]
    combined_sec = [c for c in combined_sec if c["combined_score"] >= min_combined_score]
    area_lists = (combined_all, combined_sys, combined_sec)

    for lst in area_lists:
        _assign_ranks(lst)

    # Inject author_id from the canonical index. name_to_id stays an empty
    # mapping when the optional module is unavailable, so the per-conference
    # section below can use it unconditionally.
    name_to_id = {}
    try:
        from src.utils.author_index import build_name_to_id

        name_to_id = build_name_to_id(data_dir)
        if name_to_id:
            for lst in area_lists:
                _inject_author_ids(lst, name_to_id)
            logger.info("  Author IDs injected from index")
    except ImportError:
        logger.debug("Optional module not available, skipping enrichment")

    # Write JSON
    os.makedirs(assets_data, exist_ok=True)
    for fname, data in [
        ("combined_rankings.json", combined_all),
        ("systems_combined_rankings.json", combined_sys),
        ("security_combined_rankings.json", combined_sec),
    ]:
        _write_json(os.path.join(assets_data, fname), data)

    # ── Per-conference combined rankings ──────────────────────────────────
    # Discover conferences from existing {conf}_conf_authors.json files
    conf_author_files = glob.glob(os.path.join(assets_data, "*_conf_authors.json"))
    for conf_author_path in sorted(conf_author_files):
        conf_lower = os.path.basename(conf_author_path).replace("_conf_authors.json", "")
        conf_upper = conf_lower.upper()

        # Load per-conference authors (already filtered & scored for this conf)
        with open(conf_author_path, encoding="utf-8") as f:
            conf_authors_data = json.load(f)

        # Add citation data
        _add_citations_to_authors(conf_authors_data)

        # Filter AE members to this conference, recompute per-conf AE stats
        conf_ae_members = []
        for m in all_ae_members:
            # Each entry is indexed as [conference, year, role]; entries that
            # are not lists or are too short to index are skipped.
            entries = [
                c
                for c in (m.get("conferences") or [])
                if isinstance(c, list) and len(c) >= 3 and c[0] == conf_upper
            ]
            if not entries:
                continue
            years = {}
            for e in entries:
                yr = str(e[1])
                years[yr] = years.get(yr, 0) + 1
            conf_ae_members.append(
                {
                    "name": m["name"],
                    "display_name": m.get("display_name", m["name"]),
                    "affiliation": m.get("affiliation", ""),
                    "total_memberships": len(entries),
                    "chair_count": sum(1 for e in entries if e[2] == "chair"),
                    "conferences": entries,
                    "years": years,
                }
            )

        # Merge using the same logic as area-level rankings
        conf_combined = _merge_rankings(conf_authors_data, conf_ae_members)
        conf_combined = [c for c in conf_combined if c["combined_score"] >= min_combined_score]

        _assign_ranks(conf_combined)
        _inject_author_ids(conf_combined, name_to_id)

        _write_json(os.path.join(assets_data, f"{conf_lower}_combined_rankings.json"), conf_combined)

    # Summary YAML
    # Count people who have both artifacts AND AE service
    both_all = sum(1 for c in combined_all if c["artifacts"] > 0 and c["ae_memberships"] > 0)
    both_sys = sum(1 for c in combined_sys if c["artifacts"] > 0 and c["ae_memberships"] > 0)
    both_sec = sum(1 for c in combined_sec if c["artifacts"] > 0 and c["ae_memberships"] > 0)

    summary = {
        "combined_total": len(combined_all),
        "combined_systems": len(combined_sys),
        "combined_security": len(combined_sec),
        "both_artifacts_and_ae": both_all,
        "both_artifacts_and_ae_systems": both_sys,
        "both_artifacts_and_ae_security": both_sec,
        "top_combined_score": combined_all[0]["combined_score"] if combined_all else 0,
    }
    # The _data directory may not exist yet on a fresh checkout.
    os.makedirs(yaml_dir, exist_ok=True)
    yml_path = os.path.join(yaml_dir, "combined_summary.yml")
    with open(yml_path, "w", encoding="utf-8") as f:
        yaml.dump(summary, f, default_flow_style=False, sort_keys=False)
    logger.info(f"  Wrote {yml_path}")

    logger.info(
        f"  Combined rankings: {len(combined_all)} total, {len(combined_sys)} systems, {len(combined_sec)} security"
    )
    logger.info(f"  People with both artifacts and AE service: {both_all}")
generate_combined_rankings¶

src.generators.generate_combined_rankings ¶

generate_combined_rankings(data_dir: str) -> None ¶

`src.generators.generate_combined_rankings` ¶

`generate_combined_rankings(data_dir: str) -> None` ¶