generate_paper_index¶

`src.generators.generate_paper_index` ¶

Generate a canonical paper index (papers.json) from authors.yml.

Each unique paper gets a stable integer ID. Co-authored papers are stored once in the index and referenced by ID from each author entry.

Usage

python -m src.generators.generate_paper_index --data_dir ../reprodb.github.io

`load_existing_index(path)` ¶

Load existing paper index to preserve IDs across runs.

Source code in src/generators/generate_paper_index.py

def load_existing_index(path):
    """Load existing paper index to preserve IDs across runs.

    Args:
        path: Filesystem path of a previously generated JSON index.

    Returns:
        A tuple ``(entries, by_norm_title)``. ``entries`` is the decoded
        JSON content of the file; ``by_norm_title`` maps every non-empty
        ``normalized_title`` to its entry (if a title occurs more than
        once, the last entry wins). When ``path`` does not exist the
        result is ``([], {})``.
    """
    if not os.path.exists(path):
        return [], {}
    # JSON text is UTF-8 by specification; do not rely on the locale's
    # default encoding, which differs between platforms.
    with open(path, encoding="utf-8") as f:
        entries = json.load(f)
    by_norm_title = {
        entry["normalized_title"]: entry
        for entry in entries
        if entry.get("normalized_title", "")
    }
    return entries, by_norm_title

`build_paper_index(authors_data, existing_by_title, max_id)` ¶

Build paper index from authors.yml data.

Returns a tuple (papers_list, norm_title_to_id): the list of paper dicts sorted by ID, and a dict mapping each normalized title to its paper ID.

Source code in src/generators/generate_paper_index.py

def build_paper_index(authors_data, existing_by_title, max_id):
    """Build paper index from authors.yml data.

    Papers are de-duplicated by ``normalize_title(title)``; entries whose
    normalized title is empty are skipped. For a paper listed by several
    authors, the first occurrence supplies the metadata and the highest
    ``artifact_citations`` value seen is kept.

    Args:
        authors_data: Iterable of author dicts. The ``papers`` and
            ``papers_without_artifacts`` lists of each author are read;
            a missing or empty (``None``) list is treated as no papers.
        existing_by_title: Mapping of normalized title to a previously
            indexed entry; its ``"id"`` is reused so IDs stay stable.
        max_id: Highest ID already in use. Papers not found in
            ``existing_by_title`` are numbered from ``max_id + 1``.

    Returns (papers_list, norm_title_to_id dict), with ``papers_list``
    sorted by ID.
    """
    # Collect unique papers from all authors
    seen = {}  # normalized_title -> paper dict
    for author in authors_data:
        # ``or []`` guards against a YAML key that is present but empty.
        for paper in author.get("papers") or []:
            title = paper.get("title", "")
            norm = normalize_title(title)
            if not norm:
                continue
            entry = seen.get(norm)
            if entry is None:
                seen[norm] = {
                    "title": title,
                    "normalized_title": norm,
                    "conference": paper.get("conference", ""),
                    "year": paper.get("year"),
                    "category": paper.get("category", ""),
                    "badges": paper.get("badges", []),
                    "artifact_citations": paper.get("artifact_citations", 0),
                }
                continue
            if not entry.get("has_artifact", True):
                # The paper was first met in an earlier author's
                # papers_without_artifacts list. Upgrade it so the result
                # does not depend on the order authors are processed in.
                entry["has_artifact"] = True
                entry["badges"] = paper.get("badges", [])
            # Update citation count if higher
            existing_cit = entry.get("artifact_citations", 0) or 0
            new_cit = paper.get("artifact_citations", 0) or 0
            if new_cit > existing_cit:
                entry["artifact_citations"] = new_cit

        # Also collect papers_without_artifacts
        for paper in author.get("papers_without_artifacts") or []:
            title = paper.get("title", "")
            norm = normalize_title(title)
            if not norm:
                continue
            if norm not in seen:
                seen[norm] = {
                    "title": title,
                    "normalized_title": norm,
                    "conference": paper.get("conference", ""),
                    "year": paper.get("year"),
                    "category": paper.get("category", ""),
                    "badges": [],
                    "artifact_citations": 0,
                    "has_artifact": False,
                }

    # Assign IDs: preserve existing, assign new for unseen
    papers = []
    next_id = max_id + 1

    for norm_title, paper in seen.items():
        if norm_title in existing_by_title:
            paper["id"] = existing_by_title[norm_title]["id"]
        else:
            paper["id"] = next_id
            next_id += 1
        # Entries collected from "papers" carry no flag yet; they have artifacts.
        paper.setdefault("has_artifact", True)
        papers.append(paper)

    papers.sort(key=lambda x: x["id"])
    norm_to_id = {p["normalized_title"]: p["id"] for p in papers}
    return papers, norm_to_id

generate_paper_index¶

src.generators.generate_paper_index ¶

load_existing_index(path) ¶

build_paper_index(authors_data, existing_by_title, max_id) ¶

`src.generators.generate_paper_index` ¶

`load_existing_index(path)` ¶

`build_paper_index(authors_data, existing_by_title, max_id)` ¶