Skip to content

generate_paper_index

src.generators.generate_paper_index

Generate a canonical paper index (papers.json) from authors.yml.

Each unique paper gets a stable integer ID. Co-authored papers are stored once in the index and referenced by ID from each author entry.

Usage

python -m src.generators.generate_paper_index --data_dir ../reprodb.github.io

load_existing_index(path)

Load existing paper index to preserve IDs across runs.

Source code in src/generators/generate_paper_index.py
22
23
24
25
26
27
28
29
30
31
32
33
def load_existing_index(path):
    """Load existing paper index to preserve IDs across runs.

    Args:
        path: Path to the JSON paper index file.

    Returns:
        A ``(entries, by_norm_title)`` tuple. ``entries`` is the decoded JSON
        content of the file, and ``by_norm_title`` maps every non-empty
        ``normalized_title`` value to its entry (when two entries share a
        normalized title, the later one wins). If ``path`` does not exist,
        ``([], {})`` is returned.
    """
    if not os.path.exists(path):
        return [], {}
    # Decode explicitly as UTF-8: without ``encoding`` Python falls back to
    # the locale's preferred encoding, which can fail on (or mis-decode)
    # non-ASCII titles on platforms whose default is not UTF-8.
    with open(path, encoding="utf-8") as f:
        entries = json.load(f)
    by_norm_title = {
        entry["normalized_title"]: entry
        for entry in entries
        if entry.get("normalized_title", "")
    }
    return entries, by_norm_title

build_paper_index(authors_data, existing_by_title, max_id)

Build paper index from authors.yml data.

Returns (papers_list, norm_title_to_id dict).

Source code in src/generators/generate_paper_index.py
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
def build_paper_index(authors_data, existing_by_title, max_id):
    """Build paper index from authors.yml data.

    Papers are de-duplicated by the value ``normalize_title`` returns for
    their title; entries whose normalized title is empty are skipped.

    Args:
        authors_data: Iterable of author mappings. Each may carry a
            ``papers`` list (papers with artifacts) and a
            ``papers_without_artifacts`` list. Missing or empty (``None``)
            lists are treated as having no papers.
        existing_by_title: Mapping of normalized title to a previously
            indexed entry; the entry's ``"id"`` is reused so IDs stay stable.
        max_id: Highest ID already in use. Papers not found in
            ``existing_by_title`` are numbered from ``max_id + 1`` in
            first-seen order.

    Returns:
        ``(papers_list, norm_title_to_id)``: the paper dicts sorted by
        ``"id"``, and a dict mapping each normalized title to its ID.
    """
    # Collect unique papers from all authors.
    seen = {}  # normalized_title -> paper dict
    for author in authors_data or []:
        # ``or []``: a key that is present but empty in YAML loads as None,
        # which ``.get(key, [])`` would hand back unchanged.
        for paper in author.get("papers") or []:
            title = paper.get("title") or ""
            norm = normalize_title(title)
            if not norm:
                continue
            entry = seen.get(norm)
            if entry is None:
                seen[norm] = {
                    "title": title,
                    "normalized_title": norm,
                    "conference": paper.get("conference", ""),
                    "year": paper.get("year"),
                    "category": paper.get("category", ""),
                    "badges": paper.get("badges", []),
                    "artifact_citations": paper.get("artifact_citations", 0),
                }
                continue
            if entry.get("has_artifact") is False:
                # An earlier author listed this paper as artifact-less, but it
                # appears here among artifact papers. Promote the entry so the
                # result does not depend on the order authors are processed.
                del entry["has_artifact"]
                entry["badges"] = paper.get("badges", [])
            # Keep the highest citation count reported by any co-author.
            existing_cit = entry.get("artifact_citations", 0) or 0
            new_cit = paper.get("artifact_citations", 0) or 0
            if new_cit > existing_cit:
                entry["artifact_citations"] = new_cit

        # Also collect papers_without_artifacts; these never override an
        # entry that is already known.
        for paper in author.get("papers_without_artifacts") or []:
            title = paper.get("title") or ""
            norm = normalize_title(title)
            if not norm or norm in seen:
                continue
            seen[norm] = {
                "title": title,
                "normalized_title": norm,
                "conference": paper.get("conference", ""),
                "year": paper.get("year"),
                "category": paper.get("category", ""),
                "badges": [],
                "artifact_citations": 0,
                "has_artifact": False,
            }

    # Assign IDs: preserve existing, assign new for unseen.
    papers = []
    next_id = max_id + 1
    for norm_title, paper in seen.items():
        if norm_title in existing_by_title:
            paper["id"] = existing_by_title[norm_title]["id"]
        else:
            paper["id"] = next_id
            next_id += 1
        # Only artifact-less entries carry an explicit flag at this point;
        # everything else came from a ``papers`` list.
        if "has_artifact" not in paper:
            paper["has_artifact"] = True
        papers.append(paper)

    papers.sort(key=lambda x: x["id"])
    norm_to_id = {p["normalized_title"]: p["id"] for p in papers}
    return papers, norm_to_id