Skip to content

generate_paper_index

src.generators.output.generate_paper_index

Generate a canonical paper index (papers.json) from authors.yml.

Each unique paper gets a stable integer ID. Co-authored papers are stored once in the index and referenced by ID from each author entry.

Usage

python -m src.generators.output.generate_paper_index --data_dir ../reprodb.github.io/src

load_existing_index(path)

Load existing paper index to preserve IDs across runs.

Source code in src/generators/output/generate_paper_index.py
23
24
25
26
27
28
29
30
31
32
33
def load_existing_index(path):
    """Read a previously written paper index so IDs can be reused.

    Returns a pair ``(entries, by_norm_title)``: the entries exactly as
    ``load_json`` returned them, and a dict keyed by each entry's
    normalized title. Entries whose title normalizes to a falsy value are
    omitted from the dict; when several entries share a normalized title,
    the last one wins. If ``path`` does not exist, returns ``([], {})``.
    """
    index_file = Path(path)
    if not index_file.exists():
        return [], {}

    records = load_json(path)
    # Pair every record with its normalized title, then keep the usable ones.
    keyed = (
        (normalize_title(record.get("title", "")), record) for record in records
    )
    lookup = {key: record for key, record in keyed if key}
    return records, lookup

build_paper_index(authors_data, existing_by_title, max_id)

Build paper index from authors.yml data.

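Returns a tuple (papers_list, norm_title_to_id): the list of paper entries sorted by ID, and a dict mapping each normalized title to its paper ID.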

Source code in src/generators/output/generate_paper_index.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def build_paper_index(authors_data, existing_by_title, max_id):
    """Build paper index from authors.yml data.

    Papers are deduplicated across all authors by normalized title. The
    first occurrence of a paper supplies its title, conference, year and
    category. A paper counts as having an artifact if *any* author lists it
    under ``papers``, regardless of the order in which authors appear; it
    is marked ``has_artifact: False`` only when it is seen exclusively
    under ``papers_without_artifacts``.

    Args:
        authors_data: Iterable of author dicts. Each may carry a ``papers``
            list and a ``papers_without_artifacts`` list of paper dicts;
            a missing or null list is treated as empty.
        existing_by_title: Mapping of normalized title to a previously
            indexed entry; that entry's ``"id"`` is reused.
        max_id: Newly seen papers get IDs starting at ``max_id + 1``.

    Returns (papers_list, norm_title_to_id dict). ``papers_list`` is
    sorted by ID.
    """
    # Collect unique papers from all authors
    seen = {}  # normalized_title -> paper dict
    for author in authors_data:
        # ``or []`` also covers a key that is present but null in the YAML.
        for paper in author.get("papers") or []:
            title = paper.get("title", "")
            norm = normalize_title(title)
            if not norm:
                continue
            entry = seen.get(norm)
            if entry is None:
                seen[norm] = {
                    "title": title,
                    "conference": paper.get("conference", ""),
                    "year": paper.get("year"),
                    "category": paper.get("category", ""),
                    "badges": paper.get("badges", []),
                    "artifact_citations": paper.get("artifact_citations", 0),
                }
            elif entry.get("has_artifact") is False:
                # An earlier author listed this paper without an artifact.
                # This listing proves one exists, so upgrade the placeholder
                # instead of leaving the result dependent on author order.
                entry["badges"] = paper.get("badges", [])
                entry["artifact_citations"] = paper.get("artifact_citations", 0)
                entry["has_artifact"] = True
            else:
                # Update citation count if higher
                existing_cit = entry.get("artifact_citations", 0) or 0
                new_cit = paper.get("artifact_citations", 0) or 0
                if new_cit > existing_cit:
                    entry["artifact_citations"] = new_cit

        # Also collect papers_without_artifacts; these never override an
        # entry that is already known.
        for paper in author.get("papers_without_artifacts") or []:
            title = paper.get("title", "")
            norm = normalize_title(title)
            if not norm or norm in seen:
                continue
            seen[norm] = {
                "title": title,
                "conference": paper.get("conference", ""),
                "year": paper.get("year"),
                "category": paper.get("category", ""),
                "badges": [],
                "artifact_citations": 0,
                "has_artifact": False,
            }

    # Assign IDs: preserve existing, assign new for unseen
    papers = []
    next_id = max_id + 1

    for norm_title, paper in seen.items():
        if norm_title in existing_by_title:
            paper["id"] = existing_by_title[norm_title]["id"]
        else:
            paper["id"] = next_id
            next_id += 1
        # Entries created from ``papers`` carry no flag yet: they have artifacts.
        paper.setdefault("has_artifact", True)
        papers.append(paper)

    papers.sort(key=lambda x: x["id"])
    norm_to_id = {normalize_title(p["title"]): p["id"] for p in papers}
    return papers, norm_to_id