Skip to content

generate_author_index

src.generators.generate_author_index

Generate and maintain the canonical author index.

The author index is the single source of truth for author identity and affiliation. Every author gets a stable integer ID that never changes.

Reads
  • assets/data/authors.json (from generate_author_stats — names + display_names)
  • assets/data/author_index.json (previous index, if any — preserves IDs)
Writes
  • assets/data/author_index.json
Usage

python -m src.generators.generate_author_index --data_dir ../reprodb.github.io

load_existing_index(path)

Load the previous author index, return (list, name->entry dict, max_id).

Source code in src/generators/generate_author_index.py
28
29
30
31
32
33
34
35
36
def load_existing_index(path):
    """Read the previously written author index from *path*.

    Returns a 3-tuple ``(entries, by_name, max_id)``:

    * ``entries`` -- the parsed JSON content of the file,
    * ``by_name`` -- a mapping from each entry's ``"name"`` to the entry
      itself (a later entry with the same name replaces an earlier one),
    * ``max_id`` -- the largest ``"id"`` among the entries, or ``0`` when
      there are none.

    A missing file is not an error and yields ``([], {}, 0)``.
    """
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as handle:
            records = json.load(handle)

        # Name lookup lets the caller find an author's previous entry directly.
        lookup = {}
        for record in records:
            lookup[record["name"]] = record

        highest_id = max([record["id"] for record in records], default=0)
        return records, lookup, highest_id

    # No previous index on disk: start from an empty state.
    return [], {}, 0

load_authors_json(path)

Load authors.json produced by generate_author_stats.

Source code in src/generators/generate_author_index.py
39
40
41
42
43
44
def load_authors_json(path):
    """Return the parsed content of the authors JSON file at *path*.

    The file is read as UTF-8. When *path* does not exist an empty list is
    returned instead of raising.
    """
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    return []

build_index(authors: list[dict], existing_by_name: dict[str, dict], max_id: int) -> tuple[list[dict], dict]

Build a new index, preserving existing IDs and syncing affiliations.

When an enricher updates authors.json with a new affiliation, we detect the change here, update the index entry, and record the old value in affiliation_history.

Returns (index_list, stats_dict).

Source code in src/generators/generate_author_index.py
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
def build_index(
    authors: list[dict], existing_by_name: dict[str, dict], max_id: int
) -> tuple[list[dict], dict]:
    """Build a new index, preserving existing IDs and syncing affiliations.

    When an enricher updates authors.json with a new affiliation, we detect
    the change here, update the index entry, and record the old value in
    affiliation_history.

    Args:
        authors: Author records; each is read for ``name``, ``display_name``,
            ``category`` and ``affiliation``. Records without a name are
            skipped, and only the first record for a given name is used.
        existing_by_name: Previous index entries keyed by name. Entries are
            copied before being modified, so this mapping is left untouched.
        max_id: Highest ID already handed out; new authors are numbered from
            ``max_id + 1`` upwards.

    Returns:
        ``(index_list, stats_dict)``. ``index_list`` is sorted by ``id`` and
        contains one entry per distinct name in ``authors`` (names that are
        only present in ``existing_by_name`` are not carried over).
        ``stats_dict`` has the keys ``total``, ``preserved``, ``new``,
        ``affiliation_changed``, ``max_id`` and ``with_affiliation``.
    """
    today = datetime.now().strftime("%Y-%m-%d")
    index = []
    # A name must map to exactly one ID; a repeated name would otherwise get
    # a second fresh ID or a second row sharing the preserved ID.
    seen_names = set()
    next_id = max_id + 1
    new_count = 0
    preserved_count = 0
    affiliation_changed_count = 0

    for author in authors:
        name = author.get("name", "")
        if not name or name in seen_names:
            continue
        seen_names.add(name)

        display_name = author.get("display_name", name)
        category = author.get("category", "")
        new_affiliation = author.get("affiliation", "")

        if name in existing_by_name:
            entry = existing_by_name[name].copy()
            # Ensure lists/dicts aren't shared references
            entry["affiliation_history"] = list(entry.get("affiliation_history", []))
            entry["external_ids"] = dict(entry.get("external_ids", {}))

            # Update display_name and category in case they changed
            # (an empty incoming category never erases a stored one)
            entry["display_name"] = display_name
            if category:
                entry["category"] = category

            # Detect affiliation change from enrichers; an empty incoming
            # affiliation is ignored so a stored value is never blanked out
            old_affiliation = entry.get("affiliation", "")
            if new_affiliation and new_affiliation != old_affiliation:
                # Record old value in history (if there was one)
                if old_affiliation:
                    entry["affiliation_history"].append(
                        {
                            "affiliation": old_affiliation,
                            "source": entry.get("affiliation_source", ""),
                            "date": entry.get("affiliation_updated", ""),
                        }
                    )
                entry["affiliation"] = new_affiliation
                entry["affiliation_updated"] = today
                # We don't know which enricher set it here;
                # enrichers can set affiliation_source directly later.
                # NOTE(review): a non-empty previous source is kept as-is for
                # the new affiliation -- confirm this is intended.
                if not entry.get("affiliation_source"):
                    entry["affiliation_source"] = "enriched"
                affiliation_changed_count += 1
            preserved_count += 1
        else:
            # New author — assign next ID
            entry = {
                "id": next_id,
                "name": name,
                "display_name": display_name,
                "affiliation": new_affiliation,
                "affiliation_source": "",
                "affiliation_updated": "",
                "affiliation_history": [],
                "external_ids": {},
                "category": category,
            }
            if new_affiliation:
                entry["affiliation_source"] = "dblp"
                entry["affiliation_updated"] = today

            next_id += 1
            new_count += 1

        index.append(entry)

    # Sort by ID for stable output
    index.sort(key=lambda e: e["id"])

    stats = {
        "total": len(index),
        "preserved": preserved_count,
        "new": new_count,
        "affiliation_changed": affiliation_changed_count,
        "max_id": max((e["id"] for e in index), default=0),
        "with_affiliation": sum(1 for e in index if e.get("affiliation")),
    }
    return index, stats