Skip to content

generate_author_index

src.generators.authors.generate_author_index

Generate and maintain the canonical author index.

The author index is the single source of truth for author identity and affiliation. Every author gets a stable integer ID that never changes.

Reads
  • assets/data/authors.json (from generate_author_stats — names + display_names)
  • assets/data/author_index.json (previous index, if any — preserves IDs)
Writes
  • assets/data/author_index.json
Usage

python -m src.generators.authors.generate_author_index --data_dir ../reprodb.github.io/src

load_existing_index(path: Path) -> tuple[list, dict[str, dict], int]

Load the previous author index, return (list, name->entry dict, max_id).

Source code in src/generators/authors/generate_author_index.py
30
31
32
33
34
35
36
37
def load_existing_index(path: Path) -> tuple[list, dict[str, dict], int]:
    """Read the previously written author index from ``path``.

    Returns a 3-tuple ``(entries, by_name, max_id)``:

    * ``entries`` -- the loaded list exactly as returned by ``load_json``;
    * ``by_name`` -- the same entries keyed by their ``"name"`` value (if a
      name occurs more than once, the later entry wins);
    * ``max_id`` -- the largest ``"id"`` among the entries, or ``0`` when the
      list is empty.

    When ``path`` does not exist nothing is read and ``([], {}, 0)`` is
    returned.
    """
    if path.exists():
        previous = load_json(path)
        name_lookup = {record["name"]: record for record in previous}
        # default=0 keeps an empty (but present) index file from raising.
        highest_id = max((record["id"] for record in previous), default=0)
        return previous, name_lookup, highest_id
    return [], {}, 0

load_authors_json(path: Path) -> list[dict]

Load authors.json produced by generate_author_stats.

Source code in src/generators/authors/generate_author_index.py
40
41
42
43
44
def load_authors_json(path: Path) -> list[dict]:
    """Return the contents of the authors JSON file at ``path``.

    A missing file is not an error: an empty list is returned instead, and
    ``load_json`` is not called.
    """
    return load_json(path) if path.exists() else []

build_index(authors: list[dict], existing_by_name: dict[str, dict], max_id: int) -> list[dict]

Build a new index, preserving existing IDs and syncing affiliations.

Returns the updated index list.

Source code in src/generators/authors/generate_author_index.py
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def build_index(authors: list[dict], existing_by_name: dict[str, dict], max_id: int) -> list[dict]:
    """Build a new index, preserving the IDs of already-known authors.

    Every record in ``authors`` with a non-empty ``name`` yields exactly one
    index entry:

    * If the name is present in ``existing_by_name``, the previous entry is
      copied (including its ``id`` and affiliation fields, which are carried
      over unchanged) and only ``display_name`` and ``category`` are refreshed
      from the author record.
    * Otherwise a fresh entry is created with the next unused ID and empty
      affiliation fields.

    A name that appears more than once in ``authors`` is indexed only once
    (first occurrence wins); otherwise the same author would end up with
    several entries sharing one ID, or with several different IDs.

    Args:
        authors: Author records; the keys ``name``, ``display_name`` and
            ``category`` are read. Records with a missing/empty ``name`` are
            skipped.
        existing_by_name: Entries of the previous index keyed by name. The
            entries themselves are never mutated.
        max_id: Highest ID already handed out; new authors are numbered
            starting at ``max_id + 1``.

    Returns:
        The updated index list, sorted by ``id``.
    """
    index = []
    seen_names: set[str] = set()
    next_id = max_id + 1
    new_count = 0
    preserved_count = 0
    duplicate_count = 0

    for author in authors:
        name = author.get("name", "")
        if not name:
            continue

        # One entry per name: a repeated record must not mint a second ID
        # or duplicate an existing one.
        if name in seen_names:
            duplicate_count += 1
            continue
        seen_names.add(name)

        display_name = author.get("display_name") or clean_name(name)
        category = author.get("category", "systems")

        if name in existing_by_name:
            # Shallow-copy the entry and re-create its mutable members so the
            # caller's previous index is left untouched.
            entry = existing_by_name[name].copy()
            entry["affiliation_history"] = list(entry.get("affiliation_history", []))
            entry["external_ids"] = dict(entry.get("external_ids", {}))
            entry["display_name"] = display_name
            entry["category"] = category
            preserved_count += 1
        else:
            entry = {
                "id": next_id,
                "name": name,
                "display_name": display_name,
                "affiliation": "",
                "affiliation_source": "",
                "affiliation_updated": "",
                "affiliation_history": [],
                "external_ids": {},
                "category": category,
            }
            next_id += 1
            new_count += 1

        index.append(entry)

    # Sort by ID for stable output
    index.sort(key=lambda e: e["id"])

    if duplicate_count:
        logger.warning(f"Author index: skipped {duplicate_count} duplicate author record(s)")
    logger.info(f"Author index: {len(index)} total ({new_count} new, {preserved_count} preserved)")
    return index

generate_author_index(data_dir: str) -> dict

Main entry point: build/update the canonical author index.

Source code in src/generators/authors/generate_author_index.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
def generate_author_index(data_dir: str) -> dict:
    """Build or refresh the canonical author index under ``data_dir``.

    Steps:

    1. Resolve ``authors.json`` and ``author_index.json`` via
       ``resolve_data_path``.
    2. Load the authors; if there are none, log a warning and return
       ``{"total": 0, "new": 0}`` without writing anything.
    3. Load the previous index (if any), build the new one with
       ``build_index`` and write it, validated against ``AuthorIndexEntry``,
       to ``<data_dir>/assets/data/author_index.json`` (parent directories
       are created as needed).

    Returns:
        On success ``{"total": <entry count>, "path": <output path as str>}``.
        Note that the keys differ from the early-return case above.
    """
    root = Path(data_dir)

    authors_file = resolve_data_path(root, "authors.json")
    previous_index_file = resolve_data_path(root, "author_index.json")

    author_records = load_authors_json(authors_file)
    if not author_records:
        logger.warning(f"No authors found at {authors_file}")
        return {"total": 0, "new": 0}

    # Only the name lookup and the highest ID are needed from the old index.
    _previous_entries, known_by_name, highest_id = load_existing_index(previous_index_file)
    entries = build_index(author_records, known_by_name, highest_id)

    # Write to assets/data (canonical location)
    destination = root / "assets" / "data" / "author_index.json"
    destination.parent.mkdir(parents=True, exist_ok=True)
    save_validated_json(destination, entries, AuthorIndexEntry)

    logger.info(f"Wrote {len(entries)} entries to {destination}")
    return {"total": len(entries), "path": str(destination)}