Skip to content

author_index

src.utils.author_index

Utilities for reading and updating the canonical author index.

The author index (author_index.json) is the single source of truth for author identity (stable integer IDs) and affiliation data. Enrichers call update_author_affiliation() to write back discovered affiliations with proper source tracking and history.

load_author_index(data_dir: str) -> tuple[list, dict[str, dict]]

Load author_index.json and return (entries, name→entry dict).

data_dir is the website repo root (contains assets/data/).

Source code in src/utils/author_index.py
16
17
18
19
20
21
22
23
24
25
26
27
def load_author_index(data_dir: str) -> tuple[list, dict[str, dict]]:
    """Read the author index and build a by-name lookup over it.

    ``data_dir`` is the website repo root; the index is read from
    ``assets/data/author_index.json`` beneath it.

    Returns ``(entries, by_name)`` where ``entries`` is the parsed JSON
    content and ``by_name`` maps each entry's ``"name"`` to that same entry
    object.  If two entries share a name, the later one wins in ``by_name``.
    When the file does not exist, ``([], {})`` is returned.
    """
    index_file = os.path.join(data_dir, "assets", "data", "author_index.json")

    records: list = []
    lookup: dict[str, dict] = {}
    if os.path.exists(index_file):
        with open(index_file, "r", encoding="utf-8") as handle:
            records = json.load(handle)
        # The lookup values are the entry objects themselves, not copies.
        for record in records:
            lookup[record["name"]] = record
    return records, lookup

build_name_to_id(data_dir: str) -> dict[str, int]

Return a {name: author_id} dict. Returns an empty dict if the index file does not exist.

Source code in src/utils/author_index.py
30
31
32
33
def build_name_to_id(data_dir: str) -> dict[str, int]:
    """Map each author name in the index to that entry's ``"id"`` value.

    ``data_dir`` is passed straight to ``load_author_index``; when that
    yields no entries (e.g. the index file is missing) the result is ``{}``.
    """
    entries_by_name = load_author_index(data_dir)[1]
    name_to_id = {}
    for author_name, record in entries_by_name.items():
        name_to_id[author_name] = record["id"]
    return name_to_id

save_author_index(data_dir: str, entries: list[dict]) -> str

Write author_index.json back to disk. Returns the file path.

Source code in src/utils/author_index.py
36
37
38
39
40
41
42
def save_author_index(data_dir: str, entries: list[dict]) -> str:
    """Write ``author_index.json`` back to disk.  Returns the file path.

    ``data_dir`` is the website repo root; the file is written to
    ``assets/data/author_index.json`` beneath it, creating the directories
    if they do not exist.  Output is UTF-8, indented by 2 spaces, with
    non-ASCII characters written literally rather than escaped.

    If ``entries`` cannot be serialized, the ``json`` module's exception
    (e.g. ``TypeError``) propagates and any existing index file is left
    untouched.
    """
    path = os.path.join(data_dir, "assets", "data", "author_index.json")
    # Serialize *before* opening the file: opening with "w" truncates it
    # immediately, so a serialization error while streaming would otherwise
    # leave an empty or half-written index behind.
    payload = json.dumps(entries, indent=2, ensure_ascii=False)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(payload)
    return path

update_author_affiliation(entry: dict, new_affiliation: str, source: str, *, external_id_key: Optional[str] = None, external_id_value: Optional[str] = None) -> bool

Update an index entry's affiliation if it changed.

Sets affiliation, affiliation_source, affiliation_updated and appends the old value to affiliation_history when the affiliation actually changes.

Optionally records an external ID (e.g. dblp_pid, openalex_id).

Returns True if the entry was modified.

Source code in src/utils/author_index.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def _record_external_id(
    entry: dict,
    external_id_key: Optional[str],
    external_id_value: Optional[str],
) -> bool:
    """Store an external ID on ``entry``; return True only if it changed."""
    if not (external_id_key and external_id_value):
        return False
    external_ids = entry.setdefault("external_ids", {})
    if external_ids.get(external_id_key) == external_id_value:
        # Already recorded with the same value — not a modification.
        return False
    external_ids[external_id_key] = external_id_value
    return True


def update_author_affiliation(
    entry: dict,
    new_affiliation: str,
    source: str,
    *,
    external_id_key: Optional[str] = None,
    external_id_value: Optional[str] = None,
) -> bool:
    """Update an index entry's affiliation if it changed.

    ``entry`` is mutated in place:

    * When ``new_affiliation`` differs from the stored ``affiliation``, the
      previous value (if non-empty) is appended to ``affiliation_history``
      together with its source and date, and ``affiliation``,
      ``affiliation_source`` and ``affiliation_updated`` are overwritten.
    * When the affiliation is the same but ``source`` differs, only
      ``affiliation_source`` and ``affiliation_updated`` are refreshed.
    * When ``new_affiliation`` is empty, the affiliation fields are left
      alone.

    Optionally records an external ID (e.g. ``dblp_pid``, ``openalex_id``)
    under ``entry["external_ids"]``; both ``external_id_key`` and
    ``external_id_value`` must be non-empty for this to happen.

    Returns True if the entry was modified.  Re-supplying an external ID
    that is already stored with the same value does not count as a
    modification.
    """
    if not new_affiliation:
        # Caller found no affiliation, but an external ID may still be new.
        return _record_external_id(entry, external_id_key, external_id_value)

    # Local date (naive ``datetime.now()``), formatted as YYYY-MM-DD.
    today = datetime.now().strftime("%Y-%m-%d")
    old_affiliation = entry.get("affiliation", "")
    changed = False

    if new_affiliation != old_affiliation:
        # Preserve the outgoing value with the provenance it had at the time.
        if old_affiliation:
            entry.setdefault("affiliation_history", []).append(
                {
                    "affiliation": old_affiliation,
                    "source": entry.get("affiliation_source", ""),
                    "date": entry.get("affiliation_updated", ""),
                }
            )
        entry["affiliation"] = new_affiliation
        entry["affiliation_source"] = source
        entry["affiliation_updated"] = today
        changed = True
    elif entry.get("affiliation_source", "") != source:
        # Same affiliation but different/better source
        entry["affiliation_source"] = source
        entry["affiliation_updated"] = today
        changed = True

    if _record_external_id(entry, external_id_key, external_id_value):
        changed = True

    return changed