Skip to content

author_index

src.utils.normalization.author_index

Utilities for reading and updating the canonical author index.

The author index (author_index.json) is the single source of truth for author identity (stable integer IDs) and affiliation data. Enrichers call update_author_affiliation() to write back discovered affiliations with proper source tracking and history.

load_author_index(data_dir: str) -> tuple[list, dict[str, dict]]

Load author_index.json and return (entries, name→entry dict).

data_dir is the website repo root (contains _build/ and assets/data/).

Source code in src/utils/normalization/author_index.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
def load_author_index(data_dir: str) -> tuple[list, dict[str, dict]]:
    """Load ``author_index.json`` and return (entries, name→entry dict).

    ``data_dir`` is the website repo root (contains ``_build/`` and ``assets/data/``).
    """
    from pathlib import Path

    path = resolve_data_path(Path(data_dir), "author_index.json")
    if not path.exists():
        return [], {}
    with open(path, "r", encoding="utf-8") as f:
        entries = json.load(f)
    by_name = {e["name"]: e for e in entries}
    return entries, by_name

build_name_to_id(data_dir: str) -> dict[str, int]

Return a {name: author_id} dict. Returns empty dict if no index.

Source code in src/utils/normalization/author_index.py
34
35
36
37
def build_name_to_id(data_dir: str) -> dict[str, int]:
    """Return a ``{name: author_id}`` dict. Returns empty dict if no index."""
    _, by_name = load_author_index(data_dir)
    return {name: entry["id"] for name, entry in by_name.items()}

save_author_index(data_dir: str, entries: list[dict]) -> str

Write author_index.json back to disk. Returns the file path.

Source code in src/utils/normalization/author_index.py
40
41
42
43
44
45
46
def save_author_index(data_dir: str, entries: list[dict]) -> str:
    """Write ``author_index.json`` back to disk.  Returns the file path."""
    path = os.path.join(data_dir, "_build", "author_index.json")
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(entries, f, indent=2, ensure_ascii=False)
    return path

update_author_affiliation(entry: dict, new_affiliation: str, source: str, *, external_id_key: Optional[str] = None, external_id_value: Optional[str] = None) -> bool

Update an index entry's affiliation if it changed.

Sets affiliation, affiliation_source, affiliation_updated and appends the old value to affiliation_history when the affiliation actually changes.

Optionally records an external ID (e.g. dblp_pid, openalex_id).

Returns True if the entry was modified.

Source code in src/utils/normalization/author_index.py
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
def update_author_affiliation(
    entry: dict,
    new_affiliation: str,
    source: str,
    *,
    external_id_key: Optional[str] = None,
    external_id_value: Optional[str] = None,
) -> bool:
    """Update an index entry's affiliation if it changed.

    Sets ``affiliation``, ``affiliation_source``, ``affiliation_updated``
    and appends the old value to ``affiliation_history`` when the affiliation
    actually changes.

    Optionally records an external ID (e.g. ``dblp_pid``, ``openalex_id``).

    Returns True if the entry was modified.
    """
    if not new_affiliation:
        # Nothing to do — caller found no affiliation
        # But still record external ID if provided
        if external_id_key and external_id_value:
            entry.setdefault("external_ids", {})[external_id_key] = external_id_value
            return True
        return False

    today = datetime.now().strftime("%Y-%m-%d")
    old_affiliation = entry.get("affiliation", "")
    changed = False

    if new_affiliation != old_affiliation:
        # Record old value in history
        if old_affiliation:
            entry.setdefault("affiliation_history", []).append(
                {
                    "affiliation": old_affiliation,
                    "source": entry.get("affiliation_source", ""),
                    "date": entry.get("affiliation_updated", ""),
                }
            )
        entry["affiliation"] = new_affiliation
        entry["affiliation_source"] = source
        entry["affiliation_updated"] = today
        changed = True
    elif entry.get("affiliation_source", "") != source:
        # Same affiliation but different/better source
        entry["affiliation_source"] = source
        entry["affiliation_updated"] = today
        changed = True

    if external_id_key and external_id_value:
        entry.setdefault("external_ids", {})[external_id_key] = external_id_value
        changed = True

    return changed