Skip to content

affiliation

src.utils.normalization.affiliation

Shared affiliation normalization utilities.

Provides normalize_affiliation() (and its helpers) which apply regex rules from data/affiliation_rules.yaml, strip sub-unit details, and collapse trailing location tokens so that different spellings of the same institution map to one canonical name.

normalize_affiliation(affiliation: str) -> str

Normalize affiliation string to a canonical form.

Strategy:

1. Try each regex rule; if one matches, return its canonical name.
2. Strip sub-units after a university name core.
3. Strip trailing location tokens (city, state, country).

Source code in src/utils/normalization/affiliation.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def normalize_affiliation(affiliation: str) -> str:
    """Map an affiliation string onto a canonical institution name.

    The steps are tried in order and the first one that produces a result
    wins:

    1. A leading "The " / "the " is removed, then every
       ``(pattern, canonical)`` pair in ``_AFFILIATION_RULES`` is searched
       against the text; the canonical name of the first hit is returned.
    2. If the text starts with a university-style name that is directly
       followed by ``,`` or ``(``, only that name is returned and the
       trailing sub-unit details are dropped.
    3. Otherwise the text is passed to ``_strip_trailing_location``.

    Falsy or whitespace-only input yields an empty string.
    """
    text = affiliation.strip() if affiliation else ""
    if not text:
        return ""

    # Remove a leading article so the YAML rules don't each need a
    # ^The\s+ variant of their own.
    text = re.sub(r"^[Tt]he\s+", "", text)

    # Step 1: explicit rules take priority over every generic heuristic.
    for rule, canonical_name in _AFFILIATION_RULES:
        if rule.search(text):
            return canonical_name

    # Step 2: generic institution-name cores, anchored at the start of the
    # text and terminated by the first "," or "(".
    # Keyword first, e.g. "University of <Name>, ...".
    keyword_first = (
        r"((?:The\s+)?(?:University|Universität|Universidade|Università|Université)"
        r"\s+(?:of\s+)?[\w''\-\–\—.]+(?:\s+[\w''\-\–\—.]+){0,4}?)"
        r"\s*[,(]"
    )
    # Keyword last, e.g. "Tsinghua University, ...".
    keyword_last = (
        r"([\w''\-\–\—.]+(?:\s+[\w''\-\–\—.]+){0,4}?\s+"
        r"(?:University|Institute|Universität|Polytechnic|College))"
        r"\s*[,(]"
    )
    for shape in (keyword_first, keyword_last):
        found = re.match(shape, text, re.IGNORECASE)
        if found is None:
            continue
        name_core = found.group(1).strip()
        # Cores of 10 characters or fewer are too short to be a
        # meaningful name; fall through to the next strategy instead.
        if len(name_core) > 10:
            return name_core

    # Step 3: nothing structural matched, so only trim the trailing
    # location tokens (city, state, country).
    return _strip_trailing_location(text)