Skip to content

conference

src.utils.normalization.conference

Shared conference metadata, area classification, and name-normalization helpers.

Every generator / enricher should import from here rather than redefining its own SYSTEMS_CONFS, SECURITY_CONFS, _conf_area(), _extract_conf_year(), or name-cleaning functions.

discover_conferences(website_root: str | None = None) -> tuple[frozenset[str], frozenset[str]]

Return (systems, security) conference sets from the website.

Falls back to auto-detection of the website root, then to built-in defaults if the directory is not found (e.g. in tests). Discovered conferences are merged with the built-in fallbacks so that known conferences are always classified even before the pipeline auto-generates their pages.

Source code in src/utils/normalization/conference.py
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
def discover_conferences(website_root: str | None = None) -> tuple[frozenset[str], frozenset[str]]:
    """Return the ``(systems, security)`` conference sets.

    The website checkout at *website_root* is used when given; otherwise the
    root is auto-detected via ``_find_website_root``.  When that directory
    exists, the result of ``_scan_area_confs`` for each area is unioned with
    the built-in fallback set, so known conferences stay classified even
    before the pipeline auto-generates their pages.  When no usable
    directory is available (e.g. in tests), the built-in fallback sets are
    returned as-is.
    """
    site_root = website_root or _find_website_root()

    discovered = None
    if site_root and os.path.isdir(site_root):
        systems = _scan_area_confs(site_root, "systems") | _FALLBACK_SYSTEMS
        security = _scan_area_confs(site_root, "security") | _FALLBACK_SECURITY
        if systems or security:
            discovered = (systems, security)

    if discovered is None:
        # Tests / CI typically run without the website repo checked out.
        logger.debug("Website directory not found; using built-in conference defaults")
        return _FALLBACK_SYSTEMS, _FALLBACK_SECURITY

    systems, security = discovered
    logger.debug(
        "Discovered conferences from %s: systems=%s, security=%s",
        site_root,
        sorted(systems),
        sorted(security),
    )
    return discovered

refresh_conference_sets(website_root: str | None = None) -> None

Re-scan the website directory and update the module-level conference sets.

Call this after `ensure_conference_pages` has created new files.

Source code in src/utils/normalization/conference.py
101
102
103
104
105
106
107
108
def refresh_conference_sets(website_root: str | None = None) -> None:
    """Rebuild the module-level conference sets from the website directory.

    Re-runs :func:`discover_conferences` and rebinds ``SYSTEMS_CONFS``,
    ``SECURITY_CONFS`` and ``ALL_CONFS`` (their union).  Meant to be called
    once :func:`ensure_conference_pages` has written new page files.
    """
    global SYSTEMS_CONFS, SECURITY_CONFS, ALL_CONFS  # noqa: PLW0603
    systems, security = discover_conferences(website_root)
    SYSTEMS_CONFS = systems
    SECURITY_CONFS = security
    ALL_CONFS = systems | security

ensure_conference_pages(sys_dirs: set[str] | None = None, sec_dirs: set[str] | None = None, website_root: str | None = None) -> list[str]

Create <conf>.md pages for conferences discovered in artifact sites.

For each conference-year directory (e.g. vehiclesec2026) found in the sysartifacts / secartifacts repos, extract the conference name prefix and ensure a corresponding {website_root}/content/{area}/{conf}.md page exists. Missing pages are created from a standard template.

Parameters

sys_dirs, sec_dirs: Sets of directory names (e.g. {"osdi2024", "sosp2023"}). If None, they are fetched from the GitHub API via `src.scrapers.repo_utils.get_conferences_from_prefix`.

website_root: Path to the reprodb.github.io checkout. Auto-detected if None.

Returns

list[str] Paths of newly created .md files.

Source code in src/utils/normalization/conference.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
def ensure_conference_pages(
    sys_dirs: set[str] | None = None,
    sec_dirs: set[str] | None = None,
    website_root: str | None = None,
) -> list[str]:
    """Create ``<conf>.md`` pages for conferences discovered in artifact sites.

    For each conference-year directory name (e.g. ``vehiclesec2026``), the
    conference prefix is extracted with :func:`parse_conf_year` and a page at
    ``{website_root}/content/{area}/{conf}.md`` is written from
    ``_CONF_PAGE_TEMPLATE`` unless a page for that conference already exists
    (file stems are compared case-insensitively).  Directory names without a
    parseable year are ignored, and an area whose ``content/<area>``
    directory does not exist is skipped entirely.  If at least one page was
    created, :func:`refresh_conference_sets` is called so the module-level
    conference sets are rebuilt.

    Parameters
    ----------
    sys_dirs, sec_dirs:
        Sets of directory names (e.g. ``{"osdi2024", "sosp2023"}``).
        If *None*, they are fetched from the GitHub API via
        :func:`~src.scrapers.repo_utils.get_conferences_from_prefix`.
    website_root:
        Path to the ``reprodb.github.io`` checkout.  Auto-detected if *None*.

    Returns
    -------
    list[str]
        Paths of newly created ``.md`` files (empty when the website root
        cannot be found).
    """
    root = website_root or _find_website_root()
    if not root or not os.path.isdir(root):
        logger.debug("Website root not found; skipping conference page creation")
        return []

    # Lazy-import to avoid pulling network dependencies at module load time.
    if sys_dirs is None or sec_dirs is None:
        from src.scrapers.repo_utils import get_conferences_from_prefix

        if sys_dirs is None:
            sys_dirs = {item["name"] for item in get_conferences_from_prefix("sys")}
        if sec_dirs is None:
            sec_dirs = {item["name"] for item in get_conferences_from_prefix("sec")}

    created: list[str] = []
    for area, dirs in [("systems", sys_dirs), ("security", sec_dirs)]:
        area_dir = os.path.join(root, "content", area)
        if not os.path.isdir(area_dir):
            continue
        # Upper-cased stems of the pages already present, for a
        # case-insensitive "does this conference have a page" check.
        existing = {fname[:-3].upper() for fname in os.listdir(area_dir) if fname.endswith(".md")}
        seen_prefixes: set[str] = set()
        # Sorted so pages are created (and logged) in a deterministic order.
        for dir_name in sorted(dirs):
            conf_upper, year = parse_conf_year(dir_name)
            # Several years of one conference map to a single page.
            if year is None or conf_upper in seen_prefixes:
                continue
            seen_prefixes.add(conf_upper)
            if conf_upper in existing:
                continue
            slug = conf_upper.lower()
            display_name = CONF_DISPLAY_NAMES.get(conf_upper, conf_upper)
            page_path = os.path.join(area_dir, f"{slug}.md")
            content = _CONF_PAGE_TEMPLATE.format(
                display_name=display_name,
                area=area,
                slug=slug,
                conf_upper=conf_upper,
            )
            # Explicit UTF-8: the default encoding of open() is
            # locale-dependent, which could fail or produce mis-encoded pages
            # if the template or a display name contains non-ASCII text.
            with open(page_path, "w", encoding="utf-8") as fh:
                fh.write(content)
            logger.info("Created conference page: %s", page_path)
            created.append(page_path)

    if created:
        refresh_conference_sets(root)
    return created

canonicalize_name(name: str) -> str

Map known name aliases to their canonical form.

Source code in src/utils/normalization/conference.py
237
238
239
240
241
242
def canonicalize_name(name: str) -> str:
    """Return the canonical form of *name* if it matches a known alias.

    The ``_NAME_ALIASES`` entries are tried in order; the first pattern that
    is found anywhere in *name* decides the result.  A name matching no
    alias is returned unchanged.
    """
    hits = (canonical for pattern, canonical in _NAME_ALIASES if pattern.search(name))
    return next(hits, name)

conf_area(conf_name: str) -> str

Return 'systems', 'security', or 'unknown'.

Accepts a bare conference name ('OSDI') or a conf-year string ('osdi2024'). Any casing is accepted.

Source code in src/utils/normalization/conference.py
245
246
247
248
249
250
251
252
253
254
255
256
def conf_area(conf_name: str) -> str:
    """Return ``'systems'``, ``'security'``, or ``'unknown'``.

    Accepts a bare conference name (``'OSDI'``) **or** a conf-year string
    (``'osdi2024'``).  Any casing is accepted, and surrounding whitespace
    (including a trailing ``'\\r\\n'``) is ignored.
    """
    # Trim *before* removing the year: the pattern is end-anchored, so
    # trailing whitespace would otherwise shield the digits and the lookup
    # below would see e.g. 'OSDI2024'.  The second strip handles inputs such
    # as 'osdi 2024', where a space is left behind once the digits are gone.
    key = re.sub(r"\d+$", "", conf_name.strip()).strip().upper()
    if key in SYSTEMS_CONFS:
        return "systems"
    if key in SECURITY_CONFS:
        return "security"
    return "unknown"

parse_conf_year(conf_year_str: str) -> tuple[str, int | None]

Parse 'osdi2024' → ('OSDI', 2024).

Returns (name_upper, year_int) on success, or (conf_year_str.upper(), None) on failure.

Source code in src/utils/normalization/conference.py
264
265
266
267
268
269
270
271
272
273
def parse_conf_year(conf_year_str: str) -> tuple[str, int | None]:
    """Split a conf-year string, e.g. ``'osdi2024'`` → ``('OSDI', 2024)``.

    When ``_CONF_YEAR_RE`` matches, its first group is returned upper-cased
    and its second group as an ``int``.  Otherwise the whole input is
    returned upper-cased, paired with ``None``.
    """
    match = _CONF_YEAR_RE.match(conf_year_str)
    if match is None:
        return conf_year_str.upper(), None
    name, year = match.group(1), match.group(2)
    return name.upper(), int(year)

clean_name(name: str) -> str

Remove DBLP disambiguation suffixes and collapse whitespace.

'Jane Doe 0001' → 'Jane Doe'

Source code in src/utils/normalization/conference.py
279
280
281
282
283
284
285
286
287
288
289
def clean_name(name: str) -> str:
    """Remove DBLP disambiguation suffixes and collapse whitespace.

    ``'Jane Doe 0001'`` → ``'Jane Doe'``

    Whitespace is normalised *before* the suffix is removed, so trailing
    spaces or newlines (e.g. ``'Jane Doe 0001\\n'``) do not prevent the
    end-anchored suffix pattern from matching.  Falsy input yields ``''``.
    """
    if not name:
        return ""
    # Collapse every whitespace run (tabs and newlines included) to a single
    # space and drop trailing whitespace so the suffix sits at the very end.
    name = re.sub(r"\s+", " ", name).rstrip()
    # DBLP homonym suffix: whitespace followed by exactly four digits.
    name = re.sub(r"\s+\d{4}$", "", name)
    return name.strip()

normalize_name(name: str, *, strip_initials: bool = False) -> str

Aggressive normalisation for cross-source matching.

Lower-cases, strips accents, removes dots, collapses whitespace. Applies name alias canonicalisation first so that known aliases (e.g. 'Bogdan "Bo" Stoica' → 'Bogdan Alexandru Stoica') collapse to the same normalised key. Optionally strips single-letter initials (e.g. "J. Doe" → "Doe") and leading underscores for ranking deduplication.

Source code in src/utils/normalization/conference.py
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
def normalize_name(name: str, *, strip_initials: bool = False) -> str:
    """Build an aggressively normalised key for cross-source name matching.

    Steps, in order: alias canonicalisation via :func:`canonicalize_name`
    (so e.g. ``'Bogdan "Bo" Stoica'`` and ``'Bogdan Alexandru Stoica'``
    share a key), trimming and lower-casing, NFKD decomposition with
    combining marks dropped (accent removal), dot removal, removal of a
    trailing four-digit suffix, and finally whitespace collapsing.

    With ``strip_initials=True``, single letters followed by whitespace
    (e.g. "J. Doe" → "Doe") and leading underscores are removed as well,
    for ranking deduplication.  Falsy input yields ``''``.
    """
    if not name:
        return ""

    key = canonicalize_name(name).strip().lower()
    decomposed = unicodedata.normalize("NFKD", key)
    key = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    key = re.sub(r"\.", "", key)
    key = re.sub(r"\s+\d{4}$", "", key)
    if strip_initials:
        key = re.sub(r"\b[a-z]\s+", "", key).lstrip("_").strip()
    return re.sub(r"\s+", " ", key).strip()

normalize_title(title: str) -> str

Normalize a paper title for fuzzy matching.

Decodes HTML entities, normalizes Unicode (NFKD, strip combining marks), lower-cases, strips punctuation (keeping word chars and spaces), and collapses whitespace. Used for deduplication and cross-source title matching.

Source code in src/utils/normalization/conference.py
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
def normalize_title(title: str) -> str:
    """Reduce a paper title to a key suitable for fuzzy matching.

    The title is HTML-unescaped, NFKD-normalised with combining marks
    removed, stripped of known trailing scraping artifacts, lower-cased,
    stripped of everything that is neither a word character nor whitespace,
    and finally whitespace-collapsed.  Used for deduplication and
    cross-source title matching.  Falsy input yields ``''``.
    """
    import html

    if not title:
        return ""

    # "&amp;" -> "&" and friends.
    decoded = html.unescape(title)
    # Compatibility decomposition (e.g. the U+FB01 "fi" ligature becomes the
    # two letters "fi"); accents end up as separate combining code points,
    # which are then dropped.
    decomposed = unicodedata.normalize("NFKD", decoded)
    text = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    # Trailing scraping residue, then trailing tags such as "[Artifacts]".
    for trailing_artifact in (r"\s*full strip note\s*$", r"\s*\[Artifacts?\]\s*$"):
        text = re.sub(trailing_artifact, "", text, flags=re.IGNORECASE)
    words = re.sub(r"[^\w\s]", "", text.lower()).split()
    return " ".join(words)

venue_to_conference(booktitle: str) -> str | None

Map a DBLP booktitle to our conference identifier, or None.

Source code in src/utils/normalization/conference.py
376
377
378
379
380
381
382
383
384
385
386
387
388
389
def venue_to_conference(booktitle: str) -> str | None:
    """Map a DBLP booktitle to our conference identifier, or None."""
    if not booktitle:
        return None
    bt = booktitle.strip()

    # Handle SC explicitly to avoid false positives (e.g., matching inside "ACSAC")
    if bt == "SC" or bt.startswith("SC "):
        return "SC"

    for pattern, conf in DBLP_VENUE_MAP.items():
        if pattern in booktitle:
            return conf
    return None

clean_member_name(raw_name: str) -> str | None

Clean a committee member name.

Strips markdown links, trailing <br> tags, and skips placeholder names. Returns the cleaned name, or None if the entry should be dropped.

Source code in src/utils/normalization/conference.py
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
def clean_member_name(raw_name: str) -> str | None:
    """Clean a committee member name.

    If ``_MARKDOWN_LINK`` matches, the name is replaced by its first group
    (presumably the link text — confirm against the pattern).  ``_BR_TAG``
    matches and trailing superscript footnote markers are removed.  The
    entry is dropped (``None``) when it is a placeholder, is at most one
    character long, looks like contact information, mentions an award /
    distinction / participation note, or begins with a superscript footnote
    marker.  Otherwise the name is passed through :func:`clean_name`.
    """
    # One marker set for both the trailing strip and the leading-marker
    # check, so footnote lines numbered 6 and above are dropped as well.
    footnote_marks = "¹²³⁴⁵⁶⁷⁸⁹⁰"

    name = raw_name.strip()
    link_match = _MARKDOWN_LINK.match(name)
    if link_match:
        name = link_match.group(1)
    name = _BR_TAG.sub("", name).strip()
    # Strip trailing footnote markers (e.g., "Cen Zhang¹" → "Cen Zhang")
    name = name.rstrip(footnote_marks).strip()

    lower = name.lower()
    if lower in PLACEHOLDER_NAMES or len(name) <= 1:
        return None
    # Contact lines rather than people.
    if any(token in lower for token in ("contact", "reach", "mailto:")):
        return None
    # Award / annotation lines rather than people.
    if any(token in lower for token in ("award", "distinguished", "participated in")):
        return None
    # Footnote text: a superscript digit at the start of the line.
    if name.startswith(tuple(footnote_marks)):
        return None
    return clean_name(name)