conference¶

`src.utils.normalization.conference` ¶

Shared conference metadata, area classification, and name-normalization helpers.

Every generator / enricher should import from here rather than redefining its own SYSTEMS_CONFS, SECURITY_CONFS, _conf_area(), _extract_conf_year(), or name-cleaning functions.

`discover_conferences(website_root: str | None = None) -> tuple[frozenset[str], frozenset[str]]` ¶

Return (systems, security) conference sets from the website.

Falls back to auto-detection of the website root, then to built-in defaults if the directory is not found (e.g. in tests). Discovered conferences are merged with the built-in fallbacks so that known conferences are always classified even before the pipeline auto-generates their pages.

Source code in src/utils/normalization/conference.py

def discover_conferences(website_root: str | None = None) -> tuple[frozenset[str], frozenset[str]]:
    """Build the ``(systems, security)`` conference sets.

    The website checkout is *website_root* when given, otherwise whatever
    ``_find_website_root()`` reports.  If that path is an existing
    directory, each area is scanned and unioned with its built-in fallback
    set, so known conferences stay classified even when their pages have
    not been generated yet.  In every other case (e.g. tests / CI without
    the website repo) the built-in fallback sets are returned unchanged.
    """
    site_dir = website_root or _find_website_root()
    site_available = bool(site_dir) and os.path.isdir(site_dir)

    if site_available:
        systems = _scan_area_confs(site_dir, "systems") | _FALLBACK_SYSTEMS
        security = _scan_area_confs(site_dir, "security") | _FALLBACK_SECURITY
        # Only trust the scan when it yields at least one conference.
        if systems or security:
            logger.debug(
                "Discovered conferences from %s: systems=%s, security=%s",
                site_dir,
                sorted(systems),
                sorted(security),
            )
            return systems, security

    # Nothing usable on disk: fall back to the built-in defaults.
    logger.debug("Website directory not found; using built-in conference defaults")
    return _FALLBACK_SYSTEMS, _FALLBACK_SECURITY

`refresh_conference_sets(website_root: str | None = None) -> None` ¶

Re-scan the website directory and update the module-level conference sets.

Call this after `ensure_conference_pages` has created new files.

Source code in src/utils/normalization/conference.py

def refresh_conference_sets(website_root: str | None = None) -> None:
    """Rebuild the module-level conference sets from the website directory.

    Intended to be called once :func:`ensure_conference_pages` has written
    new page files, so that ``SYSTEMS_CONFS``, ``SECURITY_CONFS`` and
    ``ALL_CONFS`` reflect them.
    """
    global SYSTEMS_CONFS, SECURITY_CONFS, ALL_CONFS  # noqa: PLW0603
    systems, security = discover_conferences(website_root)
    SYSTEMS_CONFS = systems
    SECURITY_CONFS = security
    # The combined set is always derived from the two area sets.
    ALL_CONFS = systems | security

`ensure_conference_pages(sys_dirs: set[str] | None = None, sec_dirs: set[str] | None = None, website_root: str | None = None) -> list[str]` ¶

Create <conf>.md pages for conferences discovered in artifact sites.

For each conference-year directory (e.g. vehiclesec2026) found in the sysartifacts / secartifacts repos, extract the conference name prefix and ensure a corresponding {website_root}/content/{area}/{conf}.md page exists. Missing pages are created from a standard template.

Parameters¶

sys_dirs, sec_dirs: Sets of directory names (e.g. {"osdi2024", "sosp2023"}). If None, they are fetched from the GitHub API via `src.scrapers.repo_utils.get_conferences_from_prefix`.

website_root: Path to the reprodb.github.io checkout. Auto-detected if None.

Returns¶

list[str] Paths of newly created .md files.

Source code in src/utils/normalization/conference.py

def ensure_conference_pages(
    sys_dirs: set[str] | None = None,
    sec_dirs: set[str] | None = None,
    website_root: str | None = None,
) -> list[str]:
    """Create ``<conf>.md`` pages for conferences discovered in artifact sites.

    For each conference-year directory (e.g. ``vehiclesec2026``) found in the
    sysartifacts / secartifacts repos, extract the conference name prefix and
    ensure a corresponding ``{website_root}/content/{area}/{conf}.md`` page
    exists.  Missing pages are created from a standard template and written
    as UTF-8.  Areas whose ``content/{area}`` directory does not exist are
    skipped.  If at least one page is created, the module-level conference
    sets are refreshed before returning.

    Parameters
    ----------
    sys_dirs, sec_dirs:
        Sets of directory names (e.g. ``{"osdi2024", "sosp2023"}``).
        If *None*, they are fetched from the GitHub API via
        :func:`~src.scrapers.repo_utils.get_conferences_from_prefix`.
    website_root:
        Path to the ``reprodb.github.io`` checkout.  Auto-detected if *None*.

    Returns
    -------
    list[str]
        Paths of newly created ``.md`` files (empty when the website root
        cannot be found).
    """
    root = website_root or _find_website_root()
    if not root or not os.path.isdir(root):
        logger.debug("Website root not found; skipping conference page creation")
        return []

    # Lazy-import to avoid pulling network dependencies at module load time.
    if sys_dirs is None or sec_dirs is None:
        from src.scrapers.repo_utils import get_conferences_from_prefix

        if sys_dirs is None:
            sys_dirs = {item["name"] for item in get_conferences_from_prefix("sys")}
        if sec_dirs is None:
            sec_dirs = {item["name"] for item in get_conferences_from_prefix("sec")}

    created: list[str] = []
    for area, dirs in [("systems", sys_dirs), ("security", sec_dirs)]:
        area_dir = os.path.join(root, "content", area)
        if not os.path.isdir(area_dir):
            continue
        # Compare case-insensitively: page stems are upper-cased to match
        # the upper-case conference prefix returned by parse_conf_year().
        existing = {fname[:-3].upper() for fname in os.listdir(area_dir) if fname.endswith(".md")}
        seen_prefixes: set[str] = set()
        # Sorted iteration keeps the creation order (and the log) deterministic.
        for dir_name in sorted(dirs):
            conf_upper, year = parse_conf_year(dir_name)
            # Skip names without a year suffix and prefixes already handled
            # through another year of the same conference.
            if year is None or conf_upper in seen_prefixes:
                continue
            seen_prefixes.add(conf_upper)
            if conf_upper in existing:
                continue
            slug = conf_upper.lower()
            display_name = CONF_DISPLAY_NAMES.get(conf_upper, conf_upper)
            page_path = os.path.join(area_dir, f"{slug}.md")
            content = _CONF_PAGE_TEMPLATE.format(
                display_name=display_name,
                area=area,
                slug=slug,
                conf_upper=conf_upper,
            )
            # Explicit UTF-8 so the generated page does not depend on the
            # platform's locale encoding (display names may be non-ASCII).
            with open(page_path, "w", encoding="utf-8") as fh:
                fh.write(content)
            logger.info("Created conference page: %s", page_path)
            created.append(page_path)

    if created:
        # New pages change what discover_conferences() finds on disk.
        refresh_conference_sets(root)
    return created

`canonicalize_name(name: str) -> str` ¶

Map known name aliases to their canonical form.

Source code in src/utils/normalization/conference.py

def canonicalize_name(name: str) -> str:
    """Return the canonical form of *name* if it matches a known alias.

    ``_NAME_ALIASES`` is walked in order and the canonical value paired with
    the first pattern that ``search``-matches *name* wins; when no pattern
    matches, *name* is returned unchanged.
    """
    matches = (canonical for pattern, canonical in _NAME_ALIASES if pattern.search(name))
    return next(matches, name)

`conf_area(conf_name: str) -> str` ¶

Return 'systems', 'security', or 'unknown'.

Accepts a bare conference name ('OSDI') or a conf-year string ('osdi2024'). Any casing is accepted.

Source code in src/utils/normalization/conference.py

def conf_area(conf_name: str) -> str:
    """Return ``'systems'``, ``'security'``, or ``'unknown'``.

    Accepts a bare conference name (``'OSDI'``) **or** a conf-year string
    (``'osdi2024'``).  Any casing is accepted, and surrounding whitespace is
    ignored.
    """
    # Strip first so that trailing whitespace cannot hide the year suffix
    # from the ``\d+$`` anchor; strip again afterwards for inputs such as
    # ``'osdi 2024'`` where removing the digits exposes a trailing space.
    upper = re.sub(r"\d+$", "", conf_name.strip()).strip().upper()
    if upper in SYSTEMS_CONFS:
        return "systems"
    if upper in SECURITY_CONFS:
        return "security"
    return "unknown"

`parse_conf_year(conf_year_str: str) -> tuple[str, int | None]` ¶

Parse 'osdi2024' → ('OSDI', 2024).

Returns (name_upper, year_int) on success, or (conf_year_str.upper(), None) on failure.

Source code in src/utils/normalization/conference.py

def parse_conf_year(conf_year_str: str) -> tuple[str, int | None]:
    """Split a conf-year string such as ``'osdi2024'`` into ``('OSDI', 2024)``.

    The string is matched against ``_CONF_YEAR_RE``; group 1 is upper-cased
    as the conference name and group 2 is converted to ``int`` as the year.
    When the pattern does not match, the whole input is upper-cased and
    returned with ``None`` in place of the year.
    """
    match = _CONF_YEAR_RE.match(conf_year_str)
    if match is None:
        return conf_year_str.upper(), None
    conf_part, year_part = match.group(1), match.group(2)
    return conf_part.upper(), int(year_part)

`clean_name(name: str) -> str` ¶

Remove DBLP disambiguation suffixes and collapse whitespace.

'Jane Doe 0001' → 'Jane Doe'

Source code in src/utils/normalization/conference.py

def clean_name(name: str) -> str:
    """Remove DBLP disambiguation suffixes and collapse whitespace.

    ``'Jane Doe 0001'`` → ``'Jane Doe'``

    A falsy *name* yields ``""``.  Trailing whitespace (including a final
    newline) is ignored when looking for the four-digit suffix, so
    ``'Jane Doe 0001 '`` is cleaned the same way as ``'Jane Doe 0001'``.
    """
    if not name:
        return ""
    # Drop trailing whitespace first: the suffix pattern is anchored with
    # ``$`` and would otherwise miss a suffix followed by spaces/newlines.
    name = re.sub(r"\s+\d{4}$", "", name.rstrip())
    # Collapse every whitespace run (tabs and newlines included) to one space.
    return re.sub(r"\s+", " ", name).strip()

`normalize_name(name: str, *, strip_initials: bool = False) -> str` ¶

Aggressive normalisation for cross-source matching.

Lower-cases, strips accents, removes dots, collapses whitespace. Applies name alias canonicalisation first so that known aliases (e.g. 'Bogdan "Bo" Stoica' → 'Bogdan Alexandru Stoica') collapse to the same normalised key. Optionally strips single-letter initials (e.g. "J. Doe" → "Doe") and leading underscores for ranking deduplication.

Source code in src/utils/normalization/conference.py

def normalize_name(name: str, *, strip_initials: bool = False) -> str:
    """Produce an aggressively normalised key for cross-source name matching.

    Steps, in order: apply :func:`canonicalize_name` so known aliases share
    one key, trim and lower-case, NFKD-decompose and drop combining marks
    (accents), delete dots, and remove a trailing whitespace-plus-four-digit
    suffix.  With ``strip_initials=True`` every single ``a``-``z`` letter
    that starts at a word boundary and is followed by whitespace is removed
    (e.g. "J. Doe" → "Doe"), and leading underscores are stripped.  Finally
    whitespace runs are collapsed to single spaces.

    A falsy *name* yields ``""``.
    """
    if not name:
        return ""

    key = canonicalize_name(name).strip().lower()

    # Decompose so that accents become separate combining code points,
    # then keep only the base characters.
    decomposed = unicodedata.normalize("NFKD", key)
    key = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    key = re.sub(r"\.", "", key)
    key = re.sub(r"\s+\d{4}$", "", key)

    if strip_initials:
        without_initials = re.sub(r"\b[a-z]\s+", "", key)
        key = without_initials.lstrip("_").strip()

    collapsed = re.sub(r"\s+", " ", key)
    return collapsed.strip()

`normalize_title(title: str) -> str` ¶

Normalize a paper title for fuzzy matching.

Decodes HTML entities, normalizes Unicode (NFKD, strip combining marks), lower-cases, strips punctuation (keeping word chars and spaces), and collapses whitespace. Used for deduplication and cross-source title matching.

Source code in src/utils/normalization/conference.py

def normalize_title(title: str) -> str:
    """Reduce a paper title to a key suitable for fuzzy matching.

    HTML entities are decoded, the text is NFKD-normalised and combining
    marks are dropped, known trailing scraping artifacts are removed, and
    the result is lower-cased with everything except word characters and
    whitespace deleted.  Whitespace runs collapse to single spaces.  Used
    for deduplication and cross-source title matching.

    A falsy *title* yields ``""``.
    """
    if not title:
        return ""
    import html

    # Entities first (&amp; → &), then compatibility decomposition
    # (e.g. the "ﬁ" ligature becomes "fi", accents become combining marks).
    decomposed = unicodedata.normalize("NFKD", html.unescape(title))
    text = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    # Trailing scraping artifacts, removed in this order: the literal
    # "full strip note" marker, then tags like "[Artifacts]".
    trailing_artifacts = (
        r"\s*full strip note\s*$",
        r"\s*\[Artifacts?\]\s*$",
    )
    for artifact in trailing_artifacts:
        text = re.sub(artifact, "", text, flags=re.IGNORECASE)

    words_only = re.sub(r"[^\w\s]", "", text.lower())
    return " ".join(words_only.split())

`venue_to_conference(booktitle: str) -> str | None` ¶

Map a DBLP booktitle to our conference identifier, or None.

Source code in src/utils/normalization/conference.py

def venue_to_conference(booktitle: str) -> str | None:
    """Map a DBLP booktitle to our conference identifier, or None."""
    if not booktitle:
        return None
    bt = booktitle.strip()

    # Handle SC explicitly to avoid false positives (e.g., matching inside "ACSAC")
    if bt == "SC" or bt.startswith("SC "):
        return "SC"

    for pattern, conf in DBLP_VENUE_MAP.items():
        if pattern in booktitle:
            return conf
    return None

`clean_member_name(raw_name: str) -> str | None` ¶

Clean a committee member name.

Strips markdown links, trailing <br> tags, and skips placeholder names. Returns the cleaned name, or None if the entry should be dropped.

Source code in src/utils/normalization/conference.py

def clean_member_name(raw_name: str) -> str | None:
    """Clean a committee member name.

    Strips markdown links, trailing ``<br>`` tags and trailing superscript
    footnote markers, and skips placeholder names.  Entries that look like
    contact lines, award notes or footnotes (text starting with any
    superscript digit) are dropped as well.

    Returns the cleaned name (passed through :func:`clean_name`), or
    ``None`` if the entry should be dropped.
    """
    # Every superscript digit.  The same set is used for trailing markers and
    # for detecting footnote lines, so the two checks cannot drift apart.
    superscripts = "¹²³⁴⁵⁶⁷⁸⁹⁰"

    name = raw_name.strip()
    link_match = _MARKDOWN_LINK.match(name)
    if link_match:
        # Keep only the first captured group of the link pattern.
        name = link_match.group(1)
    name = _BR_TAG.sub("", name).strip()
    # Strip trailing footnote markers (e.g., "Cen Zhang¹" → "Cen Zhang")
    name = name.rstrip(superscripts).strip()

    lower = name.lower()
    if lower in PLACEHOLDER_NAMES or len(name) <= 1:
        return None
    # Substrings that mark an entry as something other than a person's name.
    non_name_markers = (
        "contact",
        "reach",
        "mailto:",
        "award",
        "distinguished",
        "participated in",
    )
    if any(marker in lower for marker in non_name_markers):
        return None
    # Skip footnote text (a superscript digit at the start of the entry).
    if name.startswith(tuple(superscripts)):
        return None
    return clean_name(name)

conference¶

src.utils.normalization.conference ¶

discover_conferences(website_root: str | None = None) -> tuple[frozenset[str], frozenset[str]] ¶

refresh_conference_sets(website_root: str | None = None) -> None ¶

ensure_conference_pages(sys_dirs: set[str] | None = None, sec_dirs: set[str] | None = None, website_root: str | None = None) -> list[str] ¶

Parameters¶

Returns¶

canonicalize_name(name: str) -> str ¶

conf_area(conf_name: str) -> str ¶

parse_conf_year(conf_year_str: str) -> tuple[str, int | None] ¶

clean_name(name: str) -> str ¶

normalize_name(name: str, *, strip_initials: bool = False) -> str ¶

normalize_title(title: str) -> str ¶

venue_to_conference(booktitle: str) -> str | None ¶

clean_member_name(raw_name: str) -> str | None ¶

`src.utils.normalization.conference` ¶

`discover_conferences(website_root: str | None = None) -> tuple[frozenset[str], frozenset[str]]` ¶

`refresh_conference_sets(website_root: str | None = None) -> None` ¶

`ensure_conference_pages(sys_dirs: set[str] | None = None, sec_dirs: set[str] | None = None, website_root: str | None = None) -> list[str]` ¶

`canonicalize_name(name: str) -> str` ¶

`conf_area(conf_name: str) -> str` ¶

`parse_conf_year(conf_year_str: str) -> tuple[str, int | None]` ¶

`clean_name(name: str) -> str` ¶

`normalize_name(name: str, *, strip_initials: bool = False) -> str` ¶

`normalize_title(title: str) -> str` ¶

`venue_to_conference(booktitle: str) -> str | None` ¶

`clean_member_name(raw_name: str) -> str | None` ¶