Skip to content

conference

src.utils.conference

Shared conference metadata, area classification, and name-normalization helpers.

Every generator / enricher should import from here rather than redefining its own SYSTEMS_CONFS, SECURITY_CONFS, _conf_area(), _extract_conf_year(), or name-cleaning functions.

discover_conferences(website_root: str | None = None) -> tuple[frozenset[str], frozenset[str]]

Return (systems, security) conference sets from the website.

Falls back to auto-detection of the website root, then to built-in defaults if the directory is not found (e.g. in tests). Discovered conferences are merged with the built-in fallbacks so that known conferences are always classified even before the pipeline auto-generates their pages.

Source code in src/utils/conference.py
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
def discover_conferences(website_root: str | None = None) -> tuple[frozenset[str], frozenset[str]]:
    """Return ``(systems, security)`` conference sets from the website.

    Falls back to auto-detection of the website root, then to built-in
    defaults if the directory is not found (e.g. in tests).
    Discovered conferences are merged with the built-in fallbacks so that
    known conferences are always classified even before the pipeline
    auto-generates their pages.
    """
    root = website_root
    if not root:
        root = _find_website_root()
    if root and os.path.isdir(root):
        # Scan each area directory and merge with the built-in fallbacks so
        # known conferences stay classified even before pages are generated.
        merged = {
            area: _scan_area_confs(root, area) | fallback
            for area, fallback in (
                ("systems", _FALLBACK_SYSTEMS),
                ("security", _FALLBACK_SECURITY),
            )
        }
        if merged["systems"] or merged["security"]:
            logger.debug(
                "Discovered conferences from %s: systems=%s, security=%s",
                root,
                sorted(merged["systems"]),
                sorted(merged["security"]),
            )
            return merged["systems"], merged["security"]
    # Fallback for tests / CI where the website repo isn't available
    logger.debug("Website directory not found; using built-in conference defaults")
    return _FALLBACK_SYSTEMS, _FALLBACK_SECURITY

canonicalize_name(name: str) -> str

Map known name aliases to their canonical form.

Source code in src/utils/conference.py
127
128
129
130
131
132
def canonicalize_name(name: str) -> str:
    """Map known name aliases to their canonical form.

    Returns the canonical form of the first alias pattern that matches,
    or the input unchanged when no alias applies.
    """
    return next(
        (canonical for pattern, canonical in _NAME_ALIASES if pattern.search(name)),
        name,
    )

conf_area(conf_name: str) -> str

Return 'systems', 'security', or 'unknown'.

Accepts a bare conference name ('OSDI') or a conf-year string ('osdi2024'). Any casing is accepted.

Source code in src/utils/conference.py
135
136
137
138
139
140
141
142
143
144
145
146
def conf_area(conf_name: str) -> str:
    """Return ``'systems'``, ``'security'``, or ``'unknown'``.

    Accepts a bare conference name (``'OSDI'``) **or** a conf-year string
    (``'osdi2024'``).  Any casing is accepted.
    """
    # Drop a trailing year and normalise casing before the membership test.
    base = re.sub(r"\d+$", "", conf_name).strip().upper()
    for area, members in (("systems", SYSTEMS_CONFS), ("security", SECURITY_CONFS)):
        if base in members:
            return area
    return "unknown"

parse_conf_year(conf_year_str: str) -> tuple[str, int | None]

Parse 'osdi2024' → ('OSDI', 2024).

Returns (name_upper, year_int) on success, or (conf_year_str.upper(), None) on failure.

Source code in src/utils/conference.py
154
155
156
157
158
159
160
161
162
163
def parse_conf_year(conf_year_str: str) -> tuple[str, int | None]:
    """Parse ``'osdi2024'`` → ``('OSDI', 2024)``.

    Returns ``(name_upper, year_int)`` on success, or
    ``(conf_year_str.upper(), None)`` on failure.
    """
    match = _CONF_YEAR_RE.match(conf_year_str)
    if match is None:
        # Not in the expected <name><year> shape; signal failure via None year.
        return conf_year_str.upper(), None
    return match.group(1).upper(), int(match.group(2))

clean_name(name: str) -> str

Remove DBLP disambiguation suffixes and collapse whitespace.

'Jane Doe 0001' → 'Jane Doe'

Source code in src/utils/conference.py
169
170
171
172
173
174
175
176
177
178
179
def clean_name(name: str) -> str:
    """Remove DBLP disambiguation suffixes and collapse whitespace.

    ``'Jane Doe 0001'`` → ``'Jane Doe'``
    """
    if not name:
        return ""
    # Flatten embedded tabs / line breaks into spaces first.
    flattened = re.sub(r"[\t\n\r]+", " ", name)
    # Strip a trailing DBLP disambiguation number such as ' 0001'.
    without_suffix = re.sub(r"\s+\d{4}$", "", flattened)
    # Collapse runs of whitespace and trim the ends.
    return re.sub(r"\s+", " ", without_suffix).strip()

normalize_name(name: str, *, strip_initials: bool = False) -> str

Aggressive normalisation for cross-source matching.

Lower-cases, strips accents, removes dots, collapses whitespace. Applies name alias canonicalisation first so that known aliases (e.g. 'Bogdan "Bo" Stoica' → 'Bogdan Alexandru Stoica') collapse to the same normalised key. Optionally strips single-letter initials (e.g. "J. Doe" → "Doe") and leading underscores for ranking deduplication.

Source code in src/utils/conference.py
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
def normalize_name(name: str, *, strip_initials: bool = False) -> str:
    """Aggressive normalisation for cross-source matching.

    Lower-cases, strips accents, removes dots, collapses whitespace.
    Applies name alias canonicalisation first so that known aliases
    (e.g. ``'Bogdan "Bo" Stoica'`` → ``'Bogdan Alexandru Stoica'``)
    collapse to the same normalised key.
    Optionally strips single-letter initials (e.g. "J. Doe" → "Doe")
    and leading underscores for ranking deduplication.
    """
    if not name:
        return ""
    result = canonicalize_name(name).strip().lower()
    # Strip diacritics: decompose to NFKD, then drop the combining marks.
    decomposed = unicodedata.normalize("NFKD", result)
    result = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    result = result.replace(".", "")
    # Remove a trailing DBLP-style disambiguation number.
    result = re.sub(r"\s+\d{4}$", "", result)
    if strip_initials:
        # Drop single-letter tokens ("j doe" -> "doe") and leading underscores.
        result = re.sub(r"\b[a-z]\s+", "", result)
        result = result.lstrip("_").strip()
    return re.sub(r"\s+", " ", result).strip()

normalize_title(title: str) -> str

Normalize a paper title for fuzzy matching.

Lower-cases, strips punctuation (keeping word chars and spaces), and collapses whitespace. Used for deduplication and cross-source title matching.

Source code in src/utils/conference.py
207
208
209
210
211
212
213
214
215
216
def normalize_title(title: str) -> str:
    """Normalize a paper title for fuzzy matching.

    Lower-cases, strips punctuation (keeping word chars and spaces),
    and collapses whitespace.  Used for deduplication and cross-source
    title matching.
    """
    if not title:
        return ""
    # Drop everything that is neither a word character nor whitespace,
    # then collapse whitespace runs via split/join.
    depunctuated = re.sub(r"[^\w\s]", "", title.lower())
    return " ".join(depunctuated.split())

venue_to_conference(booktitle: str) -> str | None

Map a DBLP booktitle to our conference identifier, or None.

Source code in src/utils/conference.py
251
252
253
254
255
256
257
258
259
260
261
262
263
264
def venue_to_conference(booktitle: str) -> str | None:
    """Map a DBLP booktitle to our conference identifier, or None."""
    if not booktitle:
        return None
    trimmed = booktitle.strip()

    # Handle SC explicitly to avoid false positives (e.g., matching inside "ACSAC")
    if trimmed == "SC" or trimmed.startswith("SC "):
        return "SC"

    # NOTE(review): the substring test intentionally runs against the
    # untrimmed input, matching the original behavior — presumably
    # equivalent for the patterns in DBLP_VENUE_MAP; confirm before changing.
    return next(
        (conf for pattern, conf in DBLP_VENUE_MAP.items() if pattern in booktitle),
        None,
    )

clean_member_name(raw_name: str) -> str | None

Clean a committee member name.

Strips markdown links, trailing <br> tags, and skips placeholder names. Returns the cleaned name, or None if the entry should be dropped.

Source code in src/utils/conference.py
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
def clean_member_name(raw_name: str) -> str | None:
    """Clean a committee member name.

    Strips markdown links, trailing ``<br>`` tags, and skips placeholder names.
    Returns the cleaned name, or ``None`` if the entry should be dropped.
    """
    name = raw_name.strip()
    # '[Display Name](url)' -> 'Display Name'
    link_match = _MARKDOWN_LINK.match(name)
    if link_match:
        name = link_match.group(1)
    name = _BR_TAG.sub("", name).strip()
    # Compute the lowered form once instead of re-lowering for every check.
    lowered = name.lower()
    if lowered in PLACEHOLDER_NAMES or len(name) <= 1:
        return None
    # Drop contact-info entries that sometimes appear in committee lists.
    if any(token in lowered for token in ("contact", "reach", "mailto:")):
        return None
    return clean_name(name)