scrape_committee_web¶

`src.scrapers.scrape_committee_web` ¶

Scrape AE committee data from alternative sources when sysartifacts/secartifacts GitHub repos don't have the information.

Supported sources:

- USENIX website (FAST, OSDI, ATC, USENIX Security, WOOT)
- CHES website (ches.iacr.org)
- PETS website (petsymposium.org)
- ACSAC website (acsac.org)
- IEEE S&P website (sp2026.ieee-security.org)

`scrape_usenix_committee(conference, year, session=None, cache_only=False)` ¶

Scrape AE committee from a USENIX conference call-for-artifacts page.

Parameters¶

- `conference` : str — Conference name (e.g. 'fast', 'osdi', 'usenixsec', 'woot')
- `year` : int — 4-digit year
- `session` : requests.Session, optional
- `cache_only` : bool — If True, only return data from the disk cache.

Returns¶

list of {name, affiliation, role} dicts, or None if the conference is unknown, the page is not found, or no committee members could be parsed

Source code in src/scrapers/scrape_committee_web.py

def scrape_usenix_committee(conference, year, session=None, cache_only=False):
    """Collect the AE committee from a USENIX call-for-artifacts page.

    The page URL is built from the conference slug and the two-digit year,
    fetched through ``_cached_fetch``, and parsed for co-chairs and regular
    committee members.  Chairs come first in the result and win over a
    member entry with the same (case-insensitive) name.

    Parameters
    ----------
    conference : str
        Conference name (e.g. 'fast', 'osdi', 'usenixsec', 'woot'); looked up
        case-insensitively in ``USENIX_CONF_SLUGS``.
    year : int
        4-digit year
    session : requests.Session, optional
        Forwarded to ``_cached_fetch``.
    cache_only : bool
        If True, only return data from the disk cache.

    Returns
    -------
    list of committee dicts, each tagged with a ``role`` of ``"chair"`` or
    ``"member"``; None if the conference has no known slug, the page could
    not be fetched, or nobody was parsed from it.
    """
    slug = USENIX_CONF_SLUGS.get(conference.lower())
    if slug is None:
        return None

    # USENIX URLs carry the year without its first two digits (2024 -> "24")
    short_year = str(year)[2:]
    page_url = f"{BASE_USENIX}/conference/{slug}{short_year}/call-for-artifacts"

    page = _cached_fetch(page_url, session=session, cache_only=cache_only)
    if page is None:
        return None

    soup = BeautifulSoup(page, "html.parser")
    chairs = _parse_usenix_cochairs_html(soup)
    members = _parse_usenix_committee_html(soup)

    # Tag each parsed person with the role implied by the list they came from
    for role, people in (("chair", chairs), ("member", members)):
        for person in people:
            person["role"] = role

    # Chairs are listed first, so they survive the case-insensitive name dedup
    deduped = []
    known_names = set()
    for person in chairs + members:
        key = person["name"].lower()
        if key in known_names:
            continue
        known_names.add(key)
        deduped.append(person)

    if not deduped:
        return None
    logger.info(f"  USENIX: Found {len(deduped)} members for {conference}{year}")
    return deduped

`scrape_ches_committee(year, session=None, cache_only=False)` ¶

Scrape AE committee from the CHES website.

Members are fetched from the JSON API (ches.iacr.org/{year}/json/artifact.json). If the JSON endpoint is unavailable (e.g. CHES 2022), members are parsed from the static HTML. Chairs are parsed from the JSON artifact_chairs field when available (2025+), or from (Chair) annotations in JSON member names (2024), or from the HTML page (ches.iacr.org/{year}/artifacts.php).

Returns list of {name, affiliation, role} dicts, or None if not found.

Source code in src/scrapers/scrape_committee_web.py

def scrape_ches_committee(year, session=None, cache_only=False):
    """Scrape AE committee from the CHES website.

    Members are fetched from the JSON API
    (``ches.iacr.org/{year}/json/artifact.json``, falling back to
    ``comm2.json`` in the same directory).  If no JSON source yields members
    (e.g. CHES 2022), members are parsed from the static HTML.
    Chairs are parsed from the JSON ``artifact_chairs`` field when available
    (2025+), or from ``(Chair)`` annotations in JSON member names (2024), or
    from the HTML page (``ches.iacr.org/{year}/artifacts.php``).

    Malformed JSON is tolerated rather than raised: invalid syntax or a
    non-object top level skips the JSON source with a warning, and non-list
    fields, non-object entries and non-string names/affiliations are ignored.

    Parameters
    ----------
    year : int
        Conference year, interpolated into the CHES URLs.
    session : requests.Session, optional
        Forwarded to ``_cached_fetch``.
    cache_only : bool
        Forwarded to ``_cached_fetch``.

    Returns
    -------
    list of {name, affiliation, role} dicts, deduplicated by case-insensitive
    name (JSON chairs take precedence over HTML chairs, which take precedence
    over members), or None if nothing was found.
    """
    import json as _json

    def _clean(value):
        # Collapse whitespace runs; null / non-string JSON values become ""
        if not isinstance(value, str):
            return ""
        return re.sub(r"\s+", " ", value).strip()

    def _entries(data, key):
        # Only JSON objects inside a JSON array are usable committee entries
        value = data.get(key)
        if not isinstance(value, list):
            return []
        return [entry for entry in value if isinstance(entry, dict)]

    def _dedup(people):
        # Keep the first occurrence of each case-insensitive name
        seen = set()
        unique = []
        for person in people:
            key = person["name"].lower()
            if key not in seen:
                seen.add(key)
                unique.append(person)
        return unique

    members = []
    json_chairs = []

    # 1. Try JSON API for members (and chairs when available).
    #    CHES 2021 uses comm2.json instead of artifact.json.
    json_text = None
    for json_name in ("artifact.json", "comm2.json"):
        json_url = f"https://ches.iacr.org/{year}/json/{json_name}"
        json_text = _cached_fetch(json_url, session=session, cache_only=cache_only)
        if json_text is not None:
            break

    if json_text is not None:
        try:
            data = _json.loads(json_text)
        except ValueError:
            data = None
        if isinstance(data, dict):
            for entry in _entries(data, "committee"):
                name = _clean(entry.get("name"))
                affiliation = _clean(entry.get("affiliation"))
                # Detect "(Chair)" / "(Co-Chair)" embedded in the name (e.g. CHES 2024)
                chair_match = re.search(r"\s*\((?:Co-)?Chair\)\s*$", name, re.IGNORECASE)
                if chair_match:
                    name = name[: chair_match.start()].strip()
                    role = "chair"
                else:
                    role = "member"
                if len(name) > 1:
                    members.append({"name": name, "affiliation": affiliation, "role": role})
            # Parse artifact_chairs field (2025+)
            for entry in _entries(data, "artifact_chairs"):
                name = _clean(entry.get("name"))
                affiliation = _clean(entry.get("affiliation"))
                if len(name) > 1:
                    json_chairs.append({"name": name, "affiliation": affiliation, "role": "chair"})
        else:
            logger.warning("Failed to parse CHES committee JSON, skipping JSON source")

    # 2. Fetch HTML page for chairs (and fallback members if JSON failed)
    html_url = f"https://ches.iacr.org/{year}/artifacts.php"
    html_text = _cached_fetch(html_url, session=session, cache_only=cache_only)
    if html_text is not None:
        soup = BeautifulSoup(html_text, "html.parser")

        # Parse chairs from HTML (fallback when JSON has no chair info)
        html_chairs = _scrape_ches_chairs_html(soup)

        # If JSON didn't return members, try HTML fallback (CHES 2022)
        if not members:
            members = _scrape_ches_members_html(soup)

        # Combine: JSON chairs > HTML chairs > members (dedup by name)
        deduped = _dedup(json_chairs + html_chairs + members)
        if not deduped:
            return None
        chair_count = sum(1 for m in deduped if m["role"] == "chair")
        member_count = len(deduped) - chair_count
        logger.info("  CHES: Found %d members + %d chair(s) for ches%s", member_count, chair_count, year)
        return deduped

    # HTML page unavailable: return JSON data only.  Deduplicate here as well,
    # since a chair may appear in both "artifact_chairs" and "committee".
    combined = _dedup(json_chairs + members)
    if combined:
        logger.info("  CHES: Found %d entries for ches%s (JSON only)", len(combined), year)
        return combined

    return None

`scrape_pets_committee(year, session=None, cache_only=False)` ¶

Scrape artifact review committee from PETS/PoPETs website.

PETS publishes the ARC at petsymposium.org/cfp{YY}.php in the following format:

    <dt><font><b>Artifact Review Committee:</b></font></dt>
    <dd>Name, <i>Affiliation</i></dd>
    <dd>Name, <i>Affiliation</i></dd>
    ...

Returns a list of {name, affiliation, role} dicts (role is always "member"), or None if not found.

Source code in src/scrapers/scrape_committee_web.py

def scrape_pets_committee(year, session=None, cache_only=False):
    """Collect the artifact review committee from the PETS/PoPETs website.

    The committee is read from ``petsymposium.org/cfp{YY}.php``, where it is
    laid out as a definition list:

        <dt><font><b>Artifact Review Committee:</b></font></dt>
        <dd>Name, <i>Affiliation</i></dd>
        <dd>Name, <i>Affiliation</i></dd>
        ...

    Every ``<dd>`` following the first ``<dt>`` whose text mentions both
    "artifact" and "committee" is read, up to the next ``<dt>``.

    Returns a list of {name, affiliation, role} dicts (role is always
    "member"), or None if the page, the heading, or any members are missing.
    """

    def _tidy(raw):
        # Collapse whitespace, then drop leftover "*" / "_" emphasis markers
        return re.sub(r"\s+", " ", raw.strip()).strip().strip("*_").strip()

    def _is_arc_heading(dt):
        label = dt.get_text().lower()
        return "artifact" in label and "committee" in label

    short_year = str(year)[2:]
    page_url = f"https://petsymposium.org/cfp{short_year}.php"

    page = _cached_fetch(page_url, session=session, cache_only=cache_only)
    if page is None:
        return None

    soup = BeautifulSoup(page, "html.parser")

    # First <dt> that looks like the "Artifact Review Committee" heading
    heading = next((dt for dt in soup.find_all("dt") if _is_arc_heading(dt)), None)
    if heading is None:
        return None

    members = []
    for node in heading.next_siblings:
        tag_name = getattr(node, "name", None)
        if tag_name == "dt":
            break  # the next definition term ends the committee list
        if tag_name != "dd":
            continue

        entry = node.get_text().strip()
        if len(entry) < 3:
            continue

        # "Name, Affiliation" -- everything after the first comma is affiliation
        raw_name, _, raw_affiliation = entry.partition(",")
        name = _tidy(raw_name)
        affiliation = _tidy(raw_affiliation)
        if len(name) > 2:
            members.append({"name": name, "affiliation": affiliation, "role": "member"})

    if not members:
        return None
    logger.info(f"  PETS: Found {len(members)} members for pets{year}")
    return members

`scrape_acsac_committee(year, session=None, cache_only=False)` ¶

Scrape AE committee from the ACSAC website.

ACSAC publishes artifact committee data at two URL patterns:

- https://www.acsac.org/{year}/committees/artifact/ (2020-2022)
- https://www.acsac.org/{year}/committees/artifacts/ (2023+)

Returns list of {name, affiliation, role} dicts, or None if not found.

Source code in src/scrapers/scrape_committee_web.py

def scrape_acsac_committee(year, session=None, cache_only=False):
    """Collect the AE committee from the ACSAC website.

    The committee page lives under one of two URL patterns:
    - ``https://www.acsac.org/{year}/committees/artifact/``  (2020-2022)
    - ``https://www.acsac.org/{year}/committees/artifacts/`` (2023+)

    The plural form is tried first; the first page that can be fetched is
    parsed for chairs, mentors and members.

    Returns list of {name, affiliation, role} dicts, or None if not found.
    """
    # The slug changed from singular to plural in 2023, so probe both
    candidate_urls = (
        f"https://www.acsac.org/{year}/committees/artifacts/",
        f"https://www.acsac.org/{year}/committees/artifact/",
    )
    fetched_pages = (
        _cached_fetch(candidate, session=session, cache_only=cache_only)
        for candidate in candidate_urls
    )
    page = next((text for text in fetched_pages if text is not None), None)
    if page is None:
        logger.info("  ACSAC: No committee page found for acsac%d", year)
        return None

    soup = BeautifulSoup(page, "html.parser")
    chairs = _parse_acsac_chairs(soup)

    # Section headings differ by era:
    #   2020-2022: "Students" and "Mentors"
    #   2023+:     "Reviewers" and "Mentors"
    members = _parse_acsac_section_members(soup, ["student", "reviewer"])
    mentors = _parse_acsac_section_members(soup, ["mentor"])

    # Pages without section headings (e.g. 2019) are a flat list between
    # the <h1> and the footer.
    if not (members or mentors):
        members = _parse_acsac_flat_members(soup, chairs)

    # First occurrence of each case-insensitive name wins
    by_name = {}
    for person in chairs + mentors + members:
        by_name.setdefault(person["name"].lower(), person)
    deduped = list(by_name.values())

    if not deduped:
        return None
    chair_count = len([person for person in deduped if person["role"] == "chair"])
    member_count = len(deduped) - chair_count
    logger.info("  ACSAC: Found %d members + %d chair(s) for acsac%d", member_count, chair_count, year)
    return deduped

`scrape_sp_committee(year, session=None, cache_only=False)` ¶

Scrape AE committee data from the IEEE S&P website (sp{year}.ieee-security.org, e.g. S&P 2026). Years not listed in SP_KNOWN_YEARS return None.

The AEC members are listed on cfartifacts.html in separate cycle tables. Artifact Evaluation Chairs are listed on index.html in the organizing committee table.

Source code in src/scrapers/scrape_committee_web.py

def scrape_sp_committee(year, session=None, cache_only=False):
    """Collect AE committee data from an IEEE S&P conference website.

    Only years listed in ``SP_KNOWN_YEARS`` are attempted.  AEC members are
    read from ``cfartifacts.html`` (listed in separate cycle tables), and the
    Artifact Evaluation Chairs from the organizing committee table on
    ``index.html``.

    Returns the chairs followed by the members, deduplicated by
    case-insensitive name, or None when the year is unknown or nobody was
    found.
    """
    if year not in SP_KNOWN_YEARS:
        return None

    site = f"https://sp{year}.ieee-security.org"
    members_html = _cached_fetch(f"{site}/cfartifacts.html", session=session, cache_only=cache_only)
    chairs_html = _cached_fetch(f"{site}/index.html", session=session, cache_only=cache_only)

    # A page that could not be fetched simply contributes nobody
    members = (
        _scrape_sp_members_html(BeautifulSoup(members_html, "html.parser"))
        if members_html is not None
        else []
    )
    chairs = (
        _scrape_sp_chairs_html(BeautifulSoup(chairs_html, "html.parser"))
        if chairs_html is not None
        else []
    )

    if not (members or chairs):
        return None

    # Chairs come first, so a chair entry beats a member entry of the same name
    by_name = {}
    for person in chairs + members:
        by_name.setdefault(person["name"].lower(), person)
    deduped = list(by_name.values())

    if not deduped:
        return None

    chair_count = len([person for person in deduped if person["role"] == "chair"])
    member_count = len(deduped) - chair_count
    logger.info("  IEEE S&P: Found %d members + %d chair(s) for sp%d", member_count, chair_count, year)
    return deduped

`scrape_hotcrp_committee(conference, year, session=None, cache_only=False)` ¶

Scrape AE committee from a public HotCRP PC-list page.

Parameters¶

- `conference` : str — Conference name (e.g. 'sosp')
- `year` : int — 4-digit year
- `session` : requests.Session, optional
- `cache_only` : bool — If True, only return data from the disk cache.

Returns¶

list of {name, affiliation, role} dicts, or None if page not found

Source code in src/scrapers/scrape_committee_web.py

def scrape_hotcrp_committee(conference, year, session=None, cache_only=False):
    """Scrape AE committee from a public HotCRP PC-list page.

    The page URL is looked up in ``HOTCRP_URLS`` by ``(conference, year)``.
    The first ``<table>`` on the page is read row by row (skipping the header
    row): the first cell holds the name and optional role marker, the second
    cell the affiliation.

    Parameters
    ----------
    conference : str
        Conference name (e.g. 'sosp'); matched case-insensitively.
    year : int
        4-digit year
    session : requests.Session, optional
        Forwarded to ``_cached_fetch``.
    cache_only : bool
        If True, only return data from the disk cache.

    Returns
    -------
    list of {name, affiliation, role} dicts, or None if no URL is registered
    for the conference/year, the page or its table is missing, or no usable
    rows were found
    """
    url = HOTCRP_URLS.get((conference.lower(), year))
    if url is None:
        return None

    html = _cached_fetch(url, session=session, cache_only=cache_only)
    if html is None:
        return None

    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table")
    if table is None:
        return None

    members = []
    for row in table.find_all("tr")[1:]:  # skip header row
        cells = row.find_all("td")
        if len(cells) < 2:
            continue

        name_cell, affil_cell = cells[0], cells[1]

        # Detect chair role from <span class="pcrole">chair</span>
        role_span = name_cell.find("span", class_="pcrole")
        is_chair = role_span is not None and "chair" in role_span.get_text(strip=True).lower()

        # Extract name: prefer <span class="taghl"> (highlighted name),
        # otherwise fall back to the cell text without the role span
        name_span = name_cell.find("span", class_="taghl")
        if name_span is not None:
            name = name_span.get_text(strip=True)
        else:
            # Detach the role span itself instead of string-replacing its
            # text: a replace would also delete that text wherever it occurs
            # inside the person's name.  The soup is local to this call, so
            # mutating it is safe.
            if role_span is not None:
                role_span.extract()
            name = name_cell.get_text(strip=True)

        affiliation = affil_cell.get_text(strip=True)

        # Clean up
        name = re.sub(r"\s+", " ", name).strip()
        affiliation = re.sub(r"\s+", " ", affiliation).strip()

        # Skip placeholder entries ("[No name]", empty, or a single character)
        if name == "[No name]" or len(name) < 2:
            continue

        role = "chair" if is_chair else "member"
        members.append({"name": name, "affiliation": affiliation, "role": role})

    if not members:
        return None

    chair_count = sum(1 for m in members if m["role"] == "chair")
    member_count = len(members) - chair_count
    logger.info(
        "  HotCRP: Found %d members + %d chair(s) for %s%d",
        member_count,
        chair_count,
        conference,
        year,
    )
    return members

`get_alternative_committees(conferences_needed)` ¶

Fetch committees from alternative sources for conferences not in sysartifacts/secartifacts.

Parameters¶

- `conferences_needed` : dict — `{conf_year_str: 'systems'|'security'}`: the conferences that need data, e.g. `{'fast2024': 'systems', 'usenixsec2022': 'security'}`

Returns¶

dict of {conf_year_str: [{name, affiliation}, ...]}

Source code in src/scrapers/scrape_committee_web.py

def get_alternative_committees(conferences_needed):
    """Fetch committees from alternative sources for conferences not in sysartifacts/secartifacts.

    For each requested conference the venue-specific web scraper is tried
    first, then a registered HotCRP page, then locally stored committee data.
    Setting the ``SKIP_USENIX_SCRAPE`` environment variable to ``1``,
    ``true`` or ``yes`` restricts the venue scrapers to cached results.

    Parameters
    ----------
    conferences_needed : dict
        {conf_year_str: 'systems'|'security'} — conferences that need data.
        e.g. {'fast2024': 'systems', 'usenixsec2022': 'security'}
        Keys that are not letters followed by a 4-digit year are skipped.

    Returns
    -------
    dict of {conf_year_str: [committee member dicts, ...]}, containing only
    the conferences for which a non-empty committee was found
    """
    skip_flag = os.getenv("SKIP_USENIX_SCRAPE", "").strip().lower()
    cache_only = skip_flag in {"1", "true", "yes"}

    results = {}
    local = _load_local_committees()
    sess = _get_session() if not cache_only else None

    if cache_only:
        logger.info("  SKIP_USENIX_SCRAPE set — using cached scraping results only (no live HTTP)")

    # Venue scrapers that take only the year; USENIX venues are handled
    # separately because their scraper also needs the conference name.
    year_scrapers = {
        "ches": scrape_ches_committee,
        "pets": scrape_pets_committee,
        "acsac": scrape_acsac_committee,
        "sp": scrape_sp_committee,
    }

    for conf_year_str, _area in conferences_needed.items():
        parsed = re.match(r"^([a-zA-Z]+)(\d{4})$", conf_year_str)
        if parsed is None:
            continue
        conf = parsed.group(1).lower()
        year = int(parsed.group(2))

        # Web scraper (uses disk cache; cache_only=True means no live HTTP)
        committee = None
        if conf in USENIX_CONF_SLUGS:
            committee = scrape_usenix_committee(conf, year, session=sess, cache_only=cache_only)
        else:
            scraper = year_scrapers.get(conf)
            if scraper is not None:
                committee = scraper(year, session=sess, cache_only=cache_only)

        # HotCRP (public pages, not blocked by SKIP_USENIX_SCRAPE)
        if not committee and (conf, year) in HOTCRP_URLS:
            hotcrp_sess = _get_session() if sess is None else sess
            committee = scrape_hotcrp_committee(conf, year, session=hotcrp_sess, cache_only=False)

        # Last resort: local/static data (e.g. from local pipeline run)
        if not committee:
            committee = local.get(conf_year_str)

        if committee:
            results[conf_year_str] = committee

    return results

`get_all_usenix_committees(conf_regex=None)` ¶

Scrape all available USENIX conference committees.

Parameters¶

- `conf_regex` : str, optional — Regex used to filter conference/year strings such as 'fast2024' (e.g. '.20[2][0-5]')

Returns¶

dict of {conf_year_str: [{name, affiliation}, ...]}

Source code in src/scrapers/scrape_committee_web.py

def get_all_usenix_committees(conf_regex=None):
    """Scrape the committee of every known USENIX conference edition.

    Iterates over ``USENIX_KNOWN_YEARS`` and scrapes each conference/year
    pair with a single shared session.

    Parameters
    ----------
    conf_regex : str, optional
        Regex to filter conference/year strings (e.g. '.20[2][0-5]'); it is
        applied with ``re.search`` to strings of the form ``f"{conf}{year}"``.

    Returns
    -------
    dict of {conf_year_str: [committee member dicts, ...]}, containing only
    the editions for which a committee was found
    """
    sess = _get_session()
    results = {}

    editions = (
        (conf, year)
        for conf, years in USENIX_KNOWN_YEARS.items()
        for year in years
    )
    for conf, year in editions:
        conf_year = f"{conf}{year}"
        selected = not conf_regex or re.search(conf_regex, conf_year)
        if not selected:
            continue
        committee = scrape_usenix_committee(conf, year, session=sess)
        if committee:
            results[conf_year] = committee

    return results

scrape_committee_web¶

src.scrapers.scrape_committee_web ¶

scrape_usenix_committee(conference, year, session=None, cache_only=False) ¶

Parameters¶

Returns¶

scrape_ches_committee(year, session=None, cache_only=False) ¶

scrape_pets_committee(year, session=None, cache_only=False) ¶

scrape_acsac_committee(year, session=None, cache_only=False) ¶

scrape_sp_committee(year, session=None, cache_only=False) ¶

scrape_hotcrp_committee(conference, year, session=None, cache_only=False) ¶

Parameters¶

Returns¶

get_alternative_committees(conferences_needed) ¶

Parameters¶

Returns¶

get_all_usenix_committees(conf_regex=None) ¶

Parameters¶

Returns¶

`src.scrapers.scrape_committee_web` ¶

`scrape_usenix_committee(conference, year, session=None, cache_only=False)` ¶

`scrape_ches_committee(year, session=None, cache_only=False)` ¶

`scrape_pets_committee(year, session=None, cache_only=False)` ¶

`scrape_acsac_committee(year, session=None, cache_only=False)` ¶

`scrape_sp_committee(year, session=None, cache_only=False)` ¶

`scrape_hotcrp_committee(conference, year, session=None, cache_only=False)` ¶

`get_alternative_committees(conferences_needed)` ¶

`get_all_usenix_committees(conf_regex=None)` ¶