Skip to content

scrape_committee_web

src.scrapers.scrape_committee_web

Scrape AE committee data from alternative sources when sysartifacts/secartifacts GitHub repos don't have the information.

Supported sources:

- USENIX website (FAST, OSDI, ATC, USENIX Security, WOOT)
- CHES website (ches.iacr.org)
- PETS website (petsymposium.org)
- ACSAC website (acsac.org)
- IEEE S&P website (sp2026.ieee-security.org)

scrape_usenix_committee(conference, year, session=None, cache_only=False)

Scrape AE committee from a USENIX conference call-for-artifacts page.

Parameters

- conference : str — Conference name (e.g. 'fast', 'osdi', 'usenixsec', 'woot')
- year : int — 4-digit year
- session : requests.Session, optional
- cache_only : bool — If True, only return data from the disk cache.

Returns

list of {name, affiliation, role} dicts (role is 'chair' or 'member'), or None if the conference is unknown, the page is not found, or no members are parsed

Source code in src/scrapers/scrape_committee_web.py
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
def scrape_usenix_committee(conference, year, session=None, cache_only=False):
    """Collect the AE committee listed on a USENIX call-for-artifacts page.

    Parameters
    ----------
    conference : str
        Conference name (e.g. 'fast', 'osdi', 'usenixsec', 'woot'); it is
        lower-cased and looked up in ``USENIX_CONF_SLUGS``.
    year : int
        4-digit year; only the part after the first two characters is used in
        the URL.
    session : requests.Session, optional
        Forwarded unchanged to ``_cached_fetch``.
    cache_only : bool
        If True, only return data from the disk cache.

    Returns
    -------
    list of {name, affiliation, role} dicts with ``role`` set to ``"chair"``
    or ``"member"``, or None when the conference has no known slug, the page
    could not be fetched, or nothing was parsed.
    """
    conf_slug = USENIX_CONF_SLUGS.get(conference.lower())
    if conf_slug is None:
        return None

    short_year = str(year)[2:]  # e.g. 2024 -> "24"
    page_url = f"{BASE_USENIX}/conference/{conf_slug}{short_year}/call-for-artifacts"

    page = _cached_fetch(page_url, session=session, cache_only=cache_only)
    if page is None:
        return None

    soup = BeautifulSoup(page, "html.parser")

    # Co-chairs are parsed first, then the regular committee members.
    chairs = _parse_usenix_cochairs_html(soup)
    members = _parse_usenix_committee_html(soup)

    # Tag every parsed entry with the role of the list it came from.
    for role, group in (("chair", chairs), ("member", members)):
        for person in group:
            person["role"] = role

    # Chairs come first, so on a case-insensitive name clash the chair entry
    # wins; dict insertion order keeps the original ordering.
    unique = {}
    for person in chairs + members:
        unique.setdefault(person["name"].lower(), person)

    if not unique:
        return None

    roster = list(unique.values())
    logger.info(f"  USENIX: Found {len(roster)} members for {conference}{year}")
    return roster

scrape_ches_committee(year, session=None, cache_only=False)

Scrape AE committee from the CHES website.

Members are fetched from the JSON API (ches.iacr.org/{year}/json/artifact.json). If the JSON endpoint is unavailable (e.g. CHES 2022), members are parsed from the static HTML. Chairs are parsed from the JSON artifact_chairs field when available (2025+), or from (Chair) annotations in JSON member names (2024), or from the HTML page (ches.iacr.org/{year}/artifacts.php).

Returns list of {name, affiliation, role} dicts, or None if not found.

Source code in src/scrapers/scrape_committee_web.py
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
def scrape_ches_committee(year, session=None, cache_only=False):
    """Scrape AE committee from the CHES website.

    Members are fetched from the JSON API
    (``ches.iacr.org/{year}/json/artifact.json``, falling back to
    ``comm2.json``).  If no JSON endpoint is available (e.g. CHES 2022),
    members are parsed from the static HTML.
    Chairs are parsed from the JSON ``artifact_chairs`` field when available
    (2025+), or from ``(Chair)`` annotations in JSON member names (2024), or
    from the HTML page (``ches.iacr.org/{year}/artifacts.php``).

    Parameters
    ----------
    year : int
        Conference year, interpolated into the CHES URLs.
    session : requests.Session, optional
        Forwarded unchanged to ``_cached_fetch``.
    cache_only : bool
        Forwarded unchanged to ``_cached_fetch``.

    Returns
    -------
    list of {name, affiliation, role} dicts, de-duplicated by
    case-insensitive name (JSON chairs take precedence over HTML chairs,
    which take precedence over members), or None if nothing was found.
    """
    import json as _json

    def _clean(value):
        """Collapse whitespace runs; a missing or null field becomes ''."""
        return re.sub(r"\s+", " ", value or "").strip()

    members = []
    json_chairs = []

    # 1. Try JSON API for members (and chairs when available)
    json_url = f"https://ches.iacr.org/{year}/json/artifact.json"
    json_text = _cached_fetch(json_url, session=session, cache_only=cache_only)
    # Fallback: CHES 2021 uses comm2.json instead of artifact.json
    if json_text is None:
        json_url = f"https://ches.iacr.org/{year}/json/comm2.json"
        json_text = _cached_fetch(json_url, session=session, cache_only=cache_only)
    if json_text is not None:
        try:
            data = _json.loads(json_text)
            for entry in data.get("committee", []):
                name = _clean(entry.get("name", ""))
                affiliation = _clean(entry.get("affiliation", ""))
                # Detect "(Chair)" / "(Co-Chair)" embedded in the name (e.g. CHES 2024)
                chair_match = re.search(r"\s*\((?:Co-)?Chair\)\s*$", name, re.IGNORECASE)
                if chair_match:
                    name = name[: chair_match.start()].strip()
                    role = "chair"
                else:
                    role = "member"
                if name and len(name) > 1:
                    members.append({"name": name, "affiliation": affiliation, "role": role})
            # Parse artifact_chairs field (2025+)
            for entry in data.get("artifact_chairs", []):
                name = _clean(entry.get("name", ""))
                affiliation = _clean(entry.get("affiliation", ""))
                if name and len(name) > 1:
                    json_chairs.append({"name": name, "affiliation": affiliation, "role": "chair"})
        except (ValueError, KeyError, AttributeError, TypeError):
            # AttributeError/TypeError cover structurally unexpected JSON
            # (non-object root, non-dict entries, non-string fields).
            logger.warning("Failed to parse CHES committee JSON, skipping JSON source")
            # Really skip the JSON source: drop partial results so the HTML
            # member fallback below is still tried.
            members = []
            json_chairs = []

    # 2. Fetch HTML page for chairs (and fallback members if JSON failed)
    html_url = f"https://ches.iacr.org/{year}/artifacts.php"
    html_text = _cached_fetch(html_url, session=session, cache_only=cache_only)
    html_chairs = []
    if html_text is not None:
        soup = BeautifulSoup(html_text, "html.parser")

        # Parse chairs from HTML (fallback when JSON has no chair info)
        html_chairs = _scrape_ches_chairs_html(soup)

        # If JSON didn't return members, try HTML fallback (CHES 2022)
        if not members:
            members = _scrape_ches_members_html(soup)

    # Combine: JSON chairs > HTML chairs > members (dedup by name).  This now
    # also runs when only JSON data is available, so a chair present in both
    # ``artifact_chairs`` and ``committee`` is no longer returned twice.
    seen = set()
    deduped = []
    for m in json_chairs + html_chairs + members:
        key = m["name"].lower()
        if key not in seen:
            seen.add(key)
            deduped.append(m)

    if not deduped:
        return None

    if html_text is not None:
        chair_count = sum(1 for m in deduped if m["role"] == "chair")
        member_count = len(deduped) - chair_count
        logger.info(f"  CHES: Found {member_count} members + {chair_count} chair(s) for ches{year}")
    else:
        logger.info(f"  CHES: Found {len(deduped)} entries for ches{year} (JSON only)")
    return deduped

scrape_pets_committee(year, session=None, cache_only=False)

Scrape artifact review committee from PETS/PoPETs website.

PETS publishes ARC on: petsymposium.org/cfp{YY}.php Format:

Artifact Review Committee:
Name, Affiliation
Name, Affiliation
...

Returns list of {name, affiliation, role} dicts (role is always 'member'), or None if not found.

Source code in src/scrapers/scrape_committee_web.py
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
def scrape_pets_committee(year, session=None, cache_only=False):
    """Scrape the artifact review committee from the PETS/PoPETs CFP page.

    The page fetched is ``petsymposium.org/cfp{YY}.php``.  The committee is
    expected as a definition list:

        <dt><font><b>Artifact Review Committee:</b></font></dt>
        <dd>Name, <i>Affiliation</i></dd>
        <dd>Name, <i>Affiliation</i></dd>
        ...

    Parameters
    ----------
    year : int
        4-digit year; the part after the first two characters forms ``{YY}``.
    session : requests.Session, optional
        Forwarded unchanged to ``_cached_fetch``.
    cache_only : bool
        Forwarded unchanged to ``_cached_fetch``.

    Returns
    -------
    list of {name, affiliation, role} dicts (``role`` is always ``"member"``),
    or None if the page, the committee heading, or any usable entry is
    missing.
    """
    page = _cached_fetch(
        f"https://petsymposium.org/cfp{str(year)[2:]}.php",
        session=session,
        cache_only=cache_only,
    )
    if page is None:
        return None

    soup = BeautifulSoup(page, "html.parser")

    # Locate the first <dt> whose text mentions both "artifact" and "committee".
    for heading in soup.find_all("dt"):
        label = heading.get_text().lower()
        if "artifact" in label and "committee" in label:
            break
    else:
        return None

    roster = []
    # Walk the siblings after the heading; the next <dt> ends the section.
    for node in heading.next_siblings:
        tag = getattr(node, "name", None)
        if tag == "dt":
            break
        if tag != "dd":
            continue

        entry = node.get_text().strip()
        if len(entry) < 3:
            continue

        # "Name, Affiliation": split on the first comma only; without a comma
        # the whole entry is the name and the affiliation stays empty.
        raw_name, _, raw_affiliation = entry.partition(",")
        person = re.sub(r"\s+", " ", raw_name.strip()).strip().strip("*_").strip()
        organisation = re.sub(r"\s+", " ", raw_affiliation.strip()).strip().strip("*_").strip()
        if len(person) > 2:
            roster.append({"name": person, "affiliation": organisation, "role": "member"})

    if not roster:
        return None

    logger.info(f"  PETS: Found {len(roster)} members for pets{year}")
    return roster

scrape_acsac_committee(year, session=None, cache_only=False)

Scrape AE committee from the ACSAC website.

ACSAC publishes artifact committee data at two URL patterns:

- https://www.acsac.org/{year}/committees/artifact/ (2020-2022)
- https://www.acsac.org/{year}/committees/artifacts/ (2023+)

Returns list of {name, affiliation, role} dicts, or None if not found.

Source code in src/scrapers/scrape_committee_web.py
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
def scrape_acsac_committee(year, session=None, cache_only=False):
    """Scrape the AE committee from the ACSAC website.

    Two URL patterns are probed, plural slug first:
    - ``https://www.acsac.org/{year}/committees/artifacts/`` (2023+)
    - ``https://www.acsac.org/{year}/committees/artifact/``  (2020-2022)

    Parameters
    ----------
    year : int
        Conference year, interpolated into the URLs and log messages.
    session : requests.Session, optional
        Forwarded unchanged to ``_cached_fetch``.
    cache_only : bool
        Forwarded unchanged to ``_cached_fetch``.

    Returns
    -------
    list of {name, affiliation, role} dicts ordered chairs, mentors, members
    and de-duplicated by case-insensitive name, or None if not found.
    """
    # The slug changed from singular to plural in 2023, so try both.
    candidate_urls = (
        f"https://www.acsac.org/{year}/committees/artifacts/",
        f"https://www.acsac.org/{year}/committees/artifact/",
    )

    for candidate in candidate_urls:
        page = _cached_fetch(candidate, session=session, cache_only=cache_only)
        if page is not None:
            break
    else:
        logger.info("  ACSAC: No committee page found for acsac%d", year)
        return None

    soup = BeautifulSoup(page, "html.parser")

    chairs = _parse_acsac_chairs(soup)

    # Section headings differ by year:
    #   2020-2022: "Students" and "Mentors"
    #   2023+:     "Reviewers" and "Mentors"
    members = _parse_acsac_section_members(soup, ["student", "reviewer"])
    mentors = _parse_acsac_section_members(soup, ["mentor"])

    # Flat-list pages (e.g. 2019) have no section headings at all; fall back
    # to the flat-list parser in that case.
    if not (members or mentors):
        members = _parse_acsac_flat_members(soup, chairs)

    # First occurrence of each case-insensitive name wins; dict insertion
    # order keeps the chairs -> mentors -> members ordering.
    unique = {}
    for person in chairs + mentors + members:
        unique.setdefault(person["name"].lower(), person)

    if not unique:
        return None

    roster = list(unique.values())
    chair_count = sum(1 for person in roster if person["role"] == "chair")
    member_count = len(roster) - chair_count
    logger.info("  ACSAC: Found %d members + %d chair(s) for acsac%d", member_count, chair_count, year)
    return roster

scrape_sp_committee(year, session=None, cache_only=False)

Scrape AE committee data from the IEEE S&P 2026 website.

The AEC members are listed on cfartifacts.html in separate cycle tables. Artifact Evaluation Chairs are listed on index.html in the organizing committee table.

Source code in src/scrapers/scrape_committee_web.py
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
def scrape_sp_committee(year, session=None, cache_only=False):
    """Scrape AE committee data from an IEEE S&P conference website.

    AEC members come from ``cfartifacts.html`` and Artifact Evaluation Chairs
    from ``index.html``, both under ``https://sp{year}.ieee-security.org``.
    Only years listed in ``SP_KNOWN_YEARS`` are attempted.

    Parameters
    ----------
    year : int
        Conference year.
    session : requests.Session, optional
        Forwarded unchanged to ``_cached_fetch``.
    cache_only : bool
        Forwarded unchanged to ``_cached_fetch``.

    Returns
    -------
    list of member dicts (chairs first, de-duplicated by case-insensitive
    name), or None when the year is unknown or nothing was parsed.
    """
    if year not in SP_KNOWN_YEARS:
        return None

    site = f"https://sp{year}.ieee-security.org"
    # Both pages are fetched up front, before either is parsed.
    aec_page = _cached_fetch(f"{site}/cfartifacts.html", session=session, cache_only=cache_only)
    organizers_page = _cached_fetch(f"{site}/index.html", session=session, cache_only=cache_only)

    members = []
    if aec_page is not None:
        members = _scrape_sp_members_html(BeautifulSoup(aec_page, "html.parser"))

    chairs = []
    if organizers_page is not None:
        chairs = _scrape_sp_chairs_html(BeautifulSoup(organizers_page, "html.parser"))

    if not (members or chairs):
        return None

    # Chairs are listed first so they win a name clash with a member entry.
    unique = {}
    for person in chairs + members:
        unique.setdefault(person["name"].lower(), person)
    roster = list(unique.values())

    if roster:
        chair_count = sum(1 for person in roster if person["role"] == "chair")
        member_count = len(roster) - chair_count
        logger.info("  IEEE S&P: Found %d members + %d chair(s) for sp%d", member_count, chair_count, year)

    return roster or None

scrape_hotcrp_committee(conference, year, session=None, cache_only=False)

Scrape AE committee from a public HotCRP PC-list page.

Parameters

- conference : str — Conference name (e.g. 'sosp')
- year : int — 4-digit year
- session : requests.Session, optional
- cache_only : bool — If True, only return data from the disk cache.

Returns

list of {name, affiliation, role} dicts, or None if page not found

Source code in src/scrapers/scrape_committee_web.py
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
def scrape_hotcrp_committee(conference, year, session=None, cache_only=False):
    """Scrape the AE committee from a public HotCRP PC-list page.

    The page URL is looked up in ``HOTCRP_URLS`` under the key
    ``(conference.lower(), year)``; the first ``<table>`` on the page is read
    row by row (first cell: name, second cell: affiliation).

    Parameters
    ----------
    conference : str
        Conference name (e.g. 'sosp')
    year : int
        4-digit year
    session : requests.Session, optional
        Forwarded unchanged to ``_cached_fetch``.
    cache_only : bool
        If True, only return data from the disk cache.

    Returns
    -------
    list of {name, affiliation, role} dicts, or None if the URL is unknown,
    the page or its table is missing, or no usable rows were found.
    """
    listing_url = HOTCRP_URLS.get((conference.lower(), year))
    if listing_url is None:
        return None

    page = _cached_fetch(listing_url, session=session, cache_only=cache_only)
    if page is None:
        return None

    pc_table = BeautifulSoup(page, "html.parser").find("table")
    if pc_table is None:
        return None

    roster = []
    all_rows = pc_table.find_all("tr")
    for row in all_rows[1:]:  # the first row is the header
        cells = row.find_all("td")
        if len(cells) < 2:
            continue
        name_cell, affiliation_cell = cells[0], cells[1]

        # A <span class="pcrole"> whose text contains "chair" marks a chair.
        role_span = name_cell.find("span", class_="pcrole")
        role_text = role_span.get_text(strip=True) if role_span is not None else None
        role = "chair" if role_text is not None and "chair" in role_text.lower() else "member"

        # Prefer the highlighted name span; otherwise use the whole cell text
        # with the role label removed.
        highlighted = name_cell.find("span", class_="taghl")
        if highlighted is not None:
            raw_name = highlighted.get_text(strip=True)
        elif role_span is not None:
            raw_name = name_cell.get_text(strip=True).replace(role_text, "").strip()
        else:
            raw_name = name_cell.get_text(strip=True)

        person = re.sub(r"\s+", " ", raw_name).strip()
        organisation = re.sub(r"\s+", " ", affiliation_cell.get_text(strip=True)).strip()

        # Skip placeholders: empty or single-character names and "[No name]".
        if len(person) < 2 or person == "[No name]":
            continue

        roster.append({"name": person, "affiliation": organisation, "role": role})

    if not roster:
        return None

    chair_count = sum(1 for person in roster if person["role"] == "chair")
    logger.info(
        "  HotCRP: Found %d members + %d chair(s) for %s%d",
        len(roster) - chair_count,
        chair_count,
        conference,
        year,
    )
    return roster

get_alternative_committees(conferences_needed)

Fetch committees from alternative sources for conferences not in sysartifacts/secartifacts.

Parameters

- conferences_needed : dict — {conf_year_str: 'systems'|'security'}, the conferences that need data, e.g. {'fast2024': 'systems', 'usenixsec2022': 'security'}

Returns

dict of {conf_year_str: [{name, affiliation}, ...]}

Source code in src/scrapers/scrape_committee_web.py
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
def get_alternative_committees(conferences_needed):
    """Gather committees from alternative sources for the given conferences.

    For each key the sources are tried in this order: the conference-specific
    web scraper, a public HotCRP page (when the conference/year is in
    ``HOTCRP_URLS``), and finally the locally loaded committee data.

    When the ``SKIP_USENIX_SCRAPE`` environment variable is ``1``, ``true`` or
    ``yes`` (case-insensitive), the web scrapers run with ``cache_only=True``;
    the HotCRP scraper is still called with ``cache_only=False``.

    Parameters
    ----------
    conferences_needed : dict
        {conf_year_str: 'systems'|'security'} — conferences that need data.
        e.g. {'fast2024': 'systems', 'usenixsec2022': 'security'}
        Only the keys are used; keys that are not letters followed by a
        4-digit year are skipped.

    Returns
    -------
    dict of {conf_year_str: [member dict, ...]} containing only the
    conferences for which a non-empty committee was found.
    """
    skip_flag = os.getenv("SKIP_USENIX_SCRAPE", "").strip().lower()
    cache_only = skip_flag in {"1", "true", "yes"}

    results = {}
    local = _load_local_committees()
    sess = None if cache_only else _get_session()

    if cache_only:
        logger.info("  SKIP_USENIX_SCRAPE set — using cached scraping results only (no live HTTP)")

    # Scrapers that take only the year; USENIX is handled separately because
    # its scraper also needs the conference name.
    year_only_scrapers = {
        "ches": scrape_ches_committee,
        "pets": scrape_pets_committee,
        "acsac": scrape_acsac_committee,
        "sp": scrape_sp_committee,
    }
    key_pattern = re.compile(r"^([a-zA-Z]+)(\d{4})$")

    for conf_year_str in conferences_needed:
        parsed = key_pattern.match(conf_year_str)
        if parsed is None:
            continue
        conf = parsed.group(1).lower()
        year = int(parsed.group(2))

        # Web scraper (uses disk cache; cache_only=True means no live HTTP).
        if conf in USENIX_CONF_SLUGS:
            committee = scrape_usenix_committee(conf, year, session=sess, cache_only=cache_only)
        elif conf in year_only_scrapers:
            committee = year_only_scrapers[conf](year, session=sess, cache_only=cache_only)
        else:
            committee = None

        # HotCRP (public pages, not blocked by SKIP_USENIX_SCRAPE).
        if not committee and (conf, year) in HOTCRP_URLS:
            hotcrp_sess = _get_session() if sess is None else sess
            committee = scrape_hotcrp_committee(conf, year, session=hotcrp_sess, cache_only=False)

        # Last resort: local/static data (e.g. from a local pipeline run).
        if not committee:
            committee = local.get(conf_year_str)

        if committee:
            results[conf_year_str] = committee

    return results

get_all_usenix_committees(conf_regex=None)

Scrape all available USENIX conference committees.

Parameters

conf_regex : str, optional Regex to filter conference/year strings (e.g. '.20[2][0-5]')

Returns

dict of {conf_year_str: [{name, affiliation}, ...]}

Source code in src/scrapers/scrape_committee_web.py
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
def get_all_usenix_committees(conf_regex=None):
    """Scrape every USENIX conference/year listed in ``USENIX_KNOWN_YEARS``.

    Parameters
    ----------
    conf_regex : str, optional
        Regex applied with ``re.search`` to each ``"{conf}{year}"`` string;
        non-matching entries are skipped.  When empty or None, nothing is
        filtered out.

    Returns
    -------
    dict of {conf_year_str: [member dict, ...]} holding only the entries for
    which ``scrape_usenix_committee`` returned a non-empty result.
    """
    found = {}
    http = _get_session()

    # Flatten the {conf: years} mapping into (conf, year, label) triples.
    candidates = (
        (conf, year, f"{conf}{year}")
        for conf, years in USENIX_KNOWN_YEARS.items()
        for year in years
    )
    for conf, year, label in candidates:
        if conf_regex and re.search(conf_regex, label) is None:
            continue
        roster = scrape_usenix_committee(conf, year, session=http)
        if roster:
            found[label] = roster

    return found