Skip to content

scraping

src.generators.committee_stats.scraping

Committee scraping/loading.

Fetches AE committee data from sysartifacts and secartifacts GitHub repos, falls back to web scrapers / local YAML for conferences whose committees are missing or incomplete, and cleans the resulting member lists (placeholder removal, markdown/HTML stripping, affiliation normalization).

scrape_committees(conf_regex: str) -> tuple[dict, dict]

Scrape committees from sysartifacts/secartifacts and alternative sources.

Parameters

conf_regex : str — Regex matching conference-year names (e.g. `.*20[12][0-9]`).

Returns

(all_results, conf_to_area) — `all_results` maps `conf_year` to a list of cleaned member dicts; `conf_to_area` maps `conf_year` to "systems" / "security" / "unknown", preferring the source repo over the fallback heuristic.

Source code in src/generators/committee_stats/scraping.py
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
def scrape_committees(conf_regex: str) -> tuple[dict, dict]:
    """Scrape committees from sysartifacts/secartifacts and alternative sources.

    The function works in four steps:

    1. Fetch committees from the ``sys`` and ``sec`` sources and merge them.
       On a key collision the ``sec`` entry overrides the ``sys`` entry.
    2. Clean every member list with ``_clean_committee``.
    3. Collect every conference-year known to either source or to the local
       committees file, and ask ``get_alternative_committees`` for those whose
       committee is absent or fails ``_is_valid_committee``. A non-empty
       cleaned alternative replaces the existing entry.
    4. Build the area map for every conference-year in the final result.

    Parameters
    ----------
    conf_regex : str
        Regex matching conference-year names (e.g. ``.*20[12][0-9]``).
        Applied with ``re.search``, so it is not anchored unless the pattern
        anchors itself.

    Returns
    -------
    (all_results, conf_to_area)
        ``all_results`` maps ``conf_year`` to a list of cleaned member dicts.
        ``conf_to_area`` maps ``conf_year`` to ``"systems"`` / ``"security"`` /
        ``"unknown"``, preferring the source repo over the fallback heuristic.
    """
    logger.info("  Scraping systems committee data from sysartifacts...")
    sys_results = get_committees(conf_regex, "sys")
    logger.info(f"    Found {len(sys_results)} systems conference-years")

    logger.info("  Scraping security committee data from secartifacts...")
    sec_results = get_committees(conf_regex, "sec")
    logger.info(f"    Found {len(sec_results)} security conference-years")

    # Merge order matters: sec entries win over sys entries with the same key.
    all_results: dict = {}
    all_results.update(sys_results)
    all_results.update(sec_results)

    # Clean all results (remove placeholders, fix markdown links).
    all_results = {cy: _clean_committee(members) for cy, members in all_results.items()}

    # Supplement with alternative sources for missing/invalid committees.
    logger.info("  Checking for conferences needing alternative sources...")

    all_conf_dirs: set[str] = set()
    for prefix in ("sys", "sec"):
        # The listing helper may return a falsy value; treat that as "no entries".
        for entry in get_conferences_from_prefix(prefix) or []:
            name = entry.get("name", "")
            if re.search(conf_regex, name):
                all_conf_dirs.add(name)

    # Also include conferences from local_committees.yaml so that conferences
    # not yet merged into sysartifacts/secartifacts are still discovered.
    for cy in _load_local_committees():
        if re.search(conf_regex, cy):
            all_conf_dirs.add(cy)

    # A conference-year needs an alternative source when it has no scraped
    # committee at all, or when the scraped one does not pass validation.
    conferences_needed: dict[str, str] = {
        cy: _conf_area(cy)
        for cy in sorted(all_conf_dirs)
        if not (cy in all_results and _is_valid_committee(all_results[cy]))
    }

    if conferences_needed:
        logger.info(f"    Need alternative sources for {len(conferences_needed)} conference-years:")
        for cy in sorted(conferences_needed):
            existing = len(all_results.get(cy, []))
            logger.info(f"      {cy} (currently {existing} members)")

        alt_results = get_alternative_committees(conferences_needed)
        for cy, members in alt_results.items():
            cleaned = _clean_committee(members)
            # Only replace when the alternative actually yielded members;
            # otherwise keep whatever was scraped, even if it is incomplete.
            if cleaned:
                existing_count = len(all_results.get(cy, []))
                all_results[cy] = cleaned
                # Separate the old and new counts; without a separator the two
                # numbers run together (e.g. 3 and 25 would print as "325").
                logger.info(
                    f"    ✓ {cy}: replaced {existing_count} → {len(cleaned)} members (alternative source)"
                )
    else:
        logger.info("    All conference-years have valid committee data.")

    # Build area map – prefer the source prefix over the fallback list so that
    # newly added conferences are classified correctly. A conference-year
    # present in both sources is labelled "systems" because sys is checked first.
    conf_to_area: dict[str, str] = {}
    for cy in all_results:
        if cy in sys_results:
            conf_to_area[cy] = "systems"
        elif cy in sec_results:
            conf_to_area[cy] = "security"
        else:
            conf_to_area[cy] = _conf_area(cy)

    return all_results, conf_to_area