Scrape ACSAC conference artifact evaluation pages for paper titles,
badges, and artifact URLs.
ACSAC publishes artifact evaluation results at
https://www.acsac.org//program/artifacts/
The page groups papers under badge images (code_available, code_reviewed,
code_reproducible) in bullet lists. Each bullet contains the paper title
and one or more links to the artifact.
Usage as a library
from src.scrapers.acsac_scrape import scrape_acsac_artifacts
artifacts = scrape_acsac_artifacts(2025)
Usage standalone
python -m src.scrapers.acsac_scrape --years 2024,2025
scrape_acsac_artifacts(year, session=None)
Scrape the ACSAC artifact evaluation page for a given year.
Parameters:
| Name |
Type |
Description |
Default |
year
|
|
|
required
|
session
|
|
Optional requests.Session to reuse.
|
None
|
Returns:
| Type |
Description |
|
|
List of dicts with keys:
- title (str): paper title
- badges (list[str]): e.g. ['available'] or ['available', 'reviewed']
- artifact_urls (list[str]): artifact link(s)
- paper_url (str): always empty (ACSAC pages don't link to papers)
|
Note: ACSAC awards each paper exactly one badge level (the highest),
so each artifact appears once under its highest badge section.
Source code in src/scrapers/acsac_scrape.py
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155 | def scrape_acsac_artifacts(year, session=None):
"""
Scrape the ACSAC artifact evaluation page for a given year.
Args:
year: Conference year (int).
session: Optional requests.Session to reuse.
Returns:
List of dicts with keys:
- title (str): paper title
- badges (list[str]): e.g. ['available'] or ['available', 'reviewed']
- artifact_urls (list[str]): artifact link(s)
- paper_url (str): always empty (ACSAC pages don't link to papers)
Note: ACSAC awards each paper exactly one badge level (the highest),
so each artifact appears once under its highest badge section.
"""
if session is None:
session = create_session()
url = ACSAC_ARTIFACTS_URL.format(year=year)
logger.info("Fetching %s", url)
resp = session.get(url, timeout=30)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")
content = soup.find("main") or soup.find("article") or soup.body
if content is None:
logger.warning("Could not find main content element for ACSAC %d", year)
return []
# First pass: collect raw entries with their single badge
raw_entries = []
current_badge = None
for element in content.descendants:
# Detect badge section from badge images.
# NOTE: some ACSAC pages have incorrect alt text (e.g. "Code Available"
# for the reviewed badge), so we match on src first, falling back to alt.
if element.name == "img":
src = element.get("src", "").lower()
alt = element.get("alt", "").lower().replace(" ", "_")
matched = False
for fragment, badge_name in _BADGE_MAP.items():
if fragment in src:
current_badge = badge_name
matched = True
break
if not matched:
for fragment, badge_name in _BADGE_MAP.items():
if fragment in alt:
current_badge = badge_name
break
# Parse paper entries from list items.
# NOTE: ACSAC pages use malformed HTML where <li> elements are nested
# (no closing </li> tags), so element.get_text() would include all
# subsequent entries. We only collect direct children up to the first
# child <li>.
if element.name == "li" and current_badge:
title_parts = []
artifact_urls = []
for child in element.children:
if getattr(child, "name", None) == "li":
break
if getattr(child, "name", None) == "a":
href = child.get("href", "").strip()
if href and not href.startswith("#"):
artifact_urls.append(_strip_tokens(href))
elif isinstance(child, str):
title_parts.append(child.strip())
title = " ".join(p for p in title_parts if p).strip()
title = re.sub(r"^Title:\s*", "", title)
if title:
raw_entries.append(
{
"title": title,
"badge": current_badge,
"artifact_urls": artifact_urls,
}
)
# Convert to the common artifact format (list of badges per paper)
# ACSAC gives each paper its highest badge, so badges is a single-element list
artifacts = []
for entry in raw_entries:
artifacts.append(
{
"title": entry["title"],
"badges": [entry["badge"]],
"artifact_urls": entry["artifact_urls"],
"paper_url": "",
}
)
return artifacts
|