Parse artifact evaluation results from sysartifacts/secartifacts GitHub repos.
Downloads and parses results.md (or result.md) files that contain
YAML front-matter with artifact metadata (titles, badges, URLs). Also
handles the HTML-table variant used by some conferences (OSDI, ATC, etc.)
where badges are encoded as <span> tags inside markdown tables.
Public API
get_ae_results(conf_regex, prefix) -> dict[str, list[dict]]
parse_html_results(content)
Parse HTML-table-based results pages (used by OSDI, ATC, etc.).
These pages use `<span>` tags with ids to indicate badges:
span#aa = Available, span#af = Functional, span#rr = Reproduced
And markdown tables with rows like:
`| [Title](url) | <span id="aa">AVAILABLE</span>... | [Github](url) |`
Source code in src/scrapers/parse_results_md.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
def parse_html_results(content):
    """
    Parse HTML-table-based results pages (used by OSDI, ATC, etc.).

    These pages use <span> tags with ids to indicate badges:
        span#aa = Available, span#af = Functional, span#rr = Reproduced
    And markdown tables with rows like:
        | [Title](url) | <span id="aa">AVAILABLE</span>... | [Github](url) |

    Args:
        content: Page text containing ``<tr>``/``<td>`` table markup.

    Returns:
        A list of dicts, one per table row that has a title and at least one
        badge or URL. Every dict has ``title`` and ``badges`` (comma-joined,
        possibly empty); ``paper_url``, ``repository_url`` and
        ``artifact_url`` are present only when a value was found.
    """
    badge_by_id = {"aa": "available", "af": "functional", "rr": "reproduced"}
    repo_text_marks = ("github", "gitlab", "bitbucket")
    repo_href_marks = ("github.com", "gitlab", "bitbucket.org")
    archive_text_marks = ("zenodo", "figshare")
    archive_href_marks = ("zenodo.org", "figshare.com", "doi.org")

    artifacts = []
    # Use BeautifulSoup to parse the HTML fragments within the markdown
    soup = BeautifulSoup(content, "html.parser")
    for row in soup.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) < 2:
            continue

        # First cell: title, preferably the text of its link.
        title_cell = cells[0]
        title_link = title_cell.find("a")
        title_source = title_link if title_link else title_cell
        title = title_source.get_text(strip=True)
        if not title or title.lower() == "paper title":
            continue
        paper_url = ""
        if title_link and title_link.get("href"):
            paper_url = title_link["href"]

        # Second cell: one badge per <span>. The span id is authoritative;
        # the visible text is only consulted when the id is unknown.
        badges = []
        for span in cells[1].find_all("span"):
            badge = badge_by_id.get((span.get("id") or "").lower())
            if badge is None:
                span_text = span.get_text(strip=True).lower()
                if "available" in span_text:
                    badge = "available"
                elif "functional" in span_text:
                    badge = "functional"
                elif "reproduc" in span_text or "replicated" in span_text:
                    badge = "reproduced"
            # Nested or repeated spans must not produce duplicate badges.
            if badge and badge not in badges:
                badges.append(badge)

        # Third cell (optional): repository and archive links.
        repo_url = ""
        artifact_url = ""
        if len(cells) > 2:
            for link in cells[2].find_all("a"):
                href = link.get("href") or ""
                if not href:
                    # An anchor without a target carries no URL and must not
                    # overwrite one that was already found.
                    continue
                link_text = link.get_text(strip=True).lower()
                target = href.lower()
                if any(m in link_text for m in repo_text_marks) or any(
                    m in target for m in repo_href_marks
                ):
                    repo_url = href
                elif any(m in link_text for m in archive_text_marks) or any(
                    m in target for m in archive_href_marks
                ):
                    artifact_url = href
                elif not repo_url:
                    # Unrecognised host: keep it as the repository fallback.
                    repo_url = href

        if not (badges or repo_url or artifact_url):
            continue
        artifact = {
            "title": title,
            "badges": ",".join(badges),
        }
        if paper_url:
            artifact["paper_url"] = paper_url
        if repo_url:
            artifact["repository_url"] = repo_url
        if artifact_url:
            artifact["artifact_url"] = artifact_url
        artifacts.append(artifact)
    return artifacts
|
parse_markdown_table_results(content)
Fallback parser for markdown tables that weren't converted to HTML.
Parses raw markdown rows like:
`| [Title](url) | <span id="aa">AVAILABLE</span>... | [Github](url) |`
Source code in src/scrapers/parse_results_md.py
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
def parse_markdown_table_results(content):
    """
    Fallback parser for markdown tables that weren't converted to HTML.

    Parses raw markdown rows like:
        | [Title](url) | <span id="aa">AVAILABLE</span>... | [Github](url) |

    Args:
        content: Raw markdown text; only lines starting with ``|`` are read.

    Returns:
        A list of dicts, one per row whose first cell holds a ``[title]``
        link and that has at least one badge or URL. Every dict has
        ``title`` and ``badges`` (comma-joined, possibly empty);
        ``paper_url``, ``repository_url`` and ``artifact_url`` are present
        only when a value was found.
    """
    # Each badge is recognised by its span id or by the text between tags,
    # case-insensitively, matching what the HTML parser accepts.
    badge_patterns = (
        ("available", r"""\bid\s*=\s*["']?aa["'\s>]|>[^<]*available[^<]*<"""),
        ("functional", r"""\bid\s*=\s*["']?af["'\s>]|>[^<]*functional[^<]*<"""),
        (
            "reproduced",
            r"""\bid\s*=\s*["']?rr["'\s>]|>[^<]*(?:reproduc|replicated)[^<]*<""",
        ),
    )
    repo_text_marks = ("github", "gitlab", "bitbucket")
    repo_href_marks = ("github.com", "gitlab", "bitbucket.org")
    archive_text_marks = ("zenodo", "figshare", "doi")
    archive_href_marks = ("zenodo.org", "figshare.com", "doi.org")

    artifacts = []
    for raw_line in content.split("\n"):
        line = raw_line.strip()
        if not line.startswith("|"):
            continue
        # Drop the outer pipes. The closing pipe is optional in markdown, so
        # only remove it when it is really there; otherwise the last cell
        # (usually the repository link) would be lost.
        inner = line[1:]
        if inner.endswith("|"):
            inner = inner[:-1]
        cells = [cell.strip() for cell in inner.split("|")]
        if len(cells) < 2:
            continue
        # Skip the header separator row (e.g. |:--|:-:|--:|) without
        # rejecting data rows that merely contain ":-" somewhere.
        if all(re.fullmatch(r":?-+:?", cell) for cell in cells):
            continue

        # First cell: "[Title]" optionally followed by "(paper url)".
        title_match = re.search(r"\[([^\]]+)\](?:\(([^)\s]+)\))?", cells[0])
        if not title_match:
            continue
        title = title_match.group(1).strip()
        if title.lower() in ("paper title", ""):
            continue
        paper_url = title_match.group(2) or ""

        # Second cell: badges encoded as <span> tags.
        badges = [
            badge
            for badge, pattern in badge_patterns
            if re.search(pattern, cells[1], re.IGNORECASE)
        ]

        # Third cell (optional): repository and archive links.
        repo_url = ""
        artifact_url = ""
        if len(cells) > 2:
            url_cell = cells[2]
            for link_text, href in re.findall(r"\[([^\]]*)\]\(([^)]+)\)", url_cell):
                href = href.strip()
                text = link_text.lower()
                target = href.lower()
                if any(m in text for m in repo_text_marks) or any(
                    m in target for m in repo_href_marks
                ):
                    repo_url = href
                elif any(m in text for m in archive_text_marks) or any(
                    m in target for m in archive_href_marks
                ):
                    artifact_url = href
                elif not repo_url:
                    # Unrecognised host: keep it as the repository fallback.
                    repo_url = href
            # Also check for bare URLs; ")" is excluded so a URL sitting
            # inside a markdown link does not drag the closing paren along.
            if not repo_url:
                bare_urls = re.findall(r"(https?://github\.com/[^\s<|)]+)", url_cell)
                if bare_urls:
                    repo_url = bare_urls[0]

        if not (badges or repo_url or artifact_url):
            continue
        artifact = {
            "title": title,
            "badges": ",".join(badges),
        }
        if paper_url:
            artifact["paper_url"] = paper_url
        if repo_url:
            artifact["repository_url"] = repo_url
        if artifact_url:
            artifact["artifact_url"] = artifact_url
        artifacts.append(artifact)
    return artifacts
|