Generate artifact storage source statistics.
Counts artifacts by storage source (GitHub, Zenodo, Figshare, OSF, etc.)
and creates both summary data and detailed CSV for visualization.
Usage:
python generate_artifact_sources_table.py --conf_regex '.*20[12][0-9]' --output_dir ../acm-rep-2026-paper/reproducibility
extract_source(url)
Determine the source of an artifact from its URL.
Source code in src/generators/generate_artifact_sources_table.py
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def extract_source(url):
    """Determine the hosting source of an artifact from its URL.

    Args:
        url: Artifact URL string; may be empty or None.

    Returns:
        A source label such as "GitHub" or "Zenodo"; "unknown" for a
        missing URL, "DOI" for an unresolvable doi.org link, and
        "Other" when no known host matches.
    """
    if not url:
        return "unknown"
    url_lower = url.lower()
    # Ordered substring -> label rules. Order matters: the bare
    # "archive" pattern is a catch-all that must come after the more
    # specific "archive.org" rule. (The original also tested
    # "zenodo.org", which is subsumed by "zenodo" and was dropped.)
    rules = [
        (("github.com", "github.io"), "GitHub"),
        (("zenodo",), "Zenodo"),
        (("figshare",), "Figshare"),
        (("osf.io",), "OSF"),
        (("gitlab",), "GitLab"),
        (("bitbucket",), "Bitbucket"),
        # NOTE(review): arXiv links are grouped under "Archive.org" —
        # preserved from the original classification; confirm intent.
        (("archive.org", "arxiv"), "Archive.org"),
        (("dataverse",), "Dataverse"),
        (("archive",), "Archive site"),  # generic archive fallback
    ]
    for needles, label in rules:
        if any(needle in url_lower for needle in needles):
            return label
    if "doi.org" in url_lower:
        # Try to resolve the DOI prefix to an actual repository host.
        resolved = _resolve_doi_prefix(url_lower)
        return resolved if resolved else "DOI"
    return "Other"
|
get_artifact_url(artifact)
Extract the first valid URL from an artifact.
Source code in src/generators/generate_artifact_sources_table.py
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
def get_artifact_url(artifact):
    """Return the first valid normalized URL from an artifact, or None.

    Prefers the canonical ``artifact_urls`` list; falls back to the
    legacy single-URL fields for old-format records.
    """
    # Canonical new-format list takes precedence.
    canonical = artifact.get("artifact_urls", [])
    if isinstance(canonical, list):
        for raw in canonical:
            cleaned = _normalise_url(raw)
            if cleaned:
                return cleaned
    # Legacy single-URL fields, checked in priority order.
    legacy_keys = (
        "repository_url",
        "artifact_url",
        "github_url",
        "second_repository_url",
        "bitbucket_url",
    )
    for key in legacy_keys:
        value = artifact.get(key, "")
        if isinstance(value, list):
            # Only the first entry of a legacy list is considered.
            value = value[0] if value else ""
        cleaned = _normalise_url(value)
        if cleaned:
            return cleaned
    return None
|
get_artifact_urls(artifact)
Extract all normalized URLs from an artifact.
Source code in src/generators/generate_artifact_sources_table.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
def get_artifact_urls(artifact):
    """Return all normalized URLs from an artifact, deduplicated in order."""
    collected = []
    # Canonical new-format list.
    canonical = artifact.get("artifact_urls", [])
    if isinstance(canonical, list):
        for raw in canonical:
            cleaned = _normalise_url(raw)
            if cleaned:
                collected.append(cleaned)
    # Legacy single-URL fields, consulted only when the canonical list
    # produced nothing (old-format data).
    if not collected:
        legacy_keys = (
            "repository_url",
            "artifact_url",
            "github_url",
            "second_repository_url",
            "bitbucket_url",
        )
        for key in legacy_keys:
            value = artifact.get(key, "")
            entries = value if isinstance(value, list) else [value]
            for entry in entries:
                cleaned = _normalise_url(entry)
                if cleaned:
                    collected.append(cleaned)
    # Deduplicate while preserving first-seen order.
    return list(dict.fromkeys(collected))
|
count_sources_by_conference(all_results: dict[str, list[dict]]) -> dict[str, int]
Count artifacts by source for each conference.
Source code in src/generators/generate_artifact_sources_table.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
def count_sources_by_conference(all_results: dict[str, list[dict]]) -> dict[str, dict[str, int]]:
    """Count artifacts by source for each conference.

    Args:
        all_results: Mapping of conference-year keys (e.g. "sosp2023")
            to lists of artifact dicts.

    Returns:
        Mapping of conference name (the upper-cased alphabetic prefix
        of each key) to a per-source counter, plus an "overall" entry
        aggregating every conference. Each counter also carries a
        "total" count of artifacts that had at least one URL.
        (Original annotation said ``dict[str, int]``; the function has
        always returned nested counters.)
    """
    stats = defaultdict(lambda: defaultdict(int))
    # Ensure "overall" is present even when there is no input data.
    stats["overall"] = defaultdict(int)
    for conf_year, artifacts in all_results.items():
        # Conference name is the leading alphabetic prefix of the key.
        match = re.match(r"^([a-zA-Z]+)", conf_year)
        if not match:
            continue
        conf_name = match.group(1).upper()
        for artifact in artifacts:
            urls = get_artifact_urls(artifact)
            # An artifact may span multiple sources; count each once.
            sources = {extract_source(url) for url in urls} if urls else {"unknown"}
            for source in sources:
                stats[conf_name][source] += 1
                stats["overall"][source] += 1
            if urls:
                stats[conf_name]["total"] += 1
                stats["overall"]["total"] += 1
    return dict(stats)
|
count_sources_by_area(all_results: dict[str, list[dict]]) -> dict[str, int]
Count artifacts by source for systems vs security.
Source code in src/generators/generate_artifact_sources_table.py
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
def count_sources_by_area(all_results: dict[str, list[dict]]) -> dict:
    """Count artifacts by source for systems vs security conferences.

    Args:
        all_results: Mapping of conference-year keys (e.g. "sosp2023")
            to lists of artifact dicts.

    Returns:
        Dict with four keys:
            "systems" / "security": per-source counts for that area,
                including a "total" of artifacts with at least one URL;
            "systems_no_source" / "security_no_source": counts of
                artifacts that had no usable URL.
        (Original annotation said ``dict[str, int]``; the function has
        always returned this mixed structure.)
    """
    sys_sources = defaultdict(int)
    sec_sources = defaultdict(int)
    sys_no_source = 0
    sec_no_source = 0
    for conf_year, artifacts in all_results.items():
        # Conference name is the leading alphabetic prefix of the key.
        match = re.match(r"^([a-zA-Z]+)", conf_year)
        if not match:
            continue
        conf_name = match.group(1).upper()
        area = conf_area(conf_name)
        if area == "systems":
            target_dict = sys_sources
            is_systems = True
        elif area == "security":
            target_dict = sec_sources
            is_systems = False
        elif "security" in conf_year.lower():
            # Unknown area: infer security from the key itself.
            target_dict = sec_sources
            is_systems = False
        else:
            # Unknown conferences default to systems.
            target_dict = sys_sources
            is_systems = True
        for artifact in artifacts:
            urls = get_artifact_urls(artifact)
            if urls:
                # An artifact may span multiple sources; count each once.
                for source in {extract_source(url) for url in urls}:
                    target_dict[source] += 1
                target_dict["total"] += 1
            else:
                # Artifacts without any URL are tallied separately.
                if is_systems:
                    sys_no_source += 1
                else:
                    sec_no_source += 1
    return {
        "systems": dict(sys_sources),
        "security": dict(sec_sources),
        "systems_no_source": sys_no_source,
        "security_no_source": sec_no_source,
    }
|
count_sources_overall(all_results)
Count artifacts by source overall.
Source code in src/generators/generate_artifact_sources_table.py
217
218
219
220
221
222
223
224
225
226
227
228
229
def count_sources_overall(all_results):
    """Count artifacts by source across all conferences combined."""
    counter = defaultdict(int)
    for artifacts in all_results.values():
        for artifact in artifacts:
            artifact_urls = get_artifact_urls(artifact)
            # An artifact may span multiple sources; count each once.
            if artifact_urls:
                labels = {extract_source(u) for u in artifact_urls}
            else:
                labels = {"unknown"}
            for label in labels:
                counter[label] += 1
            if artifact_urls:
                counter["total"] += 1
    return dict(counter)
|