Generate artifact storage source statistics over time.
Tracks how GitHub, Zenodo, and other platforms have changed over the years
for artifact evaluation repositories.
Usage:
    python generate_artifact_sources_timeline.py --output_dir ../acm-rep-2026-paper/reproducibility
extract_source(url)
Determine the source of an artifact from its URL.
Source code in src/generators/generate_artifact_sources_timeline.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def extract_source(url):
    """Determine the hosting platform of an artifact from its URL.

    Patterns are checked in priority order, so a URL matching several
    platforms is attributed to the first match (e.g. a GitHub Pages
    mirror of a Zenodo record counts as GitHub).

    Args:
        url: artifact URL string, possibly empty or None.

    Returns:
        None for a falsy URL, a platform name for a recognised host,
        the resolved repository (or "DOI") for doi.org links, and
        "Other" for everything else.
    """
    if not url:
        return None
    url_lower = url.lower()
    # Ordered (substring patterns, label) pairs; first match wins.
    # NOTE: "zenodo" alone suffices — it also matches "zenodo.org",
    # which the previous version redundantly tested a second time.
    platform_patterns = [
        (("github.com", "github.io"), "GitHub"),
        (("zenodo",), "Zenodo"),
        (("figshare",), "Figshare"),
        (("osf.io",), "OSF"),
        (("gitlab",), "GitLab"),
        (("bitbucket",), "Bitbucket"),
        (("archive.org", "arxiv"), "Archive"),
        (("dataverse",), "Dataverse"),
    ]
    for patterns, label in platform_patterns:
        if any(p in url_lower for p in patterns):
            return label
    if "doi.org" in url_lower:
        # Try to resolve the DOI prefix to the actual hosting repository;
        # fall back to the generic "DOI" bucket when resolution fails.
        resolved = _resolve_doi_prefix(url_lower)
        return resolved if resolved else "DOI"
    return "Other"
|
get_artifact_url(artifact)
Extract the first valid URL from an artifact.
Source code in src/generators/generate_artifact_sources_timeline.py
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
def get_artifact_url(artifact):
    """Return the first valid URL found in *artifact*, or None.

    Prefers the canonical ``artifact_urls`` list (new record format);
    falls back to a set of legacy single-URL fields for older records.
    """
    # New format: artifact_urls is the canonical list of candidates.
    candidates = artifact.get("artifact_urls", [])
    if isinstance(candidates, list):
        for candidate in candidates:
            normalised = _normalise_url(candidate)
            if normalised:
                return normalised
    # Legacy single-URL fields, checked in priority order. Some legacy
    # records store a list here as well, in which case the head is used.
    legacy_keys = (
        "repository_url",
        "artifact_url",
        "github_url",
        "second_repository_url",
        "bitbucket_url",
    )
    for key in legacy_keys:
        value = artifact.get(key, "")
        if isinstance(value, list):
            value = value[0] if value else ""
        normalised = _normalise_url(value)
        if normalised:
            return normalised
    return None
|
extract_year_from_confname(conf_year_str)
Extract year from conference name like 'osdi2024' -> 2024.
Returns None if no year found.
Source code in src/generators/generate_artifact_sources_timeline.py
99
100
101
102
103
104
105
106
def extract_year_from_confname(conf_year_str):
    """Extract the trailing four-digit year from a conference name.

    For example 'osdi2024' -> 2024. Returns None when the name does
    not end with exactly four digits.
    """
    found = re.search(r"(\d{4})$", conf_year_str)
    return int(found.group(1)) if found else None
|
count_sources_by_year(all_results: dict[str, list[dict]]) -> dict[int, dict[str, int]]
Count artifacts by source for each year.
Returns dict: year -> {source: count}
Source code in src/generators/generate_artifact_sources_timeline.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
def count_sources_by_year(all_results: dict[str, list[dict]]) -> dict[int, dict[str, int]]:
    """
    Count artifacts by source platform for each year.

    Args:
        all_results: mapping of conference-year name (e.g. 'osdi2024')
            to the list of artifact records for that venue.

    Returns:
        dict mapping year -> {source: count}. Venues whose name carries
        no trailing four-digit year, and artifacts whose source cannot
        be determined, are skipped.

    Note: the previous annotation claimed ``dict[int, int]``; the
    function has always returned the nested per-source mapping.
    """
    stats: defaultdict = defaultdict(lambda: defaultdict(int))
    for conf_year, artifacts in all_results.items():
        year = extract_year_from_confname(conf_year)
        if year is None:  # venue name has no recognisable year
            continue
        for artifact in artifacts:
            url = get_artifact_url(artifact)
            source = extract_source(url)
            if source:
                stats[year][source] += 1
    return dict(stats)
|
generate_csv(output_dir)
Generate CSV file with artifact sources by year.
Source code in src/generators/generate_artifact_sources_timeline.py
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
def generate_csv(output_dir):
    """Generate CSV file with artifact sources by year.

    Fetches artifact-evaluation results for both communities, tallies
    per-year source counts, writes them to ``fig_sources_over_time.csv``
    under *output_dir*, logs a summary table, and returns the CSV path.
    """
    # Fetch artifact-evaluation results from both communities and merge.
    logger.info("Fetching artifact evaluation results...")
    sys_results = get_ae_results(r".*20[12][0-9]", "sys")
    sec_results = get_ae_results(r".*20[12][0-9]", "sec")
    all_results = {**sys_results, **sec_results}

    # Tally per-year counts and derive the full set of source columns.
    stats_by_year = count_sources_by_year(all_results)
    years = sorted(stats_by_year)
    all_sources = set()
    for per_year in stats_by_year.values():
        all_sources.update(per_year)

    # Column order: GitHub, Zenodo, Other first; the rest alphabetical.
    rank = {"GitHub": 0, "Zenodo": 1, "Other": 2}
    sources = sorted(all_sources, key=lambda name: (rank.get(name, 3), name))

    # Write one row per year with a count for every source column.
    os.makedirs(output_dir, exist_ok=True)
    csv_path = os.path.join(output_dir, "fig_sources_over_time.csv")
    with open(csv_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["Year"] + sources)
        writer.writeheader()
        for year in years:
            row = {"Year": year}
            row.update({src: stats_by_year[year].get(src, 0) for src in sources})
            writer.writerow(row)
    logger.info(f"✓ Generated {csv_path}")

    # Log a human-readable summary table of the same data.
    logger.info("\nArtifact sources over time:")
    logger.info(f"{'Year':<6} {' '.join(f'{s:>10}' for s in sources)}")
    for year in years:
        counts = [str(stats_by_year[year].get(s, 0)) for s in sources]
        logger.info(f"{year:<6} {' '.join(f'{c:>10}' for c in counts)}")
    return csv_path
|