Skip to content

generate_artifact_sources_table

src.generators.generate_artifact_sources_table

Generate artifact storage source statistics.

Counts artifacts by storage source (GitHub, Zenodo, Figshare, OSF, etc.) and creates both summary data and detailed CSV for visualization.

Usage

python generate_artifact_sources_table.py --conf_regex '.*20[12][0-9]' --output_dir ../acm-rep-2026-paper/reproducibility

extract_source(url)

Determine the source of an artifact from its URL.

Source code in src/generators/generate_artifact_sources_table.py
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
def extract_source(url):
    """Determine the hosting source of an artifact from its URL.

    Returns a human-readable source name (e.g. "GitHub", "Zenodo"),
    "unknown" for an empty/missing URL, and "Other" when no known host
    substring matches.

    Note: match order matters — "archive.org" and "dataverse" must be
    tested before the generic "archive" catch-all, and "doi.org" is
    checked last among known hosts so resolvable DOIs can be mapped to
    their underlying repository.
    """
    if not url:
        return "unknown"

    url_lower = url.lower()

    if "github.com" in url_lower or "github.io" in url_lower:
        return "GitHub"
    # "zenodo" already covers "zenodo.org" (and sandbox hosts); the old
    # second condition was redundant.
    if "zenodo" in url_lower:
        return "Zenodo"
    if "figshare" in url_lower:
        return "Figshare"
    if "osf.io" in url_lower:
        return "OSF"
    if "gitlab" in url_lower:
        return "GitLab"
    if "bitbucket" in url_lower:
        return "Bitbucket"
    # NOTE(review): arXiv URLs are grouped under "Archive.org" — confirm
    # this bucketing is intentional for the statistics.
    if "archive.org" in url_lower or "arxiv" in url_lower:
        return "Archive.org"
    if "dataverse" in url_lower:
        return "Dataverse"
    # Generic catch-all for other self-described archive sites.
    if "archive" in url_lower:
        return "Archive site"
    if "doi.org" in url_lower:
        # Try to resolve DOI to actual repository
        resolved = _resolve_doi_prefix(url_lower)
        return resolved if resolved else "DOI"
    return "Other"

get_artifact_url(artifact)

Extract the first valid URL from an artifact.

Source code in src/generators/generate_artifact_sources_table.py
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
def get_artifact_url(artifact):
    """Extract the first valid URL from an artifact."""
    # Preferred modern schema: "artifact_urls" holds the canonical list.
    canonical = artifact.get("artifact_urls", [])
    if isinstance(canonical, list):
        for raw in canonical:
            cleaned = _normalise_url(raw)
            if cleaned:
                return cleaned

    # Older records stored URLs under assorted per-source keys; take the
    # first one that normalizes to something usable.
    legacy_keys = (
        "repository_url",
        "artifact_url",
        "github_url",
        "second_repository_url",
        "bitbucket_url",
    )
    for key in legacy_keys:
        raw = artifact.get(key, "")
        if isinstance(raw, list):
            raw = raw[0] if raw else ""
        cleaned = _normalise_url(raw)
        if cleaned:
            return cleaned

    return None

get_artifact_urls(artifact)

Extract all normalized URLs from an artifact.

Source code in src/generators/generate_artifact_sources_table.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
def get_artifact_urls(artifact):
    """Extract all normalized URLs from an artifact."""
    collected = []

    # Preferred modern schema: "artifact_urls" holds the canonical list.
    canonical = artifact.get("artifact_urls", [])
    if isinstance(canonical, list):
        for raw in canonical:
            cleaned = _normalise_url(raw)
            if cleaned:
                collected.append(cleaned)

    # Old-format records: gather from the historical per-source keys,
    # but only when the canonical list yielded nothing.
    if not collected:
        legacy_keys = (
            "repository_url",
            "artifact_url",
            "github_url",
            "second_repository_url",
            "bitbucket_url",
        )
        for key in legacy_keys:
            raw = artifact.get(key, "")
            candidates = raw if isinstance(raw, list) else [raw]
            for item in candidates:
                cleaned = _normalise_url(item)
                if cleaned:
                    collected.append(cleaned)

    # Deduplicate while preserving first-seen order.
    return list(dict.fromkeys(collected))

count_sources_by_conference(all_results: dict[str, list[dict]]) -> dict[str, int]

Count artifacts by source for each conference.

Source code in src/generators/generate_artifact_sources_table.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
def count_sources_by_conference(all_results: dict[str, list[dict]]) -> dict[str, dict[str, int]]:
    """Count artifacts by source for each conference.

    Args:
        all_results: mapping of conference-year keys (e.g. "sosp2023")
            to lists of artifact dicts.

    Returns:
        Mapping of upper-cased conference name — plus an "overall"
        rollup bucket — to per-source counts. Each inner mapping also
        carries a "total" key counting artifacts that had at least one
        URL; artifacts with no URL are tallied under "unknown" only.
        (The previous ``dict[str, int]`` return annotation was wrong:
        the values are count dicts, not ints.)
    """
    stats = defaultdict(lambda: defaultdict(int))
    # Ensure the rollup bucket exists even when all_results is empty.
    stats["overall"] = defaultdict(int)

    for conf_year, artifacts in all_results.items():
        # Conference name is the leading alphabetic prefix of the key.
        match = re.match(r"^([a-zA-Z]+)", conf_year)
        if not match:
            continue
        conf_name = match.group(1).upper()

        for artifact in artifacts:
            urls = get_artifact_urls(artifact)
            # An artifact with several URLs may contribute to several
            # sources, but at most once to each.
            sources = {extract_source(url) for url in urls} if urls else {"unknown"}
            for source in sources:
                stats[conf_name][source] += 1
                stats["overall"][source] += 1
            if urls:
                stats[conf_name]["total"] += 1
                stats["overall"]["total"] += 1

    # Only the outer mapping is converted; inner values stay defaultdicts.
    return dict(stats)

count_sources_by_area(all_results: dict[str, list[dict]]) -> dict[str, int]

Count artifacts by source for systems vs security.

Source code in src/generators/generate_artifact_sources_table.py
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
def count_sources_by_area(all_results: dict[str, list[dict]]) -> dict:
    """Count artifacts by source for systems vs security conferences.

    Args:
        all_results: mapping of conference-year keys to lists of
            artifact dicts.

    Returns:
        Dict with keys "systems" and "security" (per-source count dicts,
        each with a "total" of artifacts that had at least one URL) plus
        "systems_no_source" / "security_no_source" (ints counting
        artifacts without any URL). (The previous ``dict[str, int]``
        return annotation was wrong: the values are a mix of dicts and
        ints.)
    """
    sys_sources = defaultdict(int)
    sec_sources = defaultdict(int)
    sys_no_source = 0
    sec_no_source = 0

    for conf_year, artifacts in all_results.items():
        # Conference name is the leading alphabetic prefix of the key.
        conf_name = re.match(r"^([a-zA-Z]+)", conf_year)
        if not conf_name:
            continue
        conf_name = conf_name.group(1).upper()

        area = conf_area(conf_name)
        if area == "systems":
            target_dict = sys_sources
            is_systems = True
        elif area == "security":
            target_dict = sec_sources
            is_systems = False
        else:
            # Unknown area: fall back to a keyword heuristic on the key,
            # defaulting to "systems" when nothing matches.
            if "security" in conf_year.lower():
                target_dict = sec_sources
                is_systems = False
            else:
                target_dict = sys_sources
                is_systems = True

        for artifact in artifacts:
            urls = get_artifact_urls(artifact)
            if urls:
                # Each distinct source counts at most once per artifact.
                sources = {extract_source(url) for url in urls}
                for source in sources:
                    target_dict[source] += 1
                target_dict["total"] += 1
            else:
                # Count artifacts without URLs separately
                if is_systems:
                    sys_no_source += 1
                else:
                    sec_no_source += 1

    return {
        "systems": dict(sys_sources),
        "security": dict(sec_sources),
        "systems_no_source": sys_no_source,
        "security_no_source": sec_no_source,
    }

count_sources_overall(all_results)

Count artifacts by source overall.

Source code in src/generators/generate_artifact_sources_table.py
217
218
219
220
221
222
223
224
225
226
227
228
229
230
def count_sources_overall(all_results):
    """Count artifacts by source across all conferences.

    Artifacts without any URL are tallied under "unknown" and excluded
    from the "total" count, mirroring the per-conference counters.

    Args:
        all_results: mapping of conference-year keys to lists of
            artifact dicts.

    Returns:
        Plain dict of source name -> count, plus a "total" key.
    """
    sources = defaultdict(int)

    # Only the artifact lists matter here; the conference keys are
    # unused, so iterate .values() instead of .items().
    for artifacts in all_results.values():
        for artifact in artifacts:
            urls = get_artifact_urls(artifact)
            # Each distinct source counts at most once per artifact.
            source_set = {extract_source(url) for url in urls} if urls else {"unknown"}
            for source in source_set:
                sources[source] += 1
            if urls:
                sources["total"] += 1

    return dict(sources)