Skip to content

generate_artifact_sources_table

src.generators.generate_artifact_sources_table

Generate artifact storage source statistics.

Counts artifacts by storage source (GitHub, Zenodo, Figshare, OSF, etc.) and creates both summary data and detailed CSV for visualization.

Usage

python generate_artifact_sources_table.py --conf_regex '.*20[12][0-9]' --output_dir ../acm-rep-2026-paper/reproducibility

extract_source(url)

Determine the source of an artifact from its URL.

Source code in src/generators/generate_artifact_sources_table.py
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
def extract_source(url):
    """Determine the hosting source of an artifact from its URL.

    Returns a human-readable source name (e.g. "GitHub", "Zenodo"),
    "unknown" for an empty/missing URL, and "Other" when no known host
    substring matches.

    Note: match order matters — "archive.org" and "dataverse" must be
    tested before the generic "archive" catch-all, and "doi.org" is
    checked last among known hosts so resolvable DOIs can be mapped to
    their underlying repository.
    """
    if not url:
        return "unknown"

    url_lower = url.lower()

    if "github.com" in url_lower or "github.io" in url_lower:
        return "GitHub"
    # "zenodo" already covers "zenodo.org" (and sandbox hosts); the old
    # second condition was redundant.
    if "zenodo" in url_lower:
        return "Zenodo"
    if "figshare" in url_lower:
        return "Figshare"
    if "osf.io" in url_lower:
        return "OSF"
    if "gitlab" in url_lower:
        return "GitLab"
    if "bitbucket" in url_lower:
        return "Bitbucket"
    # NOTE(review): arXiv URLs are grouped under "Archive.org" — confirm
    # this bucketing is intentional for the statistics.
    if "archive.org" in url_lower or "arxiv" in url_lower:
        return "Archive.org"
    if "dataverse" in url_lower:
        return "Dataverse"
    # Generic catch-all for other self-described archive sites.
    if "archive" in url_lower:
        return "Archive site"
    if "doi.org" in url_lower:
        # Try to resolve DOI to actual repository
        resolved = _resolve_doi_prefix(url_lower)
        return resolved if resolved else "DOI"
    return "Other"

get_artifact_url(artifact)

Extract the first valid URL from an artifact.

Source code in src/generators/generate_artifact_sources_table.py
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
def get_artifact_url(artifact):
    """Extract the first valid URL from an artifact."""
    # Preferred modern schema: "artifact_urls" holds the canonical list.
    canonical = artifact.get("artifact_urls", [])
    if isinstance(canonical, list):
        for raw in canonical:
            cleaned = _normalise_url(raw)
            if cleaned:
                return cleaned

    # Older records stored URLs under assorted per-source keys; take the
    # first one that normalizes to something usable.
    legacy_keys = (
        "repository_url",
        "artifact_url",
        "github_url",
        "second_repository_url",
        "bitbucket_url",
    )
    for key in legacy_keys:
        raw = artifact.get(key, "")
        if isinstance(raw, list):
            raw = raw[0] if raw else ""
        cleaned = _normalise_url(raw)
        if cleaned:
            return cleaned

    return None

get_artifact_urls(artifact)

Extract all normalized URLs from an artifact.

Source code in src/generators/generate_artifact_sources_table.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
def get_artifact_urls(artifact):
    """Extract all normalized URLs from an artifact."""
    collected = []

    # Preferred modern schema: "artifact_urls" holds the canonical list.
    canonical = artifact.get("artifact_urls", [])
    if isinstance(canonical, list):
        for raw in canonical:
            cleaned = _normalise_url(raw)
            if cleaned:
                collected.append(cleaned)

    # Old-format records: gather from the historical per-source keys,
    # but only when the canonical list yielded nothing.
    if not collected:
        legacy_keys = (
            "repository_url",
            "artifact_url",
            "github_url",
            "second_repository_url",
            "bitbucket_url",
        )
        for key in legacy_keys:
            raw = artifact.get(key, "")
            candidates = raw if isinstance(raw, list) else [raw]
            for item in candidates:
                cleaned = _normalise_url(item)
                if cleaned:
                    collected.append(cleaned)

    # Deduplicate while preserving first-seen order.
    return list(dict.fromkeys(collected))

count_sources_by_conference(all_results: dict[str, list[dict]]) -> dict[str, int]

Count artifacts by source for each conference.

Source code in src/generators/generate_artifact_sources_table.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
def count_sources_by_conference(all_results: dict[str, list[dict]]) -> dict[str, dict[str, int]]:
    """Count artifacts by source for each conference.

    Args:
        all_results: mapping of conference-year keys (e.g. "sosp2023")
            to lists of artifact dicts.

    Returns:
        Mapping of upper-cased conference name — plus an "overall"
        rollup bucket — to per-source counts. Each inner mapping also
        carries a "total" key counting artifacts that had at least one
        URL; artifacts with no URL are tallied under "unknown" only.
        (The previous ``dict[str, int]`` return annotation was wrong:
        the values are count dicts, not ints.)
    """
    stats = defaultdict(lambda: defaultdict(int))
    # Ensure the rollup bucket exists even when all_results is empty.
    stats["overall"] = defaultdict(int)

    for conf_year, artifacts in all_results.items():
        # Conference name is the leading alphabetic prefix of the key.
        match = re.match(r"^([a-zA-Z]+)", conf_year)
        if not match:
            continue
        conf_name = match.group(1).upper()

        for artifact in artifacts:
            urls = get_artifact_urls(artifact)
            # An artifact with several URLs may contribute to several
            # sources, but at most once to each.
            sources = {extract_source(url) for url in urls} if urls else {"unknown"}
            for source in sources:
                stats[conf_name][source] += 1
                stats["overall"][source] += 1
            if urls:
                stats[conf_name]["total"] += 1
                stats["overall"]["total"] += 1

    # Only the outer mapping is converted; inner values stay defaultdicts.
    return dict(stats)

count_sources_by_area(all_results: dict[str, list[dict]]) -> dict[str, int]

Count artifacts by source for systems vs security.

Source code in src/generators/generate_artifact_sources_table.py
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
def count_sources_by_area(all_results: dict[str, list[dict]]) -> dict:
    """Count artifacts by source for systems vs security conferences.

    Args:
        all_results: mapping of conference-year keys to lists of
            artifact dicts.

    Returns:
        Dict with keys "systems" and "security" (per-source count dicts,
        each with a "total" of artifacts that had at least one URL) plus
        "systems_no_source" / "security_no_source" (ints counting
        artifacts without any URL). (The previous ``dict[str, int]``
        return annotation was wrong: the values are a mix of dicts and
        ints.)
    """
    sys_sources = defaultdict(int)
    sec_sources = defaultdict(int)
    sys_no_source = 0
    sec_no_source = 0

    for conf_year, artifacts in all_results.items():
        # Conference name is the leading alphabetic prefix of the key.
        conf_name = re.match(r"^([a-zA-Z]+)", conf_year)
        if not conf_name:
            continue
        conf_name = conf_name.group(1).upper()

        area = conf_area(conf_name)
        if area == "systems":
            target_dict = sys_sources
            is_systems = True
        elif area == "security":
            target_dict = sec_sources
            is_systems = False
        else:
            # Unknown area: fall back to a keyword heuristic on the key,
            # defaulting to "systems" when nothing matches.
            if "security" in conf_year.lower():
                target_dict = sec_sources
                is_systems = False
            else:
                target_dict = sys_sources
                is_systems = True

        for artifact in artifacts:
            urls = get_artifact_urls(artifact)
            if urls:
                # Each distinct source counts at most once per artifact.
                sources = {extract_source(url) for url in urls}
                for source in sources:
                    target_dict[source] += 1
                target_dict["total"] += 1
            else:
                # Count artifacts without URLs separately
                if is_systems:
                    sys_no_source += 1
                else:
                    sec_no_source += 1

    return {
        "systems": dict(sys_sources),
        "security": dict(sec_sources),
        "systems_no_source": sys_no_source,
        "security_no_source": sec_no_source,
    }

count_sources_overall(all_results)

Count artifacts by source overall.

Source code in src/generators/generate_artifact_sources_table.py
217
218
219
220
221
222
223
224
225
226
227
228
229
230
def count_sources_overall(all_results):
    """Count artifacts by source across all conferences.

    Artifacts without any URL are tallied under "unknown" and excluded
    from the "total" count, mirroring the per-conference counters.

    Args:
        all_results: mapping of conference-year keys to lists of
            artifact dicts.

    Returns:
        Plain dict of source name -> count, plus a "total" key.
    """
    sources = defaultdict(int)

    # Only the artifact lists matter here; the conference keys are
    # unused, so iterate .values() instead of .items().
    for artifacts in all_results.values():
        for artifact in artifacts:
            urls = get_artifact_urls(artifact)
            # Each distinct source counts at most once per artifact.
            source_set = {extract_source(url) for url in urls} if urls else {"unknown"}
            for source in source_set:
                sources[source] += 1
            if urls:
                sources["total"] += 1

    return dict(sources)