generate_repo_stats¶

`src.generators.repository.generate_repo_stats` ¶

Generate repository statistics (stars, forks, etc.) for the website.

Collects stats from GitHub/Zenodo/Figshare for all scraped artifacts and writes:

- `_data/repo_stats.yml` — per-conference/year aggregates (for the website)
- `assets/data/repo_stats_detail.json` — per-repo detail (for analysis/figures)

Usage

python generate_repo_stats.py --conf_regex '.*20[12][0-9]' --output_dir ../reprodb.github.io/src

`collect_stats_for_results(results, url_keys=None)` ¶

Collect repository stats for all artifacts.

Expands multi-valued URL fields, deduplicates URLs, then fetches GitHub/Zenodo/Figshare stats in parallel. Returns a list of per-URL stat dicts.

Source code in src/generators/repository/generate_repo_stats.py

def collect_stats_for_results(results, url_keys=None):
    """Collect repository stats for all artifacts.

    Expands multi-valued URL fields, deduplicates URLs, then fetches
    GitHub/Zenodo/Figshare stats in parallel.

    Args:
        results: Mapping of a conference/year key to a list of artifact
            dicts.  Each artifact may carry single-valued URL fields (the
            names in ``url_keys``) and the list-valued fields
            ``artifact_urls`` / ``additional_urls``.
        url_keys: Names of the single-valued URL fields to scan.  Defaults
            to the repository/artifact/GitHub/Bitbucket fields listed below.

    Returns:
        A list of per-URL stat dicts.  Every dict has ``conference``,
        ``year``, ``title``, ``url`` and ``source`` plus whatever fields the
        source-specific fetcher returned.  A GitHub repository shared by
        several papers yields exactly one entry per paper.  An empty list is
        returned when none of ``url_keys`` occurs in the data.
    """
    if url_keys is None:
        url_keys = ["repository_url", "artifact_url", "github_url", "second_repository_url", "bitbucket_url"]

    list_fields = ("artifact_urls", "additional_urls")

    # First pass: extract ALL URLs from list-valued fields and create one
    # expanded artifact entry per URL, so stats are collected for every
    # artifact location and not just the first.
    expanded_artifacts = {}
    for conf_year, artifacts in results.items():
        expanded = expanded_artifacts[conf_year] = []
        for artifact in artifacts:
            all_urls_by_key = {}

            # Single-valued URL fields.
            for url_key in url_keys:
                if artifact.get(url_key):
                    all_urls_by_key[url_key] = [artifact[url_key]]

            # List-valued fields are folded into the matching singular key
            # (artifact_urls -> artifact_url).
            # NOTE(review): additional_urls maps to "additional_url", which is
            # not in the default url_keys, so those URLs are only scanned when
            # the caller passes that key explicitly - confirm this is intended.
            for list_key in list_fields:
                values = artifact.get(list_key)
                if not isinstance(values, list):
                    continue
                flat_key = list_key.rstrip("s")
                for url in values:
                    if isinstance(url, str) and url:
                        bucket = all_urls_by_key.setdefault(flat_key, [])
                        if url not in bucket:
                            bucket.append(url)

            if not all_urls_by_key:
                # No URLs found, keep the original artifact untouched.
                expanded.append(artifact)
                continue

            # One shallow copy per URL, without the list-valued fields.
            base = {k: v for k, v in artifact.items() if k not in list_fields}
            for url_key, urls in all_urls_by_key.items():
                for url in urls:
                    artifact_copy = dict(base)
                    artifact_copy[url_key] = url
                    expanded.append(artifact_copy)

    results = expanded_artifacts

    # Restrict url_keys to those that actually appear in the data.
    present_keys = set()
    for artifacts in results.values():
        for artifact in artifacts:
            for key in url_keys:
                if artifact.get(key):
                    present_keys.add(key)
    url_keys = [k for k in url_keys if k in present_keys]
    if not url_keys:
        logger.warning("  Warning: No URL keys found in artifact data. No repository stats to collect.")
        return []
    logger.info(f"  Scanning URL fields: {', '.join(url_keys)}")

    # Check which URLs exist.  The loop below reads a "<url_key>_exists" flag
    # from each artifact (presumably set by this call - TODO confirm).
    results, _, _ = check_artifact_exists(results, url_keys)

    # Build the deduplicated list of (url, conf_name, year, title) tuples to
    # fetch.  GitHub URLs are deduplicated at the owner/repo level so that
    # different branches/tags/paths of one repo do not create separate stats
    # entries; every other URL is deduplicated by its full URL.
    fetch_tasks = []
    seen_urls: set[str] = set()  # normalized keys used for dedup
    # All papers that reference a given normalized GitHub repo, so that each
    # of them can be represented in the per-conference stats.
    github_repo_papers: dict[str, list[tuple[str, str, int, str]]] = defaultdict(list)
    # (repo, conf, year, title) combinations already registered.  The expansion
    # above yields several copies of the same artifact (each still carrying all
    # of its URL keys); without this guard one paper would be recorded - and
    # later emitted - once per copy.
    registered_papers: set[tuple[str, str, int, str]] = set()
    for conf_year, artifacts in results.items():
        conf_name, year = extract_conference_name(conf_year)
        if year is None:
            continue
        for artifact in artifacts:
            for url_key in url_keys:
                url = artifact.get(url_key, "")
                if not artifact.get(f"{url_key}_exists", False) or not url:
                    continue

                is_github = "github.com/" in url
                if is_github:
                    if _is_excluded_repo(url):
                        logger.debug(f"  Excluded repo: {url}")
                        continue
                    norm = _normalise_github_repo_url(url) or url.rstrip("/")
                else:
                    norm = url.rstrip("/")

                title = artifact.get("title", "Unknown")
                if is_github:
                    paper_key = (norm, conf_name, year, title)
                    if paper_key not in registered_papers:
                        registered_papers.add(paper_key)
                        github_repo_papers[norm].append((url, conf_name, year, title))

                if norm in seen_urls:
                    continue
                seen_urls.add(norm)
                fetch_tasks.append((url, conf_name, year, title))

    max_workers = 8
    logger.info(f"  Collecting stats for {len(fetch_tasks)} unique URLs ({max_workers} workers)")

    def _fetch_stats(url):
        """Fetch stats for one URL and return ``(stats, source)``.

        Runs on worker threads.  Any exception raised by a fetcher is logged
        and reported as ``(None, "unknown")`` so that a single bad URL cannot
        abort the whole run.
        """
        try:
            if "github.com/" in url:
                return github_stats(url), "github"
            if "zenodo" in url:
                return zenodo_stats(url), "zenodo"
            if "figshare" in url:
                return figshare_stats(url), "figshare"
        except Exception as e:
            logger.error(f"  Error collecting stats for {url}: {e}")
        return None, "unknown"

    def _make_entry(conf_name, year, title, url, source, stats):
        """Combine the paper identity with the fetched stats into one dict."""
        entry = {
            "conference": conf_name,
            "year": year,
            "title": title,
            "url": url,
            "source": source,
        }
        entry.update(stats)
        return entry

    all_stats = []
    stats_collected = 0
    # GitHub URLs discovered through Zenodo/Figshare "linked_github_urls".
    discovered_github: list[tuple[str, str, int, str]] = []  # (url, conf, year, title)
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {pool.submit(_fetch_stats, url): (url, conf, yr, title) for url, conf, yr, title in fetch_tasks}
        for i, future in enumerate(as_completed(pending), 1):
            url, conf_name, year, title = pending[future]
            stats, source = future.result()
            if stats:
                stats_collected += 1
                entry = _make_entry(conf_name, year, title, url, source, stats)
                all_stats.append(entry)

                # For GitHub repos used by multiple papers, emit additional
                # entries so each paper is represented in per-conference stats.
                if source == "github":
                    # Same key (including the fallback) as used at registration.
                    norm = _normalise_github_repo_url(url) or url.rstrip("/")
                    for extra_url, extra_conf, extra_yr, extra_title in github_repo_papers.get(norm, ()):
                        if extra_title == title and extra_conf == conf_name and extra_yr == year:
                            continue  # skip the primary entry already added
                        extra_entry = dict(entry)
                        extra_entry["conference"] = extra_conf
                        extra_entry["year"] = extra_yr
                        extra_entry["title"] = extra_title
                        extra_entry["url"] = extra_url
                        all_stats.append(extra_entry)

                # Collect any linked GitHub URLs discovered from Zenodo/Figshare.
                # "or []" tolerates a fetcher reporting the field as None.
                for gh_url in stats.get("linked_github_urls") or []:
                    gh_norm = _normalise_github_repo_url(gh_url) or gh_url.rstrip("/")
                    if gh_norm not in seen_urls and not _is_excluded_repo(gh_url):
                        seen_urls.add(gh_norm)
                        discovered_github.append((gh_url, conf_name, year, title))
            if i % 100 == 0 or i == len(fetch_tasks):
                logger.info(f"  Progress: {i}/{len(fetch_tasks)} URLs fetched, {stats_collected} stats collected")

    # Second pass: fetch GitHub stats for repos discovered via Zenodo/Figshare links.
    if discovered_github:
        logger.info(
            f"  Discovered {len(discovered_github)} additional GitHub repos from Zenodo/Figshare linked_github_urls"
        )
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            pending2 = {
                pool.submit(_fetch_stats, url): (url, conf, yr, title) for url, conf, yr, title in discovered_github
            }
            for future in as_completed(pending2):
                url, conf_name, year, title = pending2[future]
                stats, source = future.result()
                if stats:
                    stats_collected += 1
                    all_stats.append(_make_entry(conf_name, year, title, url, source, stats))

    return all_stats

`aggregate_stats(all_stats)` ¶

Aggregate per-conference and per-year statistics.

When the same GitHub repository (by name, i.e. owner/repo) appears for multiple papers, each paper gets its own entry in the detail list but the repo's stars/forks are counted only once in aggregate totals (per-conference, per-year, and overall).

Source code in src/generators/repository/generate_repo_stats.py

def _rounded_mean(total, count):
    """Return ``total / count`` rounded to one decimal, or 0 when count is 0."""
    return round(total / count, 1) if count > 0 else 0


def _quartile_summary(values):
    """Return ``(median, p25, p75)`` of *values*, each rounded to one decimal.

    An empty sequence yields zeros.  A single value is reported for all three
    figures because ``statistics.quantiles`` needs at least two data points.
    """
    if not values:
        return 0, 0, 0
    median = round(statistics.median(values), 1)
    if len(values) < 2:
        return median, median, median
    quartiles = statistics.quantiles(values, n=4)
    return median, round(quartiles[0], 1), round(quartiles[2], 1)


def _record_repo(bucket, stars, forks, star_values, fork_values):
    """Add one unique GitHub repo to the totals/maxima held in *bucket*."""
    bucket["github_repos"] += 1
    bucket["total_stars"] += stars
    bucket["total_forks"] += forks
    bucket["max_stars"] = max(bucket["max_stars"], stars)
    bucket["max_forks"] = max(bucket["max_forks"], forks)
    star_values.append(stars)
    fork_values.append(forks)


def aggregate_stats(all_stats):
    """Aggregate per-conference and per-year statistics.

    When the same GitHub repository (by ``name``, i.e. ``owner/repo``)
    appears for multiple papers, each paper gets its own entry in the
    detail list but the repo's stars/forks are counted only **once** in
    aggregate totals, averages, medians and quartiles (per-conference,
    per-conference-year, per-year, and overall).

    Args:
        all_stats: List of stat dicts.  Each needs ``conference``, ``year``
            and ``source``; entries whose source is ``"github"`` or
            ``"zenodo"`` are aggregated, any other source is ignored.

    Returns:
        A dict with four keys: ``overall`` (global totals, always containing
        the avg/median/p25/p75 fields plus a ``last_updated`` UTC timestamp),
        ``by_conference`` (list sorted by conference name, each with a
        ``years`` breakdown and ``top_repos``), ``by_year`` (list sorted by
        year) and ``all_github_repos`` (the per-paper GitHub detail entries).
    """
    # Per-conference accumulators.  Keys starting with "_" are bookkeeping
    # only and are not copied into the returned structure.
    by_conf = defaultdict(
        lambda: {
            "github_repos": 0,
            "total_stars": 0,
            "total_forks": 0,
            "max_stars": 0,
            "max_forks": 0,
            "zenodo_repos": 0,
            "total_views": 0,
            "total_downloads": 0,
            "years": defaultdict(
                lambda: {"github_repos": 0, "stars": 0, "forks": 0, "_star_values": [], "_fork_values": []}
            ),
            "all_github_entries": [],
            "_seen_repos": set(),  # repos already counted for this conf
            "_seen_year_repos": set(),  # "<repo>@<year>" already counted
            "_star_values": [],  # one value per unique repo
            "_fork_values": [],
        }
    )

    by_year = defaultdict(
        lambda: {
            "github_repos": 0,
            "total_stars": 0,
            "total_forks": 0,
            "max_stars": 0,
            "max_forks": 0,
            "zenodo_repos": 0,
            "total_views": 0,
            "total_downloads": 0,
            "_seen_repos": set(),
            "_star_values": [],
            "_fork_values": [],
        }
    )

    # The percentile keys are pre-seeded so the schema is the same whether or
    # not any GitHub repo was found.
    overall = {
        "github_repos": 0,
        "total_stars": 0,
        "total_forks": 0,
        "max_stars": 0,
        "max_forks": 0,
        "zenodo_repos": 0,
        "total_views": 0,
        "total_downloads": 0,
        "avg_stars": 0,
        "avg_forks": 0,
        "median_stars": 0,
        "median_forks": 0,
        "p25_stars": 0,
        "p75_stars": 0,
        "p25_forks": 0,
        "p75_forks": 0,
    }
    overall_seen_repos: set[str] = set()
    overall_star_values: list[int] = []
    overall_fork_values: list[int] = []

    for s in all_stats:
        conf = s["conference"]
        year = s["year"]
        source = s["source"]

        if source == "github":
            stars = s.get("github_stars", 0) or 0
            forks = s.get("github_forks", 0) or 0
            repo_name = s.get("name", "") or ""
            # Keep the original URL for display (it may carry tag/branch info).
            url = s.get("url", "")

            conf_bucket = by_conf[conf]

            # Always add to the detail list (one entry per paper x repo).
            conf_bucket["all_github_entries"].append(
                {
                    "title": s.get("title", "Unknown"),
                    "url": url,
                    "conference": conf,
                    "year": year,
                    "area": _conf_area(conf),
                    "stars": stars,
                    "forks": forks,
                    "description": (s.get("description", "") or "")[:120],
                    "language": s.get("language", "") or "",
                    "name": repo_name,
                    "pushed_at": s.get("pushed_at", ""),
                }
            )

            # Only count stars/forks once per unique repo in aggregates.
            repo_key = repo_name.lower() if repo_name else url.rstrip("/")

            if repo_key not in conf_bucket["_seen_repos"]:
                conf_bucket["_seen_repos"].add(repo_key)
                _record_repo(conf_bucket, stars, forks, conf_bucket["_star_values"], conf_bucket["_fork_values"])

            year_key = f"{repo_key}@{year}"
            if year_key not in conf_bucket["_seen_year_repos"]:
                conf_bucket["_seen_year_repos"].add(year_key)
                conf_year = conf_bucket["years"][year]
                conf_year["github_repos"] += 1
                conf_year["stars"] += stars
                conf_year["forks"] += forks
                conf_year["_star_values"].append(stars)
                conf_year["_fork_values"].append(forks)

            year_bucket = by_year[year]
            if repo_key not in year_bucket["_seen_repos"]:
                year_bucket["_seen_repos"].add(repo_key)
                _record_repo(year_bucket, stars, forks, year_bucket["_star_values"], year_bucket["_fork_values"])

            if repo_key not in overall_seen_repos:
                overall_seen_repos.add(repo_key)
                _record_repo(overall, stars, forks, overall_star_values, overall_fork_values)

        elif source == "zenodo":
            views = s.get("zenodo_views", 0) or 0
            downloads = s.get("zenodo_downloads", 0) or 0

            # Zenodo records are not deduplicated: every entry counts.
            for bucket in (by_conf[conf], by_year[year], overall):
                bucket["zenodo_repos"] += 1
                bucket["total_views"] += views
                bucket["total_downloads"] += downloads

    overall["avg_stars"] = _rounded_mean(overall["total_stars"], overall["github_repos"])
    overall["avg_forks"] = _rounded_mean(overall["total_forks"], overall["github_repos"])
    overall["median_stars"], overall["p25_stars"], overall["p75_stars"] = _quartile_summary(overall_star_values)
    overall["median_forks"], overall["p25_forks"], overall["p75_forks"] = _quartile_summary(overall_fork_values)

    # Convert the accumulators to a serializable format.
    conf_stats = []
    for conf_name in sorted(by_conf):
        d = by_conf[conf_name]
        # Medians/quartiles use the per-unique-repo values so they agree with
        # the deduplicated totals and averages.
        median_stars, p25_stars, p75_stars = _quartile_summary(d["_star_values"])
        median_forks, p25_forks, p75_forks = _quartile_summary(d["_fork_values"])

        year_list = []
        for yr in sorted(d["years"]):
            yd = d["years"][yr]
            yr_median_stars, yr_p25_stars, yr_p75_stars = _quartile_summary(yd["_star_values"])
            yr_median_forks, yr_p25_forks, yr_p75_forks = _quartile_summary(yd["_fork_values"])
            year_list.append(
                {
                    "year": yr,
                    "github_repos": yd["github_repos"],
                    "total_stars": yd["stars"],
                    "total_forks": yd["forks"],
                    "avg_stars": _rounded_mean(yd["stars"], yd["github_repos"]),
                    "avg_forks": _rounded_mean(yd["forks"], yd["github_repos"]),
                    "median_stars": yr_median_stars,
                    "median_forks": yr_median_forks,
                    "p25_stars": yr_p25_stars,
                    "p75_stars": yr_p75_stars,
                    "p25_forks": yr_p25_forks,
                    "p75_forks": yr_p75_forks,
                }
            )

        # Top 5 detail entries by stars (per paper, so a shared repo may repeat).
        top_repos = sorted(d["all_github_entries"], key=lambda x: x["stars"], reverse=True)[:5]
        conf_stats.append(
            {
                "name": conf_name,
                "github_repos": d["github_repos"],
                "total_stars": d["total_stars"],
                "total_forks": d["total_forks"],
                "avg_stars": _rounded_mean(d["total_stars"], d["github_repos"]),
                "avg_forks": _rounded_mean(d["total_forks"], d["github_repos"]),
                "median_stars": median_stars,
                "median_forks": median_forks,
                "p25_stars": p25_stars,
                "p75_stars": p75_stars,
                "p25_forks": p25_forks,
                "p75_forks": p75_forks,
                "max_stars": d["max_stars"],
                "max_forks": d["max_forks"],
                "years": year_list,
                "top_repos": top_repos,
            }
        )

    year_stats = []
    for yr in sorted(by_year):
        d = by_year[yr]
        median_stars, yr_p25_stars, yr_p75_stars = _quartile_summary(d["_star_values"])
        median_forks, yr_p25_forks, yr_p75_forks = _quartile_summary(d["_fork_values"])
        year_stats.append(
            {
                "year": yr,
                "github_repos": d["github_repos"],
                "total_stars": d["total_stars"],
                "total_forks": d["total_forks"],
                "avg_stars": _rounded_mean(d["total_stars"], d["github_repos"]),
                "avg_forks": _rounded_mean(d["total_forks"], d["github_repos"]),
                "median_stars": median_stars,
                "median_forks": median_forks,
                "p25_stars": yr_p25_stars,
                "p75_stars": yr_p75_stars,
                "p25_forks": yr_p25_forks,
                "p75_forks": yr_p75_forks,
                "max_stars": d["max_stars"],
                "max_forks": d["max_forks"],
            }
        )

    overall["last_updated"] = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")

    # Per-repo detail: all GitHub entries with individual star/fork counts.
    all_github_detail = []
    for conf_name in sorted(by_conf):
        all_github_detail.extend(by_conf[conf_name]["all_github_entries"])

    return {
        "overall": overall,
        "by_conference": conf_stats,
        "by_year": year_stats,
        "all_github_repos": all_github_detail,
    }

generate_repo_stats¶

src.generators.repository.generate_repo_stats ¶

collect_stats_for_results(results, url_keys=None) ¶

aggregate_stats(all_stats) ¶

`src.generators.repository.generate_repo_stats` ¶

`collect_stats_for_results(results, url_keys=None)` ¶

`aggregate_stats(all_stats)` ¶