Skip to content

repo_stats

src.models.aggregates.repo_stats

Repository statistics schemas.

Generated by generate_repo_stats.py.

- Detail: repo_stats.json
- Summary: repo_stats.yml

RepoStatsEntry

Bases: BaseModel

GitHub/Zenodo repository metrics for a single artifact (stars, forks, views, downloads).

Source code in src/models/aggregates/repo_stats.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
class RepoStatsEntry(BaseModel):
    """GitHub/Zenodo repository metrics for a single artifact (stars, forks, views, downloads)."""

    @model_validator(mode="before")
    @classmethod
    def _migrate_legacy_fields(cls, data: Any) -> Any:
        """Normalise pre-schema field names so old JSON is accepted.

        Handles the single prior format (pre-v0.1):
        - ``stars`` → ``github_stars``
        - ``forks`` → ``github_forks``
        - missing ``source`` → inferred from ``url``

        Args:
            data: Raw input handed to the model. Only ``dict`` inputs are
                migrated; anything else is passed through untouched.

        Returns:
            A migrated shallow copy when ``data`` is a dict, otherwise ``data``
            itself. The caller's dict is never modified.
        """
        if not isinstance(data, dict):
            return data

        # Work on a shallow copy: a "before" validator receives the caller's
        # own object, and popping keys from it would silently alter data the
        # caller may reuse (e.g. re-validating or re-serialising the raw JSON).
        migrated = dict(data)

        for legacy_key, current_key in (("stars", "github_stars"), ("forks", "github_forks")):
            if legacy_key in migrated:
                legacy_value = migrated.pop(legacy_key)
                # An explicitly supplied new-style key wins over the legacy one.
                migrated.setdefault(current_key, legacy_value)

        if "source" not in migrated:
            url = migrated.get("url")
            # A non-string URL is left for the ``url`` field validation to
            # reject; calling ``.lower()`` on it here would raise AttributeError
            # instead of a proper validation error.
            url_lower = url.lower() if isinstance(url, str) else ""
            if "zenodo" in url_lower:
                migrated["source"] = "zenodo"
            elif "figshare" in url_lower:
                migrated["source"] = "figshare"
            else:
                # Anything not recognised as Zenodo/figshare falls back to GitHub.
                migrated["source"] = "github"
        return migrated

    # --- Paper identification -------------------------------------------------
    conference: str = Field(description="Conference abbreviation, e.g. 'OSDI', 'USENIXSEC'.", examples=["OSDI"])
    year: int = Field(description="Publication year, e.g. 2023.", examples=[2023])
    title: str = Field(
        description="Full paper title associated with this repository.",
        examples=["Understanding and Detecting Software Upgrade Failures in Distributed Systems"],
    )

    # --- Artifact location ----------------------------------------------------
    url: str = Field(
        description="Repository or archive URL, e.g. 'https://github.com/org/repo'.",
        examples=["https://github.com/org/repo"],
    )
    source: Literal["github", "zenodo", "figshare", "unknown"] = Field(
        description="Platform hosting the artifact: 'github', 'zenodo', 'figshare', or 'unknown'.",
        examples=["github"],
    )

    # --- Platform metrics (optional, non-negative when present) ----------------
    github_stars: int | None = Field(
        default=None,
        ge=0,
        description="GitHub star count at time of collection. Null for non-GitHub platforms.",
        examples=[1250],
    )
    github_forks: int | None = Field(
        default=None,
        ge=0,
        description="GitHub fork count at time of collection. Null for non-GitHub platforms.",
        examples=[340],
    )
    zenodo_views: int | None = Field(
        default=None,
        ge=0,
        description="Zenodo page view count. Null for non-Zenodo platforms.",
        examples=[5000],
    )
    zenodo_downloads: int | None = Field(
        default=None,
        ge=0,
        description="Zenodo file download count. Null for non-Zenodo platforms.",
        examples=[2000],
    )

    # --- Optional repository metadata -----------------------------------------
    description: str | None = Field(
        default=None,
        max_length=120,
        description="Repository description from the hosting platform, truncated to 120 characters. Null if not available.",
        examples=["High-performance distributed key-value store"],
    )
    language: str | None = Field(
        default=None,
        description="Primary programming language as reported by the platform, e.g. 'Python', 'C++'. Null if not available.",
        examples=["Python"],
    )
    name: str | None = Field(
        default=None,
        description="Repository identifier, e.g. 'microsoft/nni'. Null for non-GitHub/GitLab platforms.",
        # The example must be a repository identifier, matching the description.
        examples=["microsoft/nni"],
    )
    pushed_at: str | None = Field(
        default=None,
        description="ISO 8601 timestamp of last push/update, e.g. '2024-07-15T10:30:00Z'. Null if not available.",
        examples=["2025-03-15T10:30:00Z"],
    )

    # Unknown keys are rejected rather than silently ignored.
    model_config = {"extra": "forbid"}

TopRepo

Bases: BaseModel

A top repository within a conference.

Source code in src/models/aggregates/repo_stats.py
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
class TopRepo(BaseModel):
    """A top repository within a conference."""

    # Every field is optional and defaults to None.
    name: str | None = Field(
        default=None,
        description="Repository identifier, e.g. 'microsoft/nni'. Null if unknown.",
        # The example must be a repository identifier, matching the description.
        examples=["microsoft/nni"],
    )
    url: str | None = Field(
        default=None,
        description="Repository URL, e.g. 'https://github.com/microsoft/nni'. Null if unknown.",
        examples=["https://github.com/org/repo"],
    )
    # Counts are constrained to be non-negative when present.
    stars: int | None = Field(
        default=None, ge=0, description="GitHub star count. Null if not available.", examples=[1250]
    )
    forks: int | None = Field(
        default=None, ge=0, description="GitHub fork count. Null if not available.", examples=[340]
    )
    language: str | None = Field(
        default=None,
        description="Primary programming language, e.g. 'Python'. Null if unknown.",
        examples=["Python"],
    )
    description: str | None = Field(
        default=None,
        description="Short repository description. Null if not set.",
        examples=["High-performance distributed key-value store"],
    )

ConferenceYearStats

Bases: BaseModel

Repository metrics for a single conference-year.

Source code in src/models/aggregates/repo_stats.py
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
class ConferenceYearStats(BaseModel):
    """Repository metrics for a single conference-year."""

    # Year the figures below belong to (required).
    year: int = Field(
        description="Publication year, e.g. 2023.",
        examples=[2023],
    )

    # Required totals; each is constrained to be non-negative.
    github_repos: int = Field(
        ge=0,
        description="Number of GitHub repositories for this conference-year.",
        examples=[180],
    )
    total_stars: int = Field(
        ge=0,
        description="Sum of GitHub stars across all repos for this conference-year.",
        examples=[45000],
    )
    total_forks: int = Field(
        ge=0,
        description="Sum of GitHub forks across all repos for this conference-year.",
        examples=[12000],
    )

    # Required per-repository means; each is constrained to be non-negative.
    avg_stars: float = Field(
        ge=0,
        description="Mean star count per repository for this conference-year.",
        examples=[250.5],
    )
    avg_forks: float = Field(
        ge=0,
        description="Mean fork count per repository for this conference-year.",
        examples=[65.3],
    )

ConferenceRepoStats

Bases: BaseModel

Repository metrics grouped by conference.

Source code in src/models/aggregates/repo_stats.py
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
class ConferenceRepoStats(BaseModel):
    """Repository metrics grouped by conference."""

    # The example must be a conference abbreviation, matching the description.
    name: str = Field(description="Conference abbreviation, e.g. 'ACSAC', 'OSDI'.", examples=["OSDI"])

    # Required totals, means and maxima; each is constrained to be non-negative.
    github_repos: int = Field(
        ge=0, description="Total GitHub repositories across all years for this conference.", examples=[180]
    )
    total_stars: int = Field(
        ge=0, description="Sum of GitHub stars across all repos for this conference.", examples=[45000]
    )
    total_forks: int = Field(
        ge=0, description="Sum of GitHub forks across all repos for this conference.", examples=[12000]
    )
    avg_stars: float = Field(ge=0, description="Mean star count per repository for this conference.", examples=[250.5])
    avg_forks: float = Field(ge=0, description="Mean fork count per repository for this conference.", examples=[65.3])
    max_stars: int = Field(ge=0, description="Maximum star count among all repos for this conference.", examples=[5200])
    max_forks: int = Field(ge=0, description="Maximum fork count among all repos for this conference.", examples=[1300])

    # Distribution statistics; each defaults to 0 when omitted from the input.
    median_stars: float = Field(
        default=0, ge=0, description="Median star count per repository for this conference.", examples=[25.0]
    )
    median_forks: float = Field(
        default=0, ge=0, description="Median fork count per repository for this conference.", examples=[6.0]
    )
    p25_stars: float = Field(
        default=0, ge=0, description="25th percentile of stars for this conference.", examples=[8.0]
    )
    p75_stars: float = Field(
        default=0, ge=0, description="75th percentile of stars for this conference.", examples=[80.0]
    )
    p25_forks: float = Field(
        default=0, ge=0, description="25th percentile of forks for this conference.", examples=[2.0]
    )
    p75_forks: float = Field(
        default=0, ge=0, description="75th percentile of forks for this conference.", examples=[18.0]
    )

    # Nested breakdowns; both default to an empty list.
    years: list[ConferenceYearStats] = Field(
        default_factory=list,
        description="Per-year repository metrics, ordered chronologically.",
        # Each element is a ConferenceYearStats object, not a bare year, so the
        # example mirrors that model's fields.
        examples=[
            [
                {
                    "year": 2023,
                    "github_repos": 180,
                    "total_stars": 45000,
                    "total_forks": 12000,
                    "avg_stars": 250.5,
                    "avg_forks": 65.3,
                }
            ]
        ],
    )
    top_repos: list[TopRepo] = Field(
        default_factory=list, description="Highlighted repositories for this conference, sorted by stars descending."
    )

    # Unknown keys are rejected rather than silently ignored.
    model_config = {"extra": "forbid"}

YearRepoStats

Bases: BaseModel

Repository metrics grouped by year.

Source code in src/models/aggregates/repo_stats.py
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
class YearRepoStats(BaseModel):
    """Repository metrics grouped by year."""

    # Required: the year plus the repository and star totals.
    year: int = Field(
        description="Publication year, e.g. 2023.",
        examples=[2023],
    )
    github_repos: int = Field(
        ge=0,
        description="Number of GitHub repositories for this year across all conferences.",
        examples=[180],
    )
    total_stars: int = Field(
        ge=0,
        description="Sum of GitHub stars across all repos for this year.",
        examples=[45000],
    )

    # Optional: the fork total may be absent and then stays None.
    total_forks: int | None = Field(
        default=None,
        ge=0,
        description="Sum of GitHub forks. Null if not tracked for this year.",
        examples=[12000],
    )

    # Maxima default to 0 when omitted.
    max_stars: int = Field(
        default=0,
        ge=0,
        description="Maximum star count among all repos for this year.",
        examples=[5200],
    )
    max_forks: int = Field(
        default=0,
        ge=0,
        description="Maximum fork count among all repos for this year.",
        examples=[1300],
    )

    # Means are optional and stay None when omitted.
    avg_stars: float | None = Field(
        default=None,
        ge=0,
        description="Mean star count per repository. Null if not computed.",
        examples=[250.5],
    )
    avg_forks: float | None = Field(
        default=None,
        ge=0,
        description="Mean fork count per repository. Null if not computed.",
        examples=[65.3],
    )

    # Distribution statistics; each defaults to 0 when omitted.
    median_stars: float = Field(
        default=0,
        ge=0,
        description="Median star count per repository for this year.",
        examples=[25.0],
    )
    median_forks: float = Field(
        default=0,
        ge=0,
        description="Median fork count per repository for this year.",
        examples=[6.0],
    )
    p25_stars: float = Field(
        default=0,
        ge=0,
        description="25th percentile of stars for this year.",
        examples=[8.0],
    )
    p75_stars: float = Field(
        default=0,
        ge=0,
        description="75th percentile of stars for this year.",
        examples=[80.0],
    )
    p25_forks: float = Field(
        default=0,
        ge=0,
        description="25th percentile of forks for this year.",
        examples=[2.0],
    )
    p75_forks: float = Field(
        default=0,
        ge=0,
        description="75th percentile of forks for this year.",
        examples=[18.0],
    )

    # Unknown keys are rejected rather than silently ignored.
    model_config = {"extra": "forbid"}

OverallStats

Bases: BaseModel

Global repository metrics.

Source code in src/models/aggregates/repo_stats.py
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
class OverallStats(BaseModel):
    """Global repository metrics."""

    # All fields are required; numeric fields are constrained to be non-negative.

    # --- GitHub totals and maxima ----------------------------------------------
    github_repos: int = Field(
        ge=0, description="Total GitHub repositories across all conferences, e.g. 1344.", examples=[180]
    )
    total_stars: int = Field(
        ge=0, description="Sum of GitHub stars across all repositories, e.g. 126724.", examples=[45000]
    )
    total_forks: int = Field(ge=0, description="Sum of GitHub forks across all repositories.", examples=[12000])
    max_stars: int = Field(ge=0, description="Maximum star count among all repositories.", examples=[5200])
    max_forks: int = Field(ge=0, description="Maximum fork count among all repositories.", examples=[1300])

    # --- Zenodo totals ---------------------------------------------------------
    zenodo_repos: int = Field(ge=0, description="Total Zenodo-hosted artifact records, e.g. 902.", examples=[45])
    total_views: int = Field(ge=0, description="Sum of Zenodo page views across all records.", examples=[85000])
    total_downloads: int = Field(ge=0, description="Sum of Zenodo file downloads across all records.", examples=[32000])

    # --- GitHub distribution statistics ----------------------------------------
    avg_stars: float = Field(ge=0, description="Mean GitHub star count per repository, e.g. 94.3.", examples=[250.5])
    avg_forks: float = Field(ge=0, description="Mean GitHub fork count per repository, e.g. 12.3.", examples=[65.3])
    median_stars: float = Field(ge=0, description="Median star count per repository.", examples=[25.0])
    median_forks: float = Field(ge=0, description="Median fork count per repository.", examples=[6.0])
    p25_stars: float = Field(ge=0, description="25th percentile of stars.", examples=[8.0])
    p75_stars: float = Field(ge=0, description="75th percentile of stars.", examples=[80.0])
    p25_forks: float = Field(ge=0, description="25th percentile of forks.", examples=[2.0])
    p75_forks: float = Field(ge=0, description="75th percentile of forks.", examples=[18.0])

    # --- Collection time -------------------------------------------------------
    # Stored as a plain string. The documented format uses a space separator and
    # a literal "UTC" suffix, which is not ISO 8601, so the description names
    # the actual format instead of claiming ISO 8601.
    last_updated: str = Field(
        description=(
            "UTC timestamp of when the stats were collected, formatted as 'YYYY-MM-DD HH:MM:SS UTC', "
            "e.g. '2026-04-27 21:26:09 UTC'."
        ),
        examples=["2026-04-27 21:26:09 UTC"],
    )

    # Unknown keys are rejected rather than silently ignored.
    model_config = {"extra": "forbid"}

AreaRepoStats

Bases: BaseModel

Repository metrics for a single research area (e.g. systems, security).

Source code in src/models/aggregates/repo_stats.py
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
class AreaRepoStats(BaseModel):
    """Repository metrics for a single research area (e.g. systems, security)."""

    # All fields are required; numeric fields are constrained to be non-negative.
    name: str = Field(
        description="Area name, e.g. 'systems' or 'security'.",
        examples=["systems"],
    )

    # Totals for the area.
    github_repos: int = Field(
        ge=0,
        description="Total GitHub repositories in this area.",
        examples=[700],
    )
    total_stars: int = Field(
        ge=0,
        description="Sum of stars across all repos in this area.",
        examples=[60000],
    )
    total_forks: int = Field(
        ge=0,
        description="Sum of forks across all repos in this area.",
        examples=[15000],
    )

    # Distribution statistics for stars and forks.
    median_stars: float = Field(
        ge=0,
        description="Median star count per repository.",
        examples=[25.0],
    )
    median_forks: float = Field(
        ge=0,
        description="Median fork count per repository.",
        examples=[6.0],
    )
    p25_stars: float = Field(
        ge=0,
        description="25th percentile of stars.",
        examples=[8.0],
    )
    p75_stars: float = Field(
        ge=0,
        description="75th percentile of stars.",
        examples=[80.0],
    )
    p25_forks: float = Field(
        ge=0,
        description="25th percentile of forks.",
        examples=[2.0],
    )
    p75_forks: float = Field(
        ge=0,
        description="75th percentile of forks.",
        examples=[18.0],
    )

    # Only the star maximum is tracked per area; there is no max_forks field here.
    max_stars: int = Field(
        ge=0,
        description="Maximum star count in this area.",
        examples=[5200],
    )

    # Unknown keys are rejected rather than silently ignored.
    model_config = {"extra": "forbid"}

RepoStatsSummary

Bases: BaseModel

Aggregated repository metrics: overall stats, per-conference breakdowns, and yearly trends.

Source code in src/models/aggregates/repo_stats.py
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
class RepoStatsSummary(BaseModel):
    """Aggregated repository metrics: overall stats, per-conference breakdowns, and yearly trends."""

    # Required sections: global figures, then the per-conference and per-year views.
    overall: OverallStats = Field(
        description="Global repository metrics across all conferences and years.",
    )
    by_conference: list[ConferenceRepoStats] = Field(
        description="Repository metrics grouped by conference, each with per-year breakdown.",
    )
    by_year: list[YearRepoStats] = Field(
        description="Repository metrics grouped by year across all conferences, ordered chronologically.",
    )

    # Optional section: defaults to an empty list when the input has no area breakdown.
    by_area: list[AreaRepoStats] = Field(
        default_factory=list,
        description="Repository metrics grouped by research area (systems, security).",
    )

    # Unknown keys are rejected rather than silently ignored.
    model_config = {"extra": "forbid"}