Skip to content

generate_artifact_citations

src.generators.generate_artifact_citations

Generate artifact DOI citation counts via OpenAlex and Semantic Scholar and write per-artifact metadata.

Outputs

assets/data/artifact_citations.json, assets/data/artifact_citations_summary.json

Usage

python generate_artifact_citations.py --data_dir ../reprodb.github.io

is_artifact_doi(doi: str) -> bool

Check if DOI is from an artifact repository (not a paper publisher).

Source code in src/generators/generate_artifact_citations.py
64
65
66
67
68
def is_artifact_doi(doi: str) -> bool:
    """Tell whether ``doi`` belongs to an artifact repository rather than a paper publisher.

    The DOI is lowercased and matched against ``ALLOWED_ARTIFACT_DOI_PREFIXES``
    with ``str.startswith``. An empty or missing DOI is never an artifact DOI.
    """
    # Short-circuit on a falsy DOI so the prefix match only runs on real input.
    return bool(doi) and doi.lower().startswith(ALLOWED_ARTIFACT_DOI_PREFIXES)

fetch_zenodo_doi(record_id: str, cache: dict) -> str

Get DOI for a Zenodo record. Always returns the Zenodo DOI (10.5281/zenodo.{record_id}) to ensure we get artifact citations, not paper citations from DOIs that authors may have linked.

Source code in src/generators/generate_artifact_citations.py
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
def fetch_zenodo_doi(record_id: str, cache: dict) -> str:
    """
    Resolve the DOI of a Zenodo record, memoising the answer in ``cache``.

    The DOI is always built from the record ID (``10.5281/zenodo.{record_id}``)
    rather than read from the record's metadata, so that citation lookups hit
    the artifact itself and not a paper DOI the authors may have linked.

    Returns the lowercased Zenodo DOI when the records API call succeeds, or
    an empty string when the call raises. Either outcome is stored in
    ``cache`` under ``record_id``, so a failed lookup is not retried while
    that cache entry exists.
    """
    if record_id in cache:
        return cache[record_id]

    # The API response body is ignored; the call only serves as an existence probe.
    probe_url = f"https://zenodo.org/api/records/{record_id}"
    try:
        fetch_json(probe_url, timeout=30)
    except Exception:
        # Missing record or any API/network failure: remember "no DOI".
        resolved = ""
    else:
        resolved = f"10.5281/zenodo.{record_id}".lower()

    cache[record_id] = resolved
    return resolved

fetch_openalex_citing_dois(base_url: str, limit: int) -> tuple[list[str], bool]

Fetch citing DOIs from OpenAlex using the filter API. base_url should look like https://api.openalex.org/works?filter=cites:W12345 — it must already contain a query string, because the paging parameters are appended with "&".

Source code in src/generators/generate_artifact_citations.py
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
def fetch_openalex_citing_dois(base_url: str, limit: int) -> tuple[list[str], bool]:
    """
    Collect the DOIs of works returned by an OpenAlex filter query.

    ``base_url`` should be like: https://api.openalex.org/works?filter=cites:W12345
    (it must already carry a query string; paging parameters are appended
    with ``&``).

    Results are walked page by page (200 per page) with cursor paging, pausing
    0.2 seconds between pages. Each work's DOI is passed through
    ``normalize_doi`` and de-duplicated.

    Returns a ``(dois, truncated)`` pair: ``dois`` is the sorted list of
    unique normalised DOIs, and ``truncated`` is True when collection stopped
    because ``limit`` unique DOIs had been gathered, False when the cursor
    ran out first.
    """
    collected = set()
    page_cursor = "*"  # OpenAlex's marker for "start of the result set"
    while True:
        quoted_cursor = urllib.parse.quote(page_cursor, safe="")
        page = fetch_json(
            f"{base_url}&per_page=200&cursor={quoted_cursor}", timeout=25
        )

        for entry in page.get("results", []) or []:
            # Prefer the top-level DOI; fall back to the one nested under "ids".
            raw_doi = entry.get("doi")
            if not raw_doi:
                ids = entry.get("ids", {}) or {}
                raw_doi = ids.get("doi") or ""

            normalized = normalize_doi(raw_doi)
            if not normalized:
                continue

            collected.add(normalized)
            if len(collected) >= limit:
                # Cap reached: stop immediately and report truncation.
                return sorted(collected), True

        page_cursor = (page.get("meta", {}) or {}).get("next_cursor")
        if not page_cursor:
            # No further pages: the result set is complete.
            return sorted(collected), False

        # Brief pause between page requests.
        time.sleep(0.2)