Skip to content

ReproDB Pipeline

citation_apis

reprodb-pipeline

citation_apis¶

`src.utils.apis.citation_apis` ¶

Shared citation-lookup helpers for OpenAlex, Semantic Scholar, and DOI handling.

Every citation generator should import from here rather than reimplementing its own API callers, DOI extractors, or rate-limiting constants.

`extract_paper_doi(paper_url: str | None) -> str` ¶

Extract a bare DOI from a paper_url value (https://doi.org/… or bare).

Returns "" if none found.

Source code in src/utils/apis/citation_apis.py

def extract_paper_doi(paper_url: str | None) -> str:
    """Extract a bare DOI from a paper_url value (``https://doi.org/…`` or bare).

    Returns ``""`` if none found.
    """
    if not paper_url:
        return ""
    doi = paper_url.strip()
    for prefix in _DOI_PREFIXES_STRIP:
        if doi.lower().startswith(prefix):
            doi = doi[len(prefix) :]
            break
    if doi.startswith("10.") and "/" in doi:
        return doi.rstrip(".,);")
    return ""

`extract_doi(url: str | None) -> str` ¶

Extract a DOI from an arbitrary URL using regex.

Handles Zenodo record URLs as a special case. Returns "" if none found.

Source code in src/utils/apis/citation_apis.py

def extract_doi(url: str | None) -> str:
    """Extract a DOI from an arbitrary URL using regex.

    Handles Zenodo record URLs as a special case.  Returns ``""`` if none found.
    """
    if not url or not isinstance(url, str):
        return ""
    match = DOI_REGEX.search(url)
    if match:
        return match.group(0).rstrip(".,);").lower()
    # Zenodo record URL → DOI
    m = re.search(r"zenodo\.org/(?:record|records)/(\d+)", url, re.I)
    if m:
        return f"10.5281/zenodo.{m.group(1)}".lower()
    return ""

`normalize_doi(value: str) -> str` ¶

Normalise a DOI string (strip https://doi.org/ prefix, lowercase).

Source code in src/utils/apis/citation_apis.py

def normalize_doi(value: str) -> str:
    """Normalise a DOI string (strip ``https://doi.org/`` prefix, lowercase)."""
    if not value:
        return ""
    for prefix in _DOI_PREFIXES_STRIP:
        if value.lower().startswith(prefix):
            value = value[len(prefix) :]
            break
    match = DOI_REGEX.search(value)
    if match:
        return match.group(0).rstrip(".,);").lower()
    return ""

`is_artifact_doi(doi: str) -> bool` ¶

Return True if doi belongs to an artifact repository (Zenodo, Figshare).

Source code in src/utils/apis/citation_apis.py

def is_artifact_doi(doi: str) -> bool:
    """Return *True* if *doi* belongs to an artifact repository (Zenodo, Figshare)."""
    if not doi:
        return False
    return doi.lower().startswith(ARTIFACT_DOI_PREFIXES)

`cache_key(doi: str) -> str` ¶

Deterministic cache key from a DOI or normalised title.

Source code in src/utils/apis/citation_apis.py

def cache_key(doi: str) -> str:
    """Deterministic cache key from a DOI or normalised title."""
    return hashlib.sha256(doi.lower().encode()).hexdigest()

`create_session() -> requests.Session` ¶

Return a requests session pre-configured with the pipeline User-Agent.

Source code in src/utils/apis/citation_apis.py

def create_session() -> requests.Session:
    """Return a *requests* session pre-configured with the pipeline User-Agent."""
    session = requests.Session()
    session.headers["User-Agent"] = USER_AGENT
    return session

`fetch_json_urllib(url: str, *, timeout: int = 20, headers: dict | None = None) -> dict` ¶

GET url and parse the JSON response (stdlib urllib).

Source code in src/utils/apis/citation_apis.py

def fetch_json_urllib(url: str, *, timeout: int = 20, headers: dict | None = None) -> dict:
    """GET *url* and parse the JSON response (stdlib ``urllib``)."""
    req_headers = {"User-Agent": USER_AGENT}
    if headers:
        req_headers.update(headers)
    req = urllib.request.Request(url, headers=req_headers)
    with urllib.request.urlopen(req, timeout=timeout) as resp:  # noqa: S310
        data: dict = json.loads(resp.read().decode("utf-8", "ignore"))
        return data

`openalex_lookup(doi: str, session: requests.Session) -> dict | None` ¶

Query OpenAlex for a paper by DOI.

Returns {"cited_by_count", "openalex_id", "title"} or None.

Source code in src/utils/apis/citation_apis.py

def openalex_lookup(doi: str, session: requests.Session) -> dict | None:
    """Query OpenAlex for a paper by DOI.

    Returns ``{"cited_by_count", "openalex_id", "title"}`` or *None*.
    """
    url = f"{OPENALEX_BASE}/works/https://doi.org/{urllib.parse.quote(doi, safe='')}"
    try:
        resp = session.get(url, timeout=20)
        if resp.status_code == 404:
            return None
        resp.raise_for_status()
        data = resp.json()
        return {
            "cited_by_count": data.get("cited_by_count"),
            "openalex_id": data.get("id", ""),
            "title": data.get("title", ""),
        }
    except (requests.RequestException, ValueError):
        return None

`openalex_lookup_with_retry(doi: str, session: requests.Session, *, max_attempts: int = 4, fetch_citing_dois: bool = False) -> dict` ¶

Query OpenAlex by DOI with retries. Returns {count, citing_dois, error}.

Source code in src/utils/apis/citation_apis.py

def openalex_lookup_with_retry(
    doi: str,
    session: requests.Session,
    *,
    max_attempts: int = 4,
    fetch_citing_dois: bool = False,
) -> dict:
    """Query OpenAlex by DOI with retries.  Returns ``{count, citing_dois, error}``."""
    url = f"{OPENALEX_BASE}/works/https://doi.org/{urllib.parse.quote(doi, safe='')}"
    last_err = ""
    for attempt in range(max_attempts):
        try:
            resp = session.get(url, timeout=25)
            if resp.status_code == 404:
                return {"count": None, "citing_dois": [], "error": ""}
            resp.raise_for_status()
            payload = resp.json()
            cited = payload.get("cited_by_count")
            cited_val = cited if isinstance(cited, int) else None
            citing_dois: list[str] = []
            if fetch_citing_dois and cited_val and cited_val > 0:
                openalex_id = payload.get("id", "")
                if openalex_id:
                    work_id = openalex_id.split("/")[-1] if "/" in openalex_id else openalex_id
                    citing_works_url = f"{OPENALEX_BASE}/works?filter=cites:{work_id}"
                    citing_dois = openalex_fetch_citing_dois(citing_works_url, session)
            return {"count": cited_val, "citing_dois": citing_dois, "error": ""}
        except Exception as e:
            last_err = f"{type(e).__name__}: {e}"
            logger.debug("[OpenAlex] retrying DOI %s after error: %s", doi, last_err)
            time.sleep(0.6 * (attempt + 1))
    return {"count": None, "citing_dois": [], "error": last_err}

`openalex_title_search(title: str, session: requests.Session) -> dict | None` ¶

Fall back to OpenAlex title search when DOI is unavailable.

Returns {"cited_by_count", "openalex_id", "title"} or None.

Source code in src/utils/apis/citation_apis.py

def openalex_title_search(title: str, session: requests.Session) -> dict | None:
    """Fall back to OpenAlex title search when DOI is unavailable.

    Returns ``{"cited_by_count", "openalex_id", "title"}`` or *None*.
    """
    norm = normalize_title(title)
    if not norm or len(norm) < 10:
        return None
    url = f"{OPENALEX_BASE}/works?filter=title.search:{urllib.parse.quote(norm)}&per_page=3"
    try:
        resp = session.get(url, timeout=20)
        resp.raise_for_status()
        results = resp.json().get("results", [])
    except (requests.RequestException, ValueError):
        return None

    query_words = set(norm.split())
    for work in results:
        work_title = normalize_title(work.get("title", ""))
        work_words = set(work_title.split())
        if not query_words or not work_words:
            continue
        jaccard = len(query_words & work_words) / len(query_words | work_words)
        if jaccard >= 0.6:
            return {
                "cited_by_count": work.get("cited_by_count"),
                "openalex_id": work.get("id", ""),
                "title": work.get("title", ""),
            }
    return None

`openalex_fetch_citing_dois(base_url: str, session: requests.Session) -> list[str]` ¶

Paginate through OpenAlex citing-works and collect DOIs.

Source code in src/utils/apis/citation_apis.py

def openalex_fetch_citing_dois(base_url: str, session: requests.Session) -> list[str]:
    """Paginate through OpenAlex citing-works and collect DOIs."""
    citing_dois: set[str] = set()
    cursor: str | None = "*"
    while cursor:
        url = f"{base_url}&per_page=200&cursor={urllib.parse.quote(cursor, safe='')}"
        try:
            resp = session.get(url, timeout=25)
            resp.raise_for_status()
            payload = resp.json()
        except Exception:
            break
        for work in payload.get("results", []) or []:
            doi_val = work.get("doi") or (work.get("ids", {}) or {}).get("doi") or ""
            norm = normalize_doi(doi_val)
            if norm:
                citing_dois.add(norm)
        cursor = (payload.get("meta", {}) or {}).get("next_cursor")
        if not cursor:
            break
        time.sleep(0.2)
    return sorted(citing_dois)

`s2_lookup(doi: str, session: requests.Session, *, timeout: int = 8) -> int | None` ¶

Query Semantic Scholar for citation count by DOI.

Source code in src/utils/apis/citation_apis.py

def s2_lookup(doi: str, session: requests.Session, *, timeout: int = 8) -> int | None:
    """Query Semantic Scholar for citation count by DOI."""
    url = f"{S2_BASE}/paper/DOI:{urllib.parse.quote(doi, safe='')}?fields=citationCount"
    headers = _s2_headers()
    try:
        resp = session.get(url, timeout=timeout, headers=headers)
        if resp.status_code == 404:
            return None
        resp.raise_for_status()
        count = resp.json().get("citationCount")
        return count if isinstance(count, int) else None
    except (requests.RequestException, ValueError):
        return None

`s2_lookup_with_retry(doi: str, session: requests.Session, *, max_attempts: int | None = None, request_timeout: int | None = None, fetch_citing_dois: bool = False) -> dict` ¶

Query S2 by DOI with retries. Returns {count, citing_dois, error}.

Source code in src/utils/apis/citation_apis.py

def s2_lookup_with_retry(
    doi: str,
    session: requests.Session,
    *,
    max_attempts: int | None = None,
    request_timeout: int | None = None,
    fetch_citing_dois: bool = False,
) -> dict:
    """Query S2 by DOI with retries.  Returns ``{count, citing_dois, error}``."""
    if max_attempts is None:
        max_attempts = int(os.environ.get("SEMANTIC_SCHOLAR_MAX_ATTEMPTS", "2"))
    if request_timeout is None:
        request_timeout = int(os.environ.get("SEMANTIC_SCHOLAR_TIMEOUT", "8"))
    headers = _s2_headers()
    paper_id = f"DOI:{doi}"
    base_url = f"{S2_BASE}/paper/{urllib.parse.quote(paper_id, safe='')}"
    last_err = ""
    for attempt in range(max_attempts):
        try:
            resp = session.get(f"{base_url}?fields=citationCount", timeout=request_timeout, headers=headers)
            if resp.status_code == 404:
                return {"count": None, "citing_dois": [], "error": ""}
            resp.raise_for_status()
            cited = resp.json().get("citationCount")
            cited_val = cited if isinstance(cited, int) else None
            citing_dois: list[str] = []
            if fetch_citing_dois:
                citing_dois = s2_fetch_citing_dois(base_url, session, headers=headers)
            return {"count": cited_val, "citing_dois": citing_dois, "error": ""}
        except Exception as e:
            last_err = f"{type(e).__name__}: {e}"
            if attempt + 1 < max_attempts:
                logger.debug("[SemanticScholar] retrying DOI %s after error: %s", doi, last_err)
            time.sleep(0.6 * (attempt + 1))
    return {"count": None, "citing_dois": [], "error": last_err}

`s2_fetch_citing_dois(base_url: str, session: requests.Session, *, headers: dict[str, str] | None = None) -> list[str]` ¶

Paginate through S2 citations and collect DOIs.

Source code in src/utils/apis/citation_apis.py

def s2_fetch_citing_dois(
    base_url: str,
    session: requests.Session,
    *,
    headers: dict[str, str] | None = None,
) -> list[str]:
    """Paginate through S2 citations and collect DOIs."""
    if headers is None:
        headers = _s2_headers()
    citing_dois: set[str] = set()
    offset = 0
    page_size = 100
    while True:
        url = f"{base_url}/citations?fields=externalIds&limit={page_size}&offset={offset}"
        try:
            resp = session.get(url, timeout=25, headers=headers)
            resp.raise_for_status()
            payload = resp.json()
        except Exception:
            break
        for entry in payload.get("data", []) or []:
            ext_ids = (entry.get("citingPaper", {}) or {}).get("externalIds", {}) or {}
            doi_val = ext_ids.get("DOI", "")
            norm = normalize_doi(doi_val)
            if norm:
                citing_dois.add(norm)
        next_offset = payload.get("next")
        if next_offset is None:
            if len(payload.get("data", []) or []) < page_size:
                break
            offset += page_size
        else:
            offset = next_offset
        time.sleep(0.2)
    return sorted(citing_dois)

`s2_reachable(session: requests.Session | None = None) -> bool` ¶

Preflight check — return True if Semantic Scholar is reachable.

Source code in src/utils/apis/citation_apis.py

def s2_reachable(session: requests.Session | None = None) -> bool:
    """Preflight check — return *True* if Semantic Scholar is reachable."""
    check_url = f"{S2_BASE}/paper/DOI%3A10.1038%2Fnature12373?fields=paperId"
    timeout = int(os.environ.get("SEMANTIC_SCHOLAR_PREFLIGHT_TIMEOUT", "3"))
    try:
        if session is not None:
            session.get(check_url, timeout=timeout).raise_for_status()
        else:
            fetch_json_urllib(check_url, timeout=timeout)
        return True
    except Exception as e:
        logger.info("[SemanticScholar] preflight failed: %s: %s", type(e).__name__, e)
        return False

`best_citation_count(openalex_count: int | None, s2_count: int | None) -> int | None` ¶

Return the maximum of two citation counts, or None if both are missing.

Source code in src/utils/apis/citation_apis.py

def best_citation_count(openalex_count: int | None, s2_count: int | None) -> int | None:
    """Return the maximum of two citation counts, or *None* if both are missing."""
    counts = [c for c in (openalex_count, s2_count) if isinstance(c, int)]
    return max(counts) if counts else None