Skip to content

artifact_urls

src.utils.normalization.artifact_urls

Shared helpers for classifying artifact URLs by hosting source.

Provides

- resolve_doi_prefix(url) — Map a DOI to its repository name (Zenodo, Figshare, …).
- extract_source(url) — Determine the hosting source of an artifact URL.
- get_artifact_url(artifact, normalise_fn) — Extract the first valid URL from an artifact dict.
- get_artifact_urls(artifact, normalise_fn) — Extract all valid URLs from an artifact dict.

resolve_doi_prefix(url: str) -> str | None

Map a DOI URL to the name of its repository (e.g. 'Zenodo').

Returns None if the DOI prefix is unrecognised or the URL contains no DOI.

Source code in src/utils/normalization/artifact_urls.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
def resolve_doi_prefix(url: str) -> str | None:
    """Map a DOI URL to the name of its repository (e.g. ``'Zenodo'``).

    Returns ``None`` if the DOI prefix is unrecognised or the URL contains no DOI.
    """
    doi_match = re.search(
        r"(?:doi\.org/)?(?:https?://doi\.org/)?(10\.\d+(?:[/\.][\w.\-]+)*)",
        url,
    )
    if not doi_match:
        return None

    prefix_parts = doi_match.group(1).split("/")[0].split(".")[0:2]
    doi_prefix = ".".join(prefix_parts)
    return DOI_PREFIX_TO_REPO.get(doi_prefix)

extract_source(url: str) -> str

Classify an artifact URL by its hosting platform.

Returns a human-readable label such as 'GitHub', 'Zenodo', 'Figshare', 'OSF', 'GitLab', 'Bitbucket', 'Archive', 'Dataverse', 'DOI', or 'Other'; returns 'unknown' when url is empty/None.

Source code in src/utils/normalization/artifact_urls.py
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
def extract_source(url: str) -> str:
    """Label an artifact URL with the platform that hosts it.

    Matching is a case-insensitive substring test, tried in a fixed order;
    the first marker that occurs in *url* decides the label.

    Returns one of ``'GitHub'``, ``'Zenodo'``, ``'Figshare'``, ``'OSF'``,
    ``'GitLab'``, ``'Bitbucket'``, ``'Archive'``, ``'Dataverse'``; for a
    ``doi.org`` link the name given by ``resolve_doi_prefix`` or ``'DOI'``
    if that yields nothing; ``'Other'`` when nothing matches; and
    ``'unknown'`` when *url* is empty/None.
    """
    if not url:
        return "unknown"

    lowered = url.lower()

    # Order matters: earlier rows win when several markers are present.
    # ("zenodo.org" is already covered by "zenodo"; both are kept as-is.)
    marker_table = (
        (("github.com", "github.io"), "GitHub"),
        (("zenodo", "zenodo.org"), "Zenodo"),
        (("figshare",), "Figshare"),
        (("osf.io",), "OSF"),
        (("gitlab",), "GitLab"),
        (("bitbucket",), "Bitbucket"),
        (("archive.org", "arxiv"), "Archive"),
        (("dataverse",), "Dataverse"),
    )
    for markers, label in marker_table:
        if any(marker in lowered for marker in markers):
            return label

    if "doi.org" not in lowered:
        return "Other"

    # Generic DOI link: try to name the repository from the DOI prefix.
    return resolve_doi_prefix(lowered) or "DOI"

get_artifact_url(artifact: dict, normalise_fn: object = None) -> str | None

Return the first valid URL from artifact, or None.

Parameters

- artifact : dict — An artifact record (as produced by parse_results_md).
- normalise_fn : callable, optional — A function str -> str|None that normalises raw URL values. Defaults to a pass-through that returns the value as-is, except that empty/falsy values become None.

Source code in src/utils/normalization/artifact_urls.py
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
def get_artifact_url(artifact: dict, normalise_fn: object = None) -> str | None:
    """Return the first valid URL from *artifact*, or ``None``.

    Parameters
    ----------
    artifact : dict
        An artifact record (as produced by ``parse_results_md``).
    normalise_fn : callable, optional
        A function ``str -> str|None`` that normalises raw URL values.
        Defaults to identity (returns the value as-is).
    """
    normalise: Callable[[str], str | None] = normalise_fn or (lambda v: v or None)  # type: ignore[assignment]

    # New format: ``artifact_urls`` is the canonical list
    urls = artifact.get("artifact_urls", [])
    if isinstance(urls, list):
        for u in urls:
            norm = normalise(u)
            if norm:
                return norm

    # Legacy fallback — single-valued URL fields
    for key in _LEGACY_URL_KEYS:
        val = artifact.get(key, "")
        if isinstance(val, list):
            val = val[0] if val else ""
        norm = normalise(val)
        if norm:
            return norm

    return None

get_artifact_urls(artifact: dict, normalise_fn=None) -> list[str]

Return all valid URLs from artifact.

Parameters

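- artifact : dict — An artifact record.
- normalise_fn : callable, optional — A function str -> str|None that normalises raw URL values. Defaults to a pass-through that returns the value as-is, except that empty/falsy values become None.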

Source code in src/utils/normalization/artifact_urls.py
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
def get_artifact_urls(artifact: dict, normalise_fn=None) -> list[str]:
    """Collect every usable URL found in *artifact*.

    URLs from the ``artifact_urls`` list are returned in order, duplicates
    included. The legacy keys in ``_LEGACY_URL_KEYS`` are consulted only when
    that list yields nothing, and values gathered from them are de-duplicated.

    Parameters
    ----------
    artifact : dict
        An artifact record.
    normalise_fn : callable, optional
        A function ``str -> str|None`` applied to every raw value; a falsy
        result means "skip this value". When omitted, values pass through
        unchanged except that falsy ones become ``None``.
    """
    clean = normalise_fn or (lambda v: v or None)

    collected: list[str] = []
    listed = artifact.get("artifact_urls", [])
    if isinstance(listed, list):
        collected = [cleaned for cleaned in map(clean, listed) if cleaned]

    if collected:
        return collected

    # Nothing in the canonical list: fall back to the older single-valued keys.
    for legacy_key in _LEGACY_URL_KEYS:
        raw_value = artifact.get(legacy_key, "")
        if isinstance(raw_value, list):
            # Only the head of a list-valued legacy field is considered.
            raw_value = "" if not raw_value else raw_value[0]
        candidate = clean(raw_value)
        if candidate and candidate not in collected:
            collected.append(candidate)

    return collected