Skip to content

export_artifact_citations

src.generators.export_artifact_citations

Export artifact citations in a simple format showing artifact DOI and citing DOIs.

Usage

python3 export_artifact_citations.py --data_dir ../reprodb.github.io
python3 export_artifact_citations.py --data_dir ../reprodb.github.io --output citations_export.txt

export_citations(data_dir: str, output_file: str = None) -> None

Export artifact citations to a simple DOI mapping format.

Source code in src/generators/export_artifact_citations.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def export_citations(data_dir: str, output_file: str = None) -> None:
    """Export artifact citations to a simple DOI mapping format.

    Reads ``<data_dir>/assets/data/artifact_citations.json`` and writes one
    line for every artifact that has at least one citing DOI::

        artifact_doi: ["citing_doi_1", "citing_doi_2", ...]

    Citing DOIs are taken from the ``citing_dois_openalex`` and then the
    ``citing_dois_semantic_scholar`` field of each artifact; duplicates are
    dropped while the first-seen order is kept. Artifacts without a ``doi``
    are skipped.

    Args:
        data_dir: Directory that contains ``assets/data/artifact_citations.json``.
        output_file: Path of the file to write. When omitted (or empty), the
            mapping is written to standard output.

    Raises:
        SystemExit: With exit code 1 when the citations JSON file is missing.
    """
    citations_path = os.path.join(data_dir, "assets", "data", "artifact_citations.json")

    if not os.path.exists(citations_path):
        logger.error("Error: %s not found.", citations_path)
        logger.info("Run generate_artifact_citations.py first.")
        sys.exit(1)

    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(citations_path, "r", encoding="utf-8") as f:
        artifacts = json.load(f)

    # Order matters: OpenAlex entries come first in the exported list.
    citing_fields = ("citing_dois_openalex", "citing_dois_semantic_scholar")

    # Open output file or use stdout (nullcontext leaves stdout open on exit).
    out_cm = (
        open(output_file, "w", encoding="utf-8")  # noqa: SIM115
        if output_file
        else contextlib.nullcontext(sys.stdout)
    )

    with out_cm as out:
        artifacts_with_doi = 0
        artifacts_with_citations = 0
        total_citing_dois = 0

        for artifact in artifacts:
            doi = artifact.get("doi")
            if not doi:
                continue
            artifacts_with_doi += 1

            # Collect citing DOIs from both sources; a missing, null or empty
            # field contributes nothing.
            citing_dois = []
            for field in citing_fields:
                citing_dois.extend(artifact.get(field) or [])

            # dict keys are unique and keep insertion order, so this removes
            # duplicates while preserving the first occurrence of each DOI.
            unique_citing_dois = list(dict.fromkeys(citing_dois))

            # Only output artifacts that actually have citations.
            if not unique_citing_dois:
                continue

            artifacts_with_citations += 1
            total_citing_dois += len(unique_citing_dois)

            # Format: artifact_doi: ["citing_doi_1", "citing_doi_2", ...]
            out.write(f"{doi}: {json.dumps(unique_citing_dois)}\n")

        # The summary goes through the logger rather than `out`, so it never
        # ends up inside the exported data.
        logger.info("\n# Summary:")
        logger.info("# Total artifacts with DOIs: %d", artifacts_with_doi)
        logger.info("# Artifacts with citing DOIs: %d", artifacts_with_citations)
        logger.info("# Total citing DOIs collected: %d", total_citing_dois)

    if output_file:
        logger.info("\nWrote citations export to: %s", output_file)