Skip to content

sys_sec_artifacts_results_scrape

src.scrapers.sys_sec_artifacts_results_scrape

parse_html_results(content)

Parse HTML-table-based results pages (used by OSDI, ATC, etc.). These pages use `<span>` tags with ids to indicate badges: span#aa = Available, span#af = Functional, span#rr = Reproduced. And markdown tables with rows like: | [Title](url) | AVAILABLE... | [Github](url) |

Source code in src/scrapers/sys_sec_artifacts_results_scrape.py
(source lines 16–93)
def parse_html_results(content):
    """
    Parse HTML-table-based results pages (used by OSDI, ATC, etc.).

    These pages use <span> tags with ids to indicate badges:
      span#aa = Available, span#af = Functional, span#rr = Reproduced
    And markdown tables with rows like:
      | [Title](url) | <span id="aa">AVAILABLE</span>... | [Github](url) |

    Args:
        content: Raw page markup (markdown with embedded HTML table rows).

    Returns:
        A list of dicts, each with "title" and "badges" (comma-joined,
        possibly empty) plus optional "paper_url", "repository_url", and
        "artifact_url" keys. Rows with no badges and no URLs are dropped.
    """
    artifacts = []

    # BeautifulSoup tolerates the HTML fragments embedded in the markdown.
    soup = BeautifulSoup(content, "html.parser")

    for row in soup.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) < 2:
            continue

        # --- Title (first cell; prefer the link text when a link exists) ---
        title_cell = cells[0]
        title_link = title_cell.find("a")
        title = title_link.get_text(strip=True) if title_link else title_cell.get_text(strip=True)
        # Skip empty rows and the table header row.
        if not title or title.lower() == "paper title":
            continue

        paper_url = ""
        if title_link and title_link.get("href"):
            paper_url = title_link["href"]

        # --- Badges (second cell; identified by span id, falling back to
        #     case-insensitive badge text) ---
        badges = []
        for span in cells[1].find_all("span"):
            span_id = span.get("id", "")
            span_text = span.get_text(strip=True).lower()
            if span_id == "aa" or "available" in span_text:
                badges.append("available")
            elif span_id == "af" or "functional" in span_text:
                badges.append("functional")
            elif span_id == "rr" or "reproduc" in span_text or "replicated" in span_text:
                badges.append("reproduced")

        # --- Repository / archived-artifact URLs (third cell) ---
        repo_url = ""
        artifact_url = ""
        if len(cells) > 2:
            for link in cells[2].find_all("a"):
                href = link.get("href", "")
                link_text = link.get_text(strip=True).lower()
                if (
                    "github" in link_text
                    or "github.com" in href
                    or "gitlab" in link_text
                    or "gitlab" in href
                    # BUGFIX: match bitbucket by href too, consistent with
                    # the github/gitlab checks above (was link-text only).
                    or "bitbucket" in link_text
                    or "bitbucket" in href
                ):
                    repo_url = href
                elif "zenodo" in link_text or "zenodo.org" in href or "figshare" in link_text or "doi.org" in href:
                    artifact_url = href
                elif not repo_url:
                    # Unrecognized link: treat the first one as the repository.
                    repo_url = href

        # Keep only rows that carry at least one badge or one URL.
        if title and (badges or repo_url or artifact_url):
            artifact = {
                "title": title,
                "badges": ",".join(badges),
            }
            if paper_url:
                artifact["paper_url"] = paper_url
            if repo_url:
                artifact["repository_url"] = repo_url
            if artifact_url:
                artifact["artifact_url"] = artifact_url
            artifacts.append(artifact)

    return artifacts

parse_markdown_table_results(content)

Fallback parser for markdown tables that weren't converted to HTML. Parses raw markdown rows like: | Title | AVAILABLE... | Github |

Source code in src/scrapers/sys_sec_artifacts_results_scrape.py
(source lines 100–167)
def parse_markdown_table_results(content):
    """
    Fallback parser for markdown tables that weren't converted to HTML.

    Parses raw markdown rows like:
    | [Title](url) | <span id="aa">AVAILABLE</span>... | [Github](url) |

    Args:
        content: Raw markdown text containing table rows.

    Returns:
        A list of dicts, each with "title" and "badges" (comma-joined,
        possibly empty) plus optional "repository_url" and "artifact_url"
        keys. Rows with no badges and no URLs are dropped.
    """
    artifacts = []

    for raw_line in content.split("\n"):
        line = raw_line.strip()
        # Skip non-table lines and the header/body separator row (| :--- |).
        if not line.startswith("|") or ":-" in line:
            continue

        cells = [c.strip() for c in line.split("|")[1:-1]]
        if len(cells) < 2:
            continue

        # --- Title: require a markdown link in the first cell ---
        title_match = re.search(r"\[([^\]]+)\]", cells[0])
        if not title_match:
            continue
        title = title_match.group(1).strip()
        if title.lower() == "paper title":
            continue  # header row

        # --- Badges: match by span id or badge text.
        # BUGFIX: lowercase the cell so mixed-case badge text (e.g.
        # "Reproduced") is detected, consistent with parse_html_results,
        # which lowercases span text before matching.
        badges = []
        badge_cell = cells[1].lower()
        if 'id="aa"' in badge_cell or ">available<" in badge_cell:
            badges.append("available")
        if 'id="af"' in badge_cell or ">functional<" in badge_cell:
            badges.append("functional")
        if 'id="rr"' in badge_cell or ">reproduced<" in badge_cell or ">replicated<" in badge_cell:
            badges.append("reproduced")

        # --- Repository / archived-artifact URLs from the third cell ---
        repo_url = ""
        artifact_url = ""
        if len(cells) > 2:
            for link_text, href in re.findall(r"\[([^\]]*)\]\(([^)]+)\)", cells[2]):
                lt = link_text.lower()
                if "github" in lt or "gitlab" in lt or "bitbucket" in lt:
                    repo_url = href
                elif "zenodo" in lt or "figshare" in lt or "doi" in lt:
                    artifact_url = href
                elif not repo_url:
                    # Unrecognized link: treat the first one as the repository.
                    repo_url = href
            # Also accept bare (non-markdown-link) GitHub URLs.
            if not repo_url:
                bare_urls = re.findall(r"(https?://github\.com/[^\s<|]+)", cells[2])
                if bare_urls:
                    repo_url = bare_urls[0]

        # Keep only rows that carry at least one badge or one URL.
        if title and (badges or repo_url or artifact_url):
            artifact = {
                "title": title,
                "badges": ",".join(badges),
            }
            if repo_url:
                artifact["repository_url"] = repo_url
            if artifact_url:
                artifact["artifact_url"] = artifact_url
            artifacts.append(artifact)

    return artifacts