Skip to content

sys_sec_artifacts_results_scrape

src.scrapers.sys_sec_artifacts_results_scrape

parse_html_results(content)

Parse HTML-table-based results pages (used by OSDI, ATC, etc.). These pages use `<span>` tags with ids to indicate badges: span#aa = Available, span#af = Functional, span#rr = Reproduced. And markdown tables with rows like: | [Title](url) | AVAILABLE... | [Github](url) |

Source code in src/scrapers/sys_sec_artifacts_results_scrape.py
(source lines 16–93)
def parse_html_results(content):
    """
    Parse HTML-table-based results pages (used by OSDI, ATC, etc.).

    These pages use <span> tags with ids to indicate badges:
      span#aa = Available, span#af = Functional, span#rr = Reproduced
    And markdown tables with rows like:
      | [Title](url) | <span id="aa">AVAILABLE</span>... | [Github](url) |

    Args:
        content: Raw page markup (markdown with embedded HTML table rows).

    Returns:
        A list of dicts, each with "title" and "badges" (comma-joined,
        possibly empty) plus optional "paper_url", "repository_url", and
        "artifact_url" keys. Rows with no badges and no URLs are dropped.
    """
    artifacts = []

    # BeautifulSoup tolerates the HTML fragments embedded in the markdown.
    soup = BeautifulSoup(content, "html.parser")

    for row in soup.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) < 2:
            continue

        # --- Title (first cell; prefer the link text when a link exists) ---
        title_cell = cells[0]
        title_link = title_cell.find("a")
        title = title_link.get_text(strip=True) if title_link else title_cell.get_text(strip=True)
        # Skip empty rows and the table header row.
        if not title or title.lower() == "paper title":
            continue

        paper_url = ""
        if title_link and title_link.get("href"):
            paper_url = title_link["href"]

        # --- Badges (second cell; identified by span id, falling back to
        #     case-insensitive badge text) ---
        badges = []
        for span in cells[1].find_all("span"):
            span_id = span.get("id", "")
            span_text = span.get_text(strip=True).lower()
            if span_id == "aa" or "available" in span_text:
                badges.append("available")
            elif span_id == "af" or "functional" in span_text:
                badges.append("functional")
            elif span_id == "rr" or "reproduc" in span_text or "replicated" in span_text:
                badges.append("reproduced")

        # --- Repository / archived-artifact URLs (third cell) ---
        repo_url = ""
        artifact_url = ""
        if len(cells) > 2:
            for link in cells[2].find_all("a"):
                href = link.get("href", "")
                link_text = link.get_text(strip=True).lower()
                if (
                    "github" in link_text
                    or "github.com" in href
                    or "gitlab" in link_text
                    or "gitlab" in href
                    # BUGFIX: match bitbucket by href too, consistent with
                    # the github/gitlab checks above (was link-text only).
                    or "bitbucket" in link_text
                    or "bitbucket" in href
                ):
                    repo_url = href
                elif "zenodo" in link_text or "zenodo.org" in href or "figshare" in link_text or "doi.org" in href:
                    artifact_url = href
                elif not repo_url:
                    # Unrecognized link: treat the first one as the repository.
                    repo_url = href

        # Keep only rows that carry at least one badge or one URL.
        if title and (badges or repo_url or artifact_url):
            artifact = {
                "title": title,
                "badges": ",".join(badges),
            }
            if paper_url:
                artifact["paper_url"] = paper_url
            if repo_url:
                artifact["repository_url"] = repo_url
            if artifact_url:
                artifact["artifact_url"] = artifact_url
            artifacts.append(artifact)

    return artifacts

parse_markdown_table_results(content)

Fallback parser for markdown tables that weren't converted to HTML. Parses raw markdown rows like: | Title | AVAILABLE... | Github |

Source code in src/scrapers/sys_sec_artifacts_results_scrape.py
(source lines 100–167)
def parse_markdown_table_results(content):
    """
    Fallback parser for markdown tables that weren't converted to HTML.

    Parses raw markdown rows like:
    | [Title](url) | <span id="aa">AVAILABLE</span>... | [Github](url) |

    Args:
        content: Raw markdown text containing table rows.

    Returns:
        A list of dicts, each with "title" and "badges" (comma-joined,
        possibly empty) plus optional "repository_url" and "artifact_url"
        keys. Rows with no badges and no URLs are dropped.
    """
    artifacts = []

    for raw_line in content.split("\n"):
        line = raw_line.strip()
        # Skip non-table lines and the header/body separator row (| :--- |).
        if not line.startswith("|") or ":-" in line:
            continue

        cells = [c.strip() for c in line.split("|")[1:-1]]
        if len(cells) < 2:
            continue

        # --- Title: require a markdown link in the first cell ---
        title_match = re.search(r"\[([^\]]+)\]", cells[0])
        if not title_match:
            continue
        title = title_match.group(1).strip()
        if title.lower() == "paper title":
            continue  # header row

        # --- Badges: match by span id or badge text.
        # BUGFIX: lowercase the cell so mixed-case badge text (e.g.
        # "Reproduced") is detected, consistent with parse_html_results,
        # which lowercases span text before matching.
        badges = []
        badge_cell = cells[1].lower()
        if 'id="aa"' in badge_cell or ">available<" in badge_cell:
            badges.append("available")
        if 'id="af"' in badge_cell or ">functional<" in badge_cell:
            badges.append("functional")
        if 'id="rr"' in badge_cell or ">reproduced<" in badge_cell or ">replicated<" in badge_cell:
            badges.append("reproduced")

        # --- Repository / archived-artifact URLs from the third cell ---
        repo_url = ""
        artifact_url = ""
        if len(cells) > 2:
            for link_text, href in re.findall(r"\[([^\]]*)\]\(([^)]+)\)", cells[2]):
                lt = link_text.lower()
                if "github" in lt or "gitlab" in lt or "bitbucket" in lt:
                    repo_url = href
                elif "zenodo" in lt or "figshare" in lt or "doi" in lt:
                    artifact_url = href
                elif not repo_url:
                    # Unrecognized link: treat the first one as the repository.
                    repo_url = href
            # Also accept bare (non-markdown-link) GitHub URLs.
            if not repo_url:
                bare_urls = re.findall(r"(https?://github\.com/[^\s<|]+)", cells[2])
                if bare_urls:
                    repo_url = bare_urls[0]

        # Keep only rows that carry at least one badge or one URL.
        if title and (badges or repo_url or artifact_url):
            artifact = {
                "title": title,
                "badges": ",".join(badges),
            }
            if repo_url:
                artifact["repository_url"] = repo_url
            if artifact_url:
                artifact["artifact_url"] = artifact_url
            artifacts.append(artifact)

    return artifacts