orchestrator¶

`src.orchestrator` ¶

Python pipeline orchestrator.

Python-native pipeline orchestrator that uses `src.stages` for dependency ordering and `src.config` for configuration. Independent stages within the same tier run in parallel via `concurrent.futures.ThreadPoolExecutor`.

Usage::

python -m src.orchestrator                     # local staging
python -m src.orchestrator --deploy            # write to live site
python -m src.orchestrator --log-format json   # structured logs

`run_pipeline(cfg: PipelineConfig, *, max_workers: int = 4, message: str = '') -> bool` ¶

Execute the full pipeline using the stage dependency graph.

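Returns True if all required stages succeeded and the post-pipeline invariant and monotonicity checks reported no errors; returns False otherwise.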

Source code in src/orchestrator.py

def run_pipeline(cfg: PipelineConfig, *, max_workers: int = 4, message: str = "") -> bool:
    """Execute the full pipeline using the stage dependency graph.

    Stages are grouped into tiers by ``parallel_groups(STAGES)`` and the tiers
    are executed in order.  A tier containing a single stage runs inline on
    the calling thread; a tier with several stages is fanned out over a
    :class:`~concurrent.futures.ThreadPoolExecutor`.  If any stage in a tier
    reports failure, the pipeline is aborted once that tier has finished.

    When every stage has succeeded, the function runs the invariant checks,
    then the monotonicity checks against the stored reference snapshot (if
    one exists), updates the reference snapshot, writes the run metadata and,
    when ``cfg.save_results`` is set, saves a results snapshot.

    Args:
        cfg: Pipeline configuration.  The attributes read here are
            ``http_proxy``, ``https_proxy``, ``output_dir``, ``dblp_file``
            and ``save_results``.
        max_workers: Upper bound on the number of threads used for a
            parallel tier; the pool is never larger than the tier itself.
        message: Passed through to ``save_results`` when results are saved.

    Returns:
        True if no stage reported failure and neither the invariant checks
        nor the monotonicity checks produced an error-severity violation;
        False otherwise.
    """
    cfg.ensure_dirs()
    _detect_github_token()
    _seed_staging(cfg)
    _check_dblp(cfg)

    # setdefault: proxy variables already present in the environment win
    # over the values from the configuration.
    if cfg.http_proxy:
        os.environ.setdefault("http_proxy", cfg.http_proxy)
        os.environ.setdefault("HTTP_PROXY", cfg.http_proxy)
    if cfg.https_proxy:
        os.environ.setdefault("https_proxy", cfg.https_proxy)
        os.environ.setdefault("HTTPS_PROXY", cfg.https_proxy)

    groups = parallel_groups(STAGES)
    total_stages = sum(len(g) for g in groups)
    completed = 0
    failed: list[str] = []
    timings: dict[str, float] = {}

    pipeline_start = time.monotonic()

    for tier_idx, tier in enumerate(groups):
        logger.info("── Tier %d/%d (%d stages) ──", tier_idx + 1, len(groups), len(tier))

        if len(tier) == 1:
            # A lone stage runs inline; no thread pool is needed.
            results = [_run_stage(tier[0], cfg)]
        else:
            with ThreadPoolExecutor(max_workers=min(max_workers, len(tier))) as pool:
                futures = [pool.submit(_run_stage, stage, cfg) for stage in tier]
                # Collect in completion order; every stage of the tier is
                # allowed to finish before the tier is judged.
                results = [future.result() for future in as_completed(futures)]

        for name, ok, elapsed in results:
            completed += 1
            timings[name] = elapsed
            if not ok:
                failed.append(name)

        # One shared check for both execution paths, so a failing
        # single-stage tier is reported the same way as a parallel one.
        if failed:
            logger.error("Pipeline aborted — required stage(s) failed: %s", ", ".join(failed))
            return False

    total_elapsed = time.monotonic() - pipeline_start
    logger.info("Pipeline complete! %d/%d stages in %.1fs", completed, total_stages, total_elapsed)

    # Timing summary, slowest stage first; stages with no measured time are
    # omitted.
    for name, elapsed in sorted(timings.items(), key=lambda x: -x[1]):
        if elapsed > 0:
            logger.info("  %-25s %6.1fs", name, elapsed)

    # ── Post-pipeline invariant checks ───────────────────────────────────
    logger.info("── Running invariant checks ──")
    violations = check_invariants(cfg.output_dir)
    errors = [v for v in violations if v.severity == "error"]
    warnings = [v for v in violations if v.severity == "warning"]
    for v in warnings:
        logger.warning("⚠ %s", v)
    for v in errors:
        logger.error("✗ %s", v)
    if errors:
        logger.error("Invariant check failed: %d error(s), %d warning(s)", len(errors), len(warnings))
        return False
    if warnings:
        logger.info("Invariant checks passed with %d warning(s)", len(warnings))
    else:
        logger.info("✓ All invariant checks passed")

    # ── Cross-run monotonicity checks ────────────────────────────────────
    # The current summary is built even when there is no reference, because
    # it becomes the reference snapshot saved below.
    reference = load_snapshot()
    current_snapshot = create_summary(cfg.output_dir)
    if reference is not None:
        logger.info("── Running monotonicity checks ──")
        mono_violations = check_monotonicity(reference, current_snapshot)
        mono_errors = [mv for mv in mono_violations if mv.severity == "error"]
        mono_warnings = [mw for mw in mono_violations if mw.severity == "warning"]
        for mw in mono_warnings:
            logger.warning("⚠ %s", mw)
        for me in mono_errors:
            logger.error("✗ %s", me)
        if mono_errors:
            logger.error(
                "Monotonicity check failed: %d error(s), %d warning(s)",
                len(mono_errors),
                len(mono_warnings),
            )
            # Returning here leaves the stored reference snapshot untouched.
            return False
        if mono_warnings:
            logger.info("Monotonicity checks passed with %d warning(s)", len(mono_warnings))
        else:
            logger.info("✓ All monotonicity checks passed")
    else:
        logger.info("No reference snapshot — skipping monotonicity checks")

    # Update reference snapshot (only reached when all checks passed).
    save_snapshot(current_snapshot)

    # ── Write run metadata ───────────────────────────────────────────────
    write_run_metadata(
        cfg.output_dir,
        timings=timings,
        dblp_file=cfg.dblp_file,
    )

    # ── Save results snapshot ────────────────────────────────────────────
    if cfg.save_results:
        logger.info("── Saving results snapshot ──")
        save_results(cfg, message=message)

    return True

orchestrator¶

src.orchestrator ¶

run_pipeline(cfg: PipelineConfig, *, max_workers: int = 4, message: str = '') -> bool ¶

`src.orchestrator` ¶

`run_pipeline(cfg: PipelineConfig, *, max_workers: int = 4, message: str = '') -> bool` ¶