Pipeline configuration dataclass.
Centralises the arguments shared across generators, enrichers, and the
pipeline orchestrator so that configuration is validated once at startup
rather than scattered across individual argparse blocks.
PipelineConfig
dataclass
Configuration for a single pipeline run. It is intended to be treated as read-only once constructed, although the dataclass itself is not frozen.
Source code in src/config.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102 | @dataclass
class PipelineConfig:
"""Immutable configuration for a single pipeline run."""
# ── Paths ────────────────────────────────────────────────────────────
output_dir: Path = field(default_factory=lambda: Path("output/staging"))
data_dir: Path = field(default_factory=lambda: Path("data"))
results_dir: Path = field(default_factory=lambda: Path("../reprodb-pipeline-results"))
log_dir: Path = field(default_factory=lambda: Path("logs"))
dblp_file: Path = field(default_factory=lambda: Path("data/dblp/dblp.xml.gz"))
# ── Scraping / filtering ────────────────────────────────────────────
conf_regex: str = r".*20[12][0-9]"
# ── Proxy settings ──────────────────────────────────────────────────
http_proxy: str | None = None
https_proxy: str | None = None
# ── Pipeline behaviour ──────────────────────────────────────────────
deploy: bool = False
save_results: bool = False
push: bool = False
refresh: bool = False
def __post_init__(self) -> None:
"""Coerce strings to Path and resolve proxy defaults."""
self.output_dir = Path(self.output_dir)
self.data_dir = Path(self.data_dir)
self.results_dir = Path(self.results_dir)
self.log_dir = Path(self.log_dir)
self.dblp_file = Path(self.dblp_file)
# Mirror the shell script's auto-detection behaviour
if self.https_proxy is None and self.http_proxy is not None:
self.https_proxy = self.http_proxy
@classmethod
def from_env(cls) -> PipelineConfig:
"""Build config from environment variables (``PIPELINE_*``)."""
kwargs: dict[str, object] = {}
env_map = {
"PIPELINE_OUTPUT_DIR": "output_dir",
"PIPELINE_DATA_DIR": "data_dir",
"PIPELINE_RESULTS_DIR": "results_dir",
"PIPELINE_LOG_DIR": "log_dir",
"PIPELINE_DBLP_FILE": "dblp_file",
"PIPELINE_CONF_REGEX": "conf_regex",
"PIPELINE_DEPLOY": "deploy",
"PIPELINE_SAVE_RESULTS": "save_results",
"PIPELINE_PUSH": "push",
"PIPELINE_REFRESH": "refresh",
}
for env_key, attr in env_map.items():
val = os.environ.get(env_key)
if val is not None:
if attr in ("deploy", "save_results", "push", "refresh"):
kwargs[attr] = val.lower() in ("1", "true", "yes")
else:
kwargs[attr] = val
# Proxy from standard env vars
kwargs.setdefault("http_proxy", os.environ.get("http_proxy") or os.environ.get("HTTP_PROXY"))
kwargs.setdefault("https_proxy", os.environ.get("https_proxy") or os.environ.get("HTTPS_PROXY"))
return cls(**kwargs) # type: ignore[arg-type]
@property
def assets_data(self) -> Path:
"""``<output_dir>/assets/data`` convenience path."""
return self.output_dir / "assets" / "data"
@property
def jekyll_data(self) -> Path:
"""``<output_dir>/_data`` convenience path."""
return self.output_dir / "_data"
@property
def build_dir(self) -> Path:
"""``<output_dir>/_build`` — intermediate files consumed by later stages but not deployed to the website."""
return self.output_dir / "_build"
def ensure_dirs(self) -> None:
"""Create required output directories."""
self.output_dir.mkdir(parents=True, exist_ok=True)
self.assets_data.mkdir(parents=True, exist_ok=True)
self.jekyll_data.mkdir(parents=True, exist_ok=True)
self.build_dir.mkdir(parents=True, exist_ok=True)
self.log_dir.mkdir(parents=True, exist_ok=True)
|
assets_data: Path
property
<output_dir>/assets/data convenience path.
jekyll_data: Path
property
<output_dir>/_data convenience path.
build_dir: Path
property
<output_dir>/_build — intermediate files consumed by later stages but not deployed to the website.
__post_init__() -> None
Coerce strings to Path and resolve proxy defaults.
Source code in src/config.py
39
40
41
42
43
44
45
46
47
48
49 | def __post_init__(self) -> None:
"""Coerce strings to Path and resolve proxy defaults."""
self.output_dir = Path(self.output_dir)
self.data_dir = Path(self.data_dir)
self.results_dir = Path(self.results_dir)
self.log_dir = Path(self.log_dir)
self.dblp_file = Path(self.dblp_file)
# Mirror the shell script's auto-detection behaviour
if self.https_proxy is None and self.http_proxy is not None:
self.https_proxy = self.http_proxy
|
from_env() -> PipelineConfig
classmethod
Build config from environment variables (PIPELINE_*).
Source code in src/config.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79 | @classmethod
def from_env(cls) -> PipelineConfig:
"""Build config from environment variables (``PIPELINE_*``)."""
kwargs: dict[str, object] = {}
env_map = {
"PIPELINE_OUTPUT_DIR": "output_dir",
"PIPELINE_DATA_DIR": "data_dir",
"PIPELINE_RESULTS_DIR": "results_dir",
"PIPELINE_LOG_DIR": "log_dir",
"PIPELINE_DBLP_FILE": "dblp_file",
"PIPELINE_CONF_REGEX": "conf_regex",
"PIPELINE_DEPLOY": "deploy",
"PIPELINE_SAVE_RESULTS": "save_results",
"PIPELINE_PUSH": "push",
"PIPELINE_REFRESH": "refresh",
}
for env_key, attr in env_map.items():
val = os.environ.get(env_key)
if val is not None:
if attr in ("deploy", "save_results", "push", "refresh"):
kwargs[attr] = val.lower() in ("1", "true", "yes")
else:
kwargs[attr] = val
# Proxy from standard env vars
kwargs.setdefault("http_proxy", os.environ.get("http_proxy") or os.environ.get("HTTP_PROXY"))
kwargs.setdefault("https_proxy", os.environ.get("https_proxy") or os.environ.get("HTTPS_PROXY"))
return cls(**kwargs) # type: ignore[arg-type]
|
ensure_dirs() -> None
Create required output directories.
Source code in src/config.py
| def ensure_dirs(self) -> None:
"""Create required output directories."""
self.output_dir.mkdir(parents=True, exist_ok=True)
self.assets_data.mkdir(parents=True, exist_ok=True)
self.jekyll_data.mkdir(parents=True, exist_ok=True)
self.build_dir.mkdir(parents=True, exist_ok=True)
self.log_dir.mkdir(parents=True, exist_ok=True)
|