Pre-extract structured data from the local DBLP XML dump.
Parses dblp.xml.gz in a single pass and writes JSON lookup files that
every downstream pipeline step can load instead of hitting the DBLP API.
Outputs (under output_dir):
.cache/dblp_extracted/papers_by_venue.json
{conf: {year_str: [{title, authors, doi, dblp_key}]}}
.cache/dblp_extracted/affiliations.json
{author_name: affiliation}
The extraction is cached: if the DBLP file has not changed (same mtime)
the previous JSON files are reused.
Usage
python -m src.utils.dblp_extract --dblp_file data/dblp/dblp.xml.gz
NOTE — DBLP API policy
~~~~~~~~~~~~~~~~~~~~~~
We deliberately avoid the DBLP web API (https://dblp.org/search/…). The
local XML dump contains the same data and avoids rate-limiting issues that
grow worse as the number of tracked conferences increases. All new code
should use the extracted JSON files produced by this module. Do NOT add
new DBLP API calls.
Parse dblp.xml.gz and write JSON lookup files.
Returns (papers_path, affiliations_path).
Source code in src/utils/dblp_extract.py
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
def extract_dblp(dblp_file: str) -> tuple[str, str]:
    """Parse dblp.xml.gz and write JSON lookup files.

    Streams the gzipped DBLP XML dump in a single pass, collecting
    conference/journal papers (grouped by venue and year) and author
    affiliations from person (``www``) records, then writes both as
    compact JSON files under the extraction cache directory.

    Args:
        dblp_file: Path to the dblp.xml.gz dump.

    Returns:
        Tuple ``(papers_path, affiliations_path)`` — paths of the
        written JSON files.
    """
    extract_dir = _extract_dir()
    os.makedirs(extract_dir, exist_ok=True)
    papers_path = os.path.join(extract_dir, "papers_by_venue.json")
    affiliations_path = os.path.join(extract_dir, "affiliations.json")
    # Re-use cached files if the DBLP dump hasn't changed (same mtime).
    if _is_fresh(dblp_file, extract_dir):
        logger.warning("DBLP extraction cache is fresh — skipping parse")
        return papers_path, affiliations_path
    logger.info(f"Parsing DBLP XML ({dblp_file}) …")
    # {conf -> {year_str -> [paper_dict]}}
    papers: defaultdict[str, defaultdict[str, list[dict]]] = defaultdict(lambda: defaultdict(list))
    # {author_name -> affiliation}
    affiliations: dict[str, str] = {}
    dblp_stream = _PatchedDTDStream(GzipFile(filename=dblp_file))
    iteration = 0
    try:
        for _, elem in ET.iterparse(
            dblp_stream,
            events=("end",),
            tag=("inproceedings", "article", "www"),
            load_dtd=True,
            recover=True,
            huge_tree=True,
        ):
            # --- Person records: extract affiliations ---
            if elem.tag == "www":
                authors = [a.text for a in elem.findall("author") if a.text]
                affil = None
                for note in elem.findall("note"):
                    if note.get("type") == "affiliation" and note.text:
                        affil = note.text.strip()
                        break  # take the first (most recent) affiliation
                if affil:
                    for name in authors:
                        # First occurrence wins — do not overwrite.
                        if name not in affiliations:
                            affiliations[name] = affil
                elem.clear()
                continue
            # --- Papers ---
            booktitle = elem.findtext("booktitle") or elem.findtext("journal") or ""
            conf = venue_to_conference(booktitle)
            if conf:
                year_str = elem.findtext("year")
                if year_str:
                    title = elem.findtext("title") or ""
                    # Strip trailing period (DBLP convention)
                    title = title.rstrip(".")
                    # Extract DOI from <ee> elements
                    doi = ""
                    for ee in elem.findall("ee"):
                        if ee.text and "doi.org/" in ee.text:
                            doi = ee.text.split("doi.org/")[-1]
                            break
                    authors = [a.text for a in elem.findall("author") if a.text]
                    dblp_key = elem.get("key", "")
                    papers[conf][year_str].append(
                        {
                            "title": title,
                            "authors": authors,
                            "doi": doi,
                            "dblp_key": dblp_key,
                        }
                    )
            iteration += 1
            if iteration % 2_000_000 == 0:
                logger.info(f" … {iteration // 1_000_000}M elements")
            elem.clear()
    finally:
        # Always release the underlying gzip handle — the previous version
        # leaked it whenever iterparse raised mid-stream.
        dblp_stream._raw.close()
    total_papers = sum(len(plist) for conf_years in papers.values() for plist in conf_years.values())
    logger.info(
        f" Done — {iteration} elements, {total_papers} conference papers, {len(affiliations)} author affiliations"
    )
    # Write JSON files (ensure_ascii=False preserves Unicode characters)
    with open(papers_path, "w", encoding="utf-8") as f:
        json.dump(papers, f, separators=(",", ":"), ensure_ascii=False)
    with open(affiliations_path, "w", encoding="utf-8") as f:
        json.dump(affiliations, f, separators=(",", ":"), ensure_ascii=False)
    # Record the DBLP file mtime for freshness checks
    with open(_mtime_file(extract_dir), "w") as f:
        f.write(str(os.path.getmtime(dblp_file)))
    sz_p = os.path.getsize(papers_path) // 1024 // 1024
    sz_a = os.path.getsize(affiliations_path) // 1024 // 1024
    logger.info(f" → {papers_path} ({sz_p} MB)")
    logger.info(f" → {affiliations_path} ({sz_a} MB)")
    return papers_path, affiliations_path
|
Load the pre-extracted papers index.
Returns dict: conf (str) → year_str (str) → list of paper dicts.
Each paper dict has keys: title, authors, doi, dblp_key.
Source code in src/utils/dblp_extract.py
239
240
241
242
243
244
245
246
247
248
249
def load_papers_by_venue(repo_root: str | None = None) -> dict[str, dict[str, list[dict]]]:
    """Load the pre-extracted papers index.

    Returns dict: conf (str) → year_str (str) → list of paper dicts.
    Each paper dict has keys: title, authors, doi, dblp_key.
    Returns an empty dict if the extraction has not been run yet.
    """
    path = os.path.join(_extract_dir(repo_root), "papers_by_venue.json")
    if not os.path.exists(path):
        return {}
    # The file is written as UTF-8 with ensure_ascii=False, so decode it as
    # UTF-8 explicitly instead of relying on the platform default encoding
    # (which breaks on Windows / non-UTF-8 locales).
    with open(path, encoding="utf-8") as f:
        result: dict[str, dict[str, list[dict]]] = json.load(f)
    return result
|
Load the pre-extracted author → affiliation mapping.
Returns dict: author_name (str) → affiliation (str).
Source code in src/utils/dblp_extract.py
253
254
255
256
257
258
259
260
261
262
def load_affiliations(repo_root: str | None = None) -> dict[str, str]:
    """Load the pre-extracted author → affiliation mapping.

    Returns dict: author_name (str) → affiliation (str).
    Returns an empty dict if the extraction has not been run yet.
    """
    path = os.path.join(_extract_dir(repo_root), "affiliations.json")
    if not os.path.exists(path):
        return {}
    # Written as UTF-8 with ensure_ascii=False — read it back with an
    # explicit encoding rather than the platform default.
    with open(path, encoding="utf-8") as f:
        result: dict[str, str] = json.load(f)
    return result
|
Look up an author's affiliation from the pre-extracted DBLP data.
Tries exact match, then case-insensitive. Returns the affiliation
string or None.
Source code in src/utils/dblp_extract.py
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
def find_affiliation(name: str, repo_root: str | None = None) -> str | None:
    """Look up an author's affiliation from the pre-extracted DBLP data.

    Tries an exact match first, then a case-insensitive fallback.
    The mapping is loaded lazily on first call and cached at module level.

    Args:
        name: Author name as it appears in DBLP.
        repo_root: Optional repository root used to locate the cache dir.

    Returns:
        The affiliation string, or ``None`` if the author is unknown or
        no extracted data is available.
    """
    global _affiliations_cache, _affiliations_lower_cache
    if _affiliations_cache is None:
        _affiliations_cache = load_affiliations(repo_root)
        # Pre-build the lowercased index once, alongside the main cache.
        _affiliations_lower_cache = {k.lower(): v for k, v in _affiliations_cache.items()}
    if not _affiliations_cache:
        return None
    # Exact match
    if name in _affiliations_cache:
        return _affiliations_cache[name]
    # Case-insensitive fallback
    assert _affiliations_lower_cache is not None
    return _affiliations_lower_cache.get(name.lower())
|
Convenience: return list of paper dicts for a conference/year.
Falls back to empty list if data is not available.
Source code in src/utils/dblp_extract.py
291
292
293
294
295
296
def papers_for_venue_year(conf: str, year: int, repo_root: str | None = None) -> list[dict]:
    """Convenience lookup: all paper dicts for one conference/year.

    Returns an empty list when the extracted data, the conference, or the
    requested year is not available.
    """
    by_venue = load_papers_by_venue(repo_root)
    years_for_conf = by_venue.get(conf, {})
    return years_for_conf.get(str(year), [])
|
Return dict: (conf, year_int) → paper_count.
Source code in src/utils/dblp_extract.py
300
301
302
303
304
305
306
def paper_count_by_venue_year(repo_root: str | None = None) -> dict[tuple[str, int], int]:
    """Return dict: (conf, year_int) → paper_count.

    Returns an empty dict when no extracted data is available.
    """
    data = load_papers_by_venue(repo_root)
    # Years are stored as strings in the JSON index; convert to int keys here.
    return {
        (conf, int(year_str)): len(paper_list)
        for conf, years in data.items()
        for year_str, paper_list in years.items()
    }
|