Skip to content

Commit

Permalink
Fix proceedings filename
Browse files Browse the repository at this point in the history
The basename in the RSS feed changed from session 20138 on, from
NNNNN-data.xml to NNNNN.xml.
Handle both situations.
  • Loading branch information
oaubert committed May 16, 2024
1 parent 8c7b23e commit 9e7b211
Showing 1 changed file with 8 additions and 2 deletions.
10 changes: 8 additions & 2 deletions optv/parliaments/DE/scraper/fetch_proceedings.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import lxml.html
import os
from pathlib import Path
import re
import sys
import urllib.request
import urllib3
Expand Down Expand Up @@ -49,8 +50,13 @@ def download_plenary_protocols(destination_dir: str, fullscan: bool = False, per
link_href = link.attrib["href"]
link_count += 1
basename = os.path.basename(link_href)
# Rename NNNN-data.xml to NNNN-proceedings.xml to be more clear
basename = basename.replace('-data.xml', '-proceedings.xml')
# Get session id from filename.
# The basename is either NNNNN-data.xml or NNNNN.xml (from 20138 on)
ids = re.findall(r'^(\d+)', basename)
if not ids:
raise ValueError(f"Invalid filename {basename} - cannot extract session id")
session_id = ids[0]
basename = f"{session_id}-proceedings.xml"
filename = dest / basename
if filename.exists():
# Existing file.
Expand Down

0 comments on commit 9e7b211

Please sign in to comment.