diff --git a/.github/workflows-templates/build_and_publish.yml b/.github/workflows/build_and_publish.yml similarity index 92% rename from .github/workflows-templates/build_and_publish.yml rename to .github/workflows/build_and_publish.yml index f1bc847..d28f9c8 100644 --- a/.github/workflows-templates/build_and_publish.yml +++ b/.github/workflows/build_and_publish.yml @@ -13,6 +13,8 @@ on: push: branches: ["main"] workflow_dispatch: + schedule: + - cron: '0 9 * * *' jobs: @@ -31,8 +33,11 @@ jobs: run: | export PATH="/root/.local/bin:$PATH" script/test + project process-current-year dataset build --all - dataset version auto --auto-ban major --all --publish + dataset version static --auto-ban major --all --publish + project remove-current-year-parquets + - name: Push new data id: auto-commit-action diff --git a/.github/workflows-templates/test.yml b/.github/workflows/test.yml similarity index 100% rename from .github/workflows-templates/test.yml rename to .github/workflows/test.yml diff --git a/.gitignore b/.gitignore index 43cfa41..b5cca77 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,7 @@ data/private/* _render/_parts _render/_papermills docs/_site -UNKNOWN.egg-info \ No newline at end of file +UNKNOWN.egg-info +data/interim/results/ +data/process +data/packages/parliamentary_motions/*.parquet \ No newline at end of file diff --git a/data/packages/parliamentary_motions/agreements.resource.yaml b/data/packages/parliamentary_motions/agreements.resource.yaml new file mode 100644 index 0000000..188aa22 --- /dev/null +++ b/data/packages/parliamentary_motions/agreements.resource.yaml @@ -0,0 +1,80 @@ +title: Agreements +description: dataset of agreements extracted from parliamentary debates +custom: + row_count: 3234 +path: agreements.parquet +name: agreements +profile: data-resource +scheme: file +format: parquet +hashing: md5 +encoding: utf-8 +schema: + fields: + - name: gid + type: string + description: unique identifier for the agreement - debate gid plus 
paragraph pid + constraints: + unique: true + example: uk.org.publicwhip/debate/2019-01-08b.277.4.2 + - name: date + type: string + description: date of the debate + constraints: + unique: false + example: '2019-01-08' + - name: major_heading_id + type: string + description: ID of the major heading (if relevant) + constraints: + unique: false + example: uk.org.publicwhip/debate/2019-01-08b.211.0 + - name: minor_heading_id + type: string + description: ID of the minor heading (if relevant) + constraints: + unique: false + example: '' + - name: speech_id + type: string + description: ID of the speech containing the agreement + constraints: + unique: false + example: uk.org.publicwhip/debate/2019-01-08b.277.4 + - name: paragraph_pid + type: string + description: paragraph ID of the agreement + constraints: + unique: false + example: a100.1/12 + - name: agreed_text + type: string + description: Text that contains the agreement + constraints: + unique: false + example: '(2) That a further day not later than 5 August be allotted for the consideration + of the following Estimate for financial year 2021-22: Foreign, Commonwealth + and Development Office, insofar as it relates to the spending of the Foreign, + Commonwealth and Development Office on Official Development Assistance and the + British Council.—(David Rutley.) Question agreed to.' 
+ - name: motion_title + type: string + description: Title of the motion + constraints: + unique: false + example: '' + - name: motion_gid + type: string + description: ID of the motion + constraints: + unique: true + example: uk.org.publicwhip/debate/2019-01-08b.277.4.3 + - name: chamber + type: string + description: Chamber in which the agreement was made + constraints: + unique: false + enum: + - house-of-commons + example: house-of-commons +hash: 13aa74d2792ce8e82708d4dcb4bab4dd diff --git a/data/packages/parliamentary_motions/datapackage.yaml b/data/packages/parliamentary_motions/datapackage.yaml new file mode 100644 index 0000000..cf9f1c2 --- /dev/null +++ b/data/packages/parliamentary_motions/datapackage.yaml @@ -0,0 +1,46 @@ +name: parliamentary_motions +title: Parliamentary motions +description : | + Motions and agreements extracted from parliamentary debates +version: 0.1.0 +licenses: + + - name: CC-BY-4.0 + path: https://creativecommons.org/licenses/by/4.0/ + title: Creative Commons Attribution 4.0 International License + +contributors: + - title: mySociety + path: https://mysociety.org + role: author + + +custom: + build: "parl_motion_detector.process:move_to_package" + tests: + - test_parliamentary_motions + dataset_order: 0 + download_options: + gate: default + survey: default + header_text: default + formats: + csv: true + parquet: true + gpkg: false + geojson: false + is_geodata: false + composite: + xlsx: + include: all + exclude: none + render: true + sqlite: + include: all + exclude: none + render: true + json: + include: all + exclude: none + render: true + diff --git a/data/packages/parliamentary_motions/division-links.resource.yaml b/data/packages/parliamentary_motions/division-links.resource.yaml new file mode 100644 index 0000000..e8f6f4b --- /dev/null +++ b/data/packages/parliamentary_motions/division-links.resource.yaml @@ -0,0 +1,34 @@ +title: Division Links +description: Lookup between GID for a division and the relevant motion +custom: 
+ row_count: 1094 +path: division-links.parquet +name: division-links +profile: data-resource +scheme: file +format: parquet +hashing: md5 +encoding: utf-8 +schema: + fields: + - name: division_gid + type: string + description: GID of the division + constraints: + unique: true + example: uk.org.publicwhip/debate/2019-01-08b.243.1 + - name: motion_gid + type: string + description: GID of the motion + constraints: + unique: true + example: uk.org.publicwhip/debate/2019-01-08b.212.3.1 + - name: chamber + type: string + description: Chamber in which the motion was made + constraints: + unique: false + enum: + - house-of-commons + example: house-of-commons +hash: 4f802b3551fbc111caa68a36d6d2eddc diff --git a/data/packages/parliamentary_motions/motions.resource.yaml b/data/packages/parliamentary_motions/motions.resource.yaml new file mode 100644 index 0000000..d9c758b --- /dev/null +++ b/data/packages/parliamentary_motions/motions.resource.yaml @@ -0,0 +1,54 @@ +title: Motions +description: Motions extracted from parliamentary debates +custom: + row_count: 4328 +path: motions.parquet +name: motions +profile: data-resource +scheme: file +format: parquet +hashing: md5 +encoding: utf-8 +schema: + fields: + - name: gid + type: string + description: unique identifier for the motion - debate gid plus paragraph pid + constraints: + unique: true + example: uk.org.publicwhip/debate/2019-01-08b.212.3.1 + - name: speech_id + type: string + description: ID of the speech containing the motion (or the first entry) + constraints: + unique: false + example: uk.org.publicwhip/debate/2019-01-08b.212.3 + - name: date + type: string + description: date of the debate + constraints: + unique: false + example: '2019-01-08' + - name: motion_title + type: string + description: Title of the motion + constraints: + unique: false + example: '' + - name: motion_text + type: string + description: Text of the motion + constraints: + unique: false + example: "After Clause 46 - Register of members: 
information to be included and\ + \ powers to obtain it\nQuestion put, That amendment (a) to Lords amendment 23\ + \ be made." + - name: chamber + type: string + description: Chamber in which the motion was made + constraints: + unique: false + enum: + - house-of-commons + example: house-of-commons +hash: 933981b426e92431d3f5171e08d055d9 diff --git a/data/processed/parquet/house-of-commons-2019-agreements.parquet b/data/processed/parquet/house-of-commons-2019-agreements.parquet new file mode 100644 index 0000000..aa0bace Binary files /dev/null and b/data/processed/parquet/house-of-commons-2019-agreements.parquet differ diff --git a/data/processed/parquet/house-of-commons-2019-division-links.parquet b/data/processed/parquet/house-of-commons-2019-division-links.parquet new file mode 100644 index 0000000..ad8692c Binary files /dev/null and b/data/processed/parquet/house-of-commons-2019-division-links.parquet differ diff --git a/data/processed/parquet/house-of-commons-2019-motions.parquet b/data/processed/parquet/house-of-commons-2019-motions.parquet new file mode 100644 index 0000000..c8c181e Binary files /dev/null and b/data/processed/parquet/house-of-commons-2019-motions.parquet differ diff --git a/data/processed/parquet/house-of-commons-2020-agreements.parquet b/data/processed/parquet/house-of-commons-2020-agreements.parquet new file mode 100644 index 0000000..8fe08dd Binary files /dev/null and b/data/processed/parquet/house-of-commons-2020-agreements.parquet differ diff --git a/data/processed/parquet/house-of-commons-2020-division-links.parquet b/data/processed/parquet/house-of-commons-2020-division-links.parquet new file mode 100644 index 0000000..43a999c Binary files /dev/null and b/data/processed/parquet/house-of-commons-2020-division-links.parquet differ diff --git a/data/processed/parquet/house-of-commons-2020-motions.parquet b/data/processed/parquet/house-of-commons-2020-motions.parquet new file mode 100644 index 0000000..1a08f87 Binary files /dev/null and 
b/data/processed/parquet/house-of-commons-2020-motions.parquet differ diff --git a/data/processed/parquet/house-of-commons-2021-agreements.parquet b/data/processed/parquet/house-of-commons-2021-agreements.parquet new file mode 100644 index 0000000..f2e74a4 Binary files /dev/null and b/data/processed/parquet/house-of-commons-2021-agreements.parquet differ diff --git a/data/processed/parquet/house-of-commons-2021-division-links.parquet b/data/processed/parquet/house-of-commons-2021-division-links.parquet new file mode 100644 index 0000000..735f86d Binary files /dev/null and b/data/processed/parquet/house-of-commons-2021-division-links.parquet differ diff --git a/data/processed/parquet/house-of-commons-2021-motions.parquet b/data/processed/parquet/house-of-commons-2021-motions.parquet new file mode 100644 index 0000000..02027a4 Binary files /dev/null and b/data/processed/parquet/house-of-commons-2021-motions.parquet differ diff --git a/data/processed/parquet/house-of-commons-2022-agreements.parquet b/data/processed/parquet/house-of-commons-2022-agreements.parquet new file mode 100644 index 0000000..710385a Binary files /dev/null and b/data/processed/parquet/house-of-commons-2022-agreements.parquet differ diff --git a/data/processed/parquet/house-of-commons-2022-division-links.parquet b/data/processed/parquet/house-of-commons-2022-division-links.parquet new file mode 100644 index 0000000..ae31cd4 Binary files /dev/null and b/data/processed/parquet/house-of-commons-2022-division-links.parquet differ diff --git a/data/processed/parquet/house-of-commons-2022-motions.parquet b/data/processed/parquet/house-of-commons-2022-motions.parquet new file mode 100644 index 0000000..fa072de Binary files /dev/null and b/data/processed/parquet/house-of-commons-2022-motions.parquet differ diff --git a/data/processed/parquet/house-of-commons-2023-agreements.parquet b/data/processed/parquet/house-of-commons-2023-agreements.parquet new file mode 100644 index 0000000..43487cb Binary files 
/dev/null and b/data/processed/parquet/house-of-commons-2023-agreements.parquet differ diff --git a/data/processed/parquet/house-of-commons-2023-division-links.parquet b/data/processed/parquet/house-of-commons-2023-division-links.parquet new file mode 100644 index 0000000..e5c0178 Binary files /dev/null and b/data/processed/parquet/house-of-commons-2023-division-links.parquet differ diff --git a/data/processed/parquet/house-of-commons-2023-motions.parquet b/data/processed/parquet/house-of-commons-2023-motions.parquet new file mode 100644 index 0000000..759e12a Binary files /dev/null and b/data/processed/parquet/house-of-commons-2023-motions.parquet differ diff --git a/docs/index.md b/docs/index.md index 7501521..3becebc 100644 --- a/docs/index.md +++ b/docs/index.md @@ -3,6 +3,6 @@ title: "Download parl_motion_detector" layout: datasets/front --- -# Dataset download site title +# Parliamentary motion detection -Here is some text underneath that. \ No newline at end of file +Extracted motions and agreements. 
\ No newline at end of file diff --git a/notebooks/matchup.ipynb b/notebooks/matchup.ipynb index 4c44373..2cc3d36 100644 --- a/notebooks/matchup.ipynb +++ b/notebooks/matchup.ipynb @@ -24,6 +24,7 @@ "T = TypeVar(\"T\")\n", "\n", "data_dir = Path(\"..\", \"data\")\n", + "\n", "tests_path = Path(\"..\", \"data\", \"tests\", \"mapper\")" ] }, @@ -276,7 +277,102 @@ "2024-08-25\n", "2024-08-26\n", "2024-08-27\n", - "2024-08-28\n" + "2024-08-28\n", + "2024-08-29\n", + "2024-08-30\n", + "2024-08-31\n", + "2024-09-01\n", + "2024-09-02\n", + "2024-09-03\n", + "2024-09-04\n", + "2024-09-05\n", + "2024-09-06\n", + "2024-09-07\n", + "2024-09-08\n", + "2024-09-09\n", + "2024-09-10\n", + "2024-09-11\n", + "2024-09-12\n", + "2024-09-13\n", + "2024-09-14\n", + "2024-09-15\n", + "2024-09-16\n", + "2024-09-17\n", + "2024-09-18\n", + "2024-09-19\n", + "2024-09-20\n", + "2024-09-21\n", + "2024-09-22\n", + "2024-09-23\n", + "2024-09-24\n", + "2024-09-25\n", + "2024-09-26\n", + "2024-09-27\n", + "2024-09-28\n", + "2024-09-29\n", + "2024-09-30\n", + "2024-10-01\n", + "2024-10-02\n", + "2024-10-03\n", + "2024-10-04\n", + "2024-10-05\n", + "2024-10-06\n", + "2024-10-07\n", + "2024-10-08\n", + "2024-10-09\n", + "2024-10-10\n", + "2024-10-11\n", + "2024-10-12\n", + "2024-10-13\n", + "2024-10-14\n" + ] + }, + { + "data": { + "text/html": [ + "
Agreement uk.org.publicwhip/debate/2024-10-14c.673.0.12 not assigned - no relevant motions.\n",
+       "
\n" + ], + "text/plain": [ + "Agreement uk.org.publicwhip/debate/\u001b[1;36m2024\u001b[0m-\u001b[1;36m10\u001b[0m-14c.\u001b[1;36m673.0\u001b[0m.\u001b[1;36m12\u001b[0m not assigned - no relevant motions.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-10-15\n", + "2024-10-16\n", + "2024-10-17\n", + "2024-10-18\n", + "2024-10-19\n", + "2024-10-20\n", + "2024-10-21\n", + "2024-10-22\n", + "2024-10-23\n", + "2024-10-24\n", + "2024-10-25\n", + "2024-10-26\n", + "2024-10-27\n", + "2024-10-28\n", + "2024-10-29\n", + "2024-10-30\n", + "2024-10-31\n", + "2024-11-01\n", + "2024-11-02\n", + "2024-11-03\n", + "2024-11-04\n", + "2024-11-05\n", + "2024-11-06\n", + "2024-11-07\n", + "2024-11-08\n", + "2024-11-09\n", + "2024-11-10\n", + "2024-11-11\n", + "2024-11-12\n" ] } ], @@ -296,7 +392,8 @@ "\"\"\"\n", "\n", "year = 2024\n", - "current_date = datetime.date(2024, 11, 12)\n", + "current_date = datetime.datetime.now().date()\n", + "chamber = Transcript.Chamber.COMMONS\n", "# all dates in year to date\n", "dates_in_year = [\n", " datetime.date(year, 1, 1) + datetime.timedelta(days=i) for i in range(365)\n", @@ -307,7 +404,6 @@ "# dates_in_year = [\"2023-10-25\"]\n", "\n", "for debate_date in dates_in_year:\n", - " print(debate_date)\n", " try:\n", " transcript_path = get_latest_for_date(\n", " datetime.date.fromisoformat(debate_date), download_path=data_dir\n", @@ -321,228 +417,127 @@ " transcript_path.write_text(txt)\n", " transcript = Transcript.from_xml_path(transcript_path)\n", "\n", - " mm = mapper.MotionMapper(transcript, debate_date, data_dir)\n", + " mm = mapper.MotionMapper(\n", + " transcript, debate_date=debate_date, data_dir=data_dir, chamber=chamber\n", + " )\n", "\n", - " mm.assign()" + " mm.assign()\n", + " results = mm.export()\n", + " results.to_data_dir(data_dir / \"interim\" / \"results\")" ] }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 2, + 
"metadata": {}, + "outputs": [], + "source": [ + "rh = mapper.ResultsHolder.from_data_dir_composite(\n", + " data_dir / \"interim\" / \"results\", date=\"2024\", chamber=Transcript.Chamber.COMMONS\n", + ")\n", + "\n", + "rh.export(data_dir / \"processed\" / \"parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
{\n",
-       "    'division_motions': {\n",
-       "        'uk.org.publicwhip/debate/2020-12-10b.1059.0': Motion(\n",
-       "            date='2020-12-10',\n",
-       "            motion_title='After Clause 1 - COMMON FRAMEWORKS PROCESS',\n",
-       "            major_heading_id='uk.org.publicwhip/debate/2020-12-10b.1037.0',\n",
-       "            minor_heading_id='uk.org.publicwhip/debate/2020-12-10b.1037.3',\n",
-       "            major_heading_title='United Kingdom Internal Market Bill',\n",
-       "            minor_heading_title='After Clause 1 - COMMON FRAMEWORKS PROCESS',\n",
-       "            speech_start_pid='b1037.5/2',\n",
-       "            speech_id='uk.org.publicwhip/debate/2020-12-10b.1037.5',\n",
-       "            final_speech_id='uk.org.publicwhip/debate/2020-12-10b.1037.5',\n",
-       "            end_reason='Valid end character',\n",
-       "            motion_lines=[\n",
-       "                'That this House agrees with the Lords in their amendments 8B, 8C, 8D, 8F, \n",
-       "8G, 8H, 8J and 8K, but disagrees with the Lords in their amendment 8L, insists on its \n",
-       "disagreement with the Lords in their amendments 13 and 56, and proposes amendment (a) to the \n",
-       "Bill in lieu of Lords amendments 8L, 13 and 56.',\n",
-       "                'That this House insists on its disagreement with the Lords in their \n",
-       "amendments 14 and 52 to 54 but does not insist on its disagreement with the Lords in their \n",
-       "amendment 55.'\n",
-       "            ],\n",
-       "            flags=[],\n",
-       "            gid='uk.org.publicwhip/debate/2020-12-10b.1037.5.2'\n",
-       "        ),\n",
-       "        'uk.org.publicwhip/debate/2020-12-10b.1064.0': Motion(\n",
-       "            date='2020-12-10',\n",
-       "            motion_title='After Clause 1 - COMMON FRAMEWORKS PROCESS',\n",
-       "            major_heading_id='uk.org.publicwhip/debate/2020-12-10b.1037.0',\n",
-       "            minor_heading_id='uk.org.publicwhip/debate/2020-12-10b.1037.3',\n",
-       "            major_heading_title='United Kingdom Internal Market Bill',\n",
-       "            minor_heading_title='After Clause 1 - COMMON FRAMEWORKS PROCESS',\n",
-       "            speech_start_pid='b1059.1/14',\n",
-       "            speech_id='uk.org.publicwhip/debate/2020-12-10b.1059.1',\n",
-       "            final_speech_id='uk.org.publicwhip/debate/2020-12-10b.1059.1',\n",
-       "            end_reason='amendment closed with name',\n",
-       "            motion_lines=[\n",
-       "                'Motion made, and Question put,',\n",
-       "                'That this House disagrees with the Lords in their Amendments Nos. 48B and \n",
-       "48C.—(David Duguid.)'\n",
-       "            ],\n",
-       "            flags=[],\n",
-       "            gid='uk.org.publicwhip/debate/2020-12-10b.1059.1.14'\n",
-       "        ),\n",
-       "        'uk.org.publicwhip/debate/2020-12-10b.1055.0': Motion(\n",
-       "            date='2020-12-10',\n",
-       "            motion_title='After Clause 1 - COMMON FRAMEWORKS PROCESS',\n",
-       "            major_heading_id='uk.org.publicwhip/debate/2020-12-10b.1037.0',\n",
-       "            minor_heading_id='uk.org.publicwhip/debate/2020-12-10b.1037.3',\n",
-       "            major_heading_title='United Kingdom Internal Market Bill',\n",
-       "            minor_heading_title='After Clause 1 - COMMON FRAMEWORKS PROCESS',\n",
-       "            speech_start_pid='b1055.1/3',\n",
-       "            speech_id='uk.org.publicwhip/debate/2020-12-10b.1055.1',\n",
-       "            final_speech_id='uk.org.publicwhip/debate/2020-12-10b.1055.1',\n",
-       "            end_reason='Valid end character',\n",
-       "            motion_lines=[\n",
-       "                'Resolved,',\n",
-       "                'That this House disagrees with the Lords in their Amendments Nos. 1B, 1C and\n",
-       "1D.'\n",
-       "            ],\n",
-       "            flags=['after_decision'],\n",
-       "            gid='uk.org.publicwhip/debate/2020-12-10b.1055.1.3'\n",
-       "        )\n",
-       "    },\n",
-       "    'agreement_motions': {\n",
-       "        'uk.org.publicwhip/debate/2020-12-10b.1100.0.2': Motion(\n",
-       "            date='2020-12-10',\n",
-       "            motion_title='The Future of the High Street',\n",
-       "            major_heading_id='uk.org.publicwhip/debate/2020-12-10b.1070.0',\n",
-       "            minor_heading_id='',\n",
-       "            major_heading_title='The Future of the High Street',\n",
-       "            minor_heading_title='None',\n",
-       "            speech_start_pid='b1070.1/1',\n",
-       "            speech_id='uk.org.publicwhip/debate/2020-12-10b.1070.1',\n",
-       "            final_speech_id='uk.org.publicwhip/debate/2020-12-10b.1070.1',\n",
-       "            end_reason='Valid end character',\n",
-       "            motion_lines=[\n",
-       "                'I beg to move,',\n",
-       "                'That this House has considered the future of the high street.'\n",
-       "            ],\n",
-       "            flags=['main_question'],\n",
-       "            gid='uk.org.publicwhip/debate/2020-12-10b.1070.1.1'\n",
-       "        ),\n",
-       "        'uk.org.publicwhip/debate/2020-12-10b.1106.4.2': Motion(\n",
-       "            date='2020-12-10',\n",
-       "            motion_title='Adjournment Debate: Encouragement of Terrorism Offences',\n",
-       "            major_heading_id='uk.org.publicwhip/debate/2020-12-10b.1101.0',\n",
+       "
ResultsHolder(\n",
+       "    date='2024-01-08',\n",
+       "    chamber=<Chamber.COMMONS: 'house-of-commons'>,\n",
+       "    division_motions=[],\n",
+       "    agreement_motions=[\n",
+       "        Agreement(\n",
+       "            date='2024-01-08',\n",
+       "            speech_id='uk.org.publicwhip/debate/2024-01-08d.132.1',\n",
+       "            major_heading_id='uk.org.publicwhip/debate/2024-01-08d.120.0',\n",
        "            minor_heading_id='',\n",
-       "            major_heading_title='Encouragement of Terrorism Offences',\n",
-       "            minor_heading_title='None',\n",
-       "            speech_start_pid='b1101.1/1',\n",
-       "            speech_id='uk.org.publicwhip/debate/2020-12-10b.1101.1',\n",
-       "            final_speech_id='uk.org.publicwhip/debate/2020-12-10b.1101.1',\n",
-       "            end_reason='one line motion',\n",
-       "            motion_lines=[\n",
-       "                'Motion made, and Question proposed, That this House do now adjourn.—(James \n",
-       "Morris.)'\n",
-       "            ],\n",
-       "            flags=['one_line_motion'],\n",
-       "            gid='uk.org.publicwhip/debate/2020-12-10b.1101.1.1'\n",
+       "            paragraph_pid='d132.1/3',\n",
+       "            end_reason='one_line_agreement',\n",
+       "            agreement_pid='',\n",
+       "            agreed_text='Question put and agreed to.',\n",
+       "            preceeding_text='I have campaigned on Gilsland for only 14 years; in \n",
+       "Herefordshire, there is Pontrilas and there are other stations—whether they were killed by Dr\n",
+       "Beeching or others down the years—that are sought as an opportunity for a reopening of our \n",
+       "railway infrastructure. As we have seen with the Waverley line in Scotland, there is a \n",
+       "definite desire for such railways to be reinvigorated and for new stations to return. Without\n",
+       "a shadow of a doubt, I will personally take the issue up with the trains Minister.',\n",
+       "            after_text='House adjourned.',\n",
+       "            motion=Motion(\n",
+       "                date='2024-01-08',\n",
+       "                motion_title='Adjournment Debate: Rural Transport',\n",
+       "                major_heading_id='uk.org.publicwhip/debate/2024-01-08d.120.0',\n",
+       "                minor_heading_id='',\n",
+       "                major_heading_title='Rural Transport',\n",
+       "                minor_heading_title='None',\n",
+       "                speech_start_pid='d120.1/1',\n",
+       "                speech_id='uk.org.publicwhip/debate/2024-01-08d.120.1',\n",
+       "                final_speech_id='uk.org.publicwhip/debate/2024-01-08d.120.1',\n",
+       "                end_reason='one line motion',\n",
+       "                motion_lines=[\n",
+       "                    'Motion made, and Question proposed, That this House do now \n",
+       "adjourn.—(Suzanne Webb.)'\n",
+       "                ],\n",
+       "                flags=['one_line_motion'],\n",
+       "                gid='uk.org.publicwhip/debate/2024-01-08d.120.1.1'\n",
+       "            ),\n",
+       "            motion_assignment_reason='single motion and decision',\n",
+       "            gid='uk.org.publicwhip/debate/2024-01-08d.132.1.3'\n",
        "        )\n",
-       "    }\n",
-       "}\n",
+       "    ]\n",
+       ")\n",
        "
\n" ], "text/plain": [ - "\u001b[1m{\u001b[0m\n", - " \u001b[32m'division_motions'\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1059.0'\u001b[0m: \u001b[1;35mMotion\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdate\u001b[0m=\u001b[32m'2020-12-10'\u001b[0m,\n", - " \u001b[33mmotion_title\u001b[0m=\u001b[32m'After Clause 1 - COMMON FRAMEWORKS PROCESS'\u001b[0m,\n", - " \u001b[33mmajor_heading_id\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1037.0'\u001b[0m,\n", - " \u001b[33mminor_heading_id\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1037.3'\u001b[0m,\n", - " \u001b[33mmajor_heading_title\u001b[0m=\u001b[32m'United Kingdom Internal Market Bill'\u001b[0m,\n", - " \u001b[33mminor_heading_title\u001b[0m=\u001b[32m'After Clause 1 - COMMON FRAMEWORKS PROCESS'\u001b[0m,\n", - " \u001b[33mspeech_start_pid\u001b[0m=\u001b[32m'b1037.5/2'\u001b[0m,\n", - " \u001b[33mspeech_id\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1037.5'\u001b[0m,\n", - " \u001b[33mfinal_speech_id\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1037.5'\u001b[0m,\n", - " \u001b[33mend_reason\u001b[0m=\u001b[32m'Valid end character'\u001b[0m,\n", - " \u001b[33mmotion_lines\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[32m'That this House agrees with the Lords in their amendments 8B, 8C, 8D, 8F, \u001b[0m\n", - "\u001b[32m8G, 8H, 8J and 8K, but disagrees with the Lords in their amendment 8L, insists on its \u001b[0m\n", - "\u001b[32mdisagreement with the Lords in their amendments 13 and 56, and proposes amendment \u001b[0m\u001b[32m(\u001b[0m\u001b[32ma\u001b[0m\u001b[32m)\u001b[0m\u001b[32m to the \u001b[0m\n", - "\u001b[32mBill in lieu of Lords amendments 8L, 13 and 56.'\u001b[0m,\n", - " \u001b[32m'That this House insists on its disagreement with the Lords in their \u001b[0m\n", - "\u001b[32mamendments 14 and 52 to 54 but does not insist on its disagreement with the Lords in their \u001b[0m\n", - "\u001b[32mamendment 
55.'\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33mflags\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33mgid\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1037.5.2'\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1064.0'\u001b[0m: \u001b[1;35mMotion\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdate\u001b[0m=\u001b[32m'2020-12-10'\u001b[0m,\n", - " \u001b[33mmotion_title\u001b[0m=\u001b[32m'After Clause 1 - COMMON FRAMEWORKS PROCESS'\u001b[0m,\n", - " \u001b[33mmajor_heading_id\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1037.0'\u001b[0m,\n", - " \u001b[33mminor_heading_id\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1037.3'\u001b[0m,\n", - " \u001b[33mmajor_heading_title\u001b[0m=\u001b[32m'United Kingdom Internal Market Bill'\u001b[0m,\n", - " \u001b[33mminor_heading_title\u001b[0m=\u001b[32m'After Clause 1 - COMMON FRAMEWORKS PROCESS'\u001b[0m,\n", - " \u001b[33mspeech_start_pid\u001b[0m=\u001b[32m'b1059.1/14'\u001b[0m,\n", - " \u001b[33mspeech_id\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1059.1'\u001b[0m,\n", - " \u001b[33mfinal_speech_id\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1059.1'\u001b[0m,\n", - " \u001b[33mend_reason\u001b[0m=\u001b[32m'amendment closed with name'\u001b[0m,\n", - " \u001b[33mmotion_lines\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[32m'Motion made, and Question put,'\u001b[0m,\n", - " \u001b[32m'That this House disagrees with the Lords in their Amendments Nos. 
48B and \u001b[0m\n", - "\u001b[32m48C.—\u001b[0m\u001b[32m(\u001b[0m\u001b[32mDavid Duguid.\u001b[0m\u001b[32m)\u001b[0m\u001b[32m'\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33mflags\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33mgid\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1059.1.14'\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1055.0'\u001b[0m: \u001b[1;35mMotion\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdate\u001b[0m=\u001b[32m'2020-12-10'\u001b[0m,\n", - " \u001b[33mmotion_title\u001b[0m=\u001b[32m'After Clause 1 - COMMON FRAMEWORKS PROCESS'\u001b[0m,\n", - " \u001b[33mmajor_heading_id\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1037.0'\u001b[0m,\n", - " \u001b[33mminor_heading_id\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1037.3'\u001b[0m,\n", - " \u001b[33mmajor_heading_title\u001b[0m=\u001b[32m'United Kingdom Internal Market Bill'\u001b[0m,\n", - " \u001b[33mminor_heading_title\u001b[0m=\u001b[32m'After Clause 1 - COMMON FRAMEWORKS PROCESS'\u001b[0m,\n", - " \u001b[33mspeech_start_pid\u001b[0m=\u001b[32m'b1055.1/3'\u001b[0m,\n", - " \u001b[33mspeech_id\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1055.1'\u001b[0m,\n", - " \u001b[33mfinal_speech_id\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1055.1'\u001b[0m,\n", - " \u001b[33mend_reason\u001b[0m=\u001b[32m'Valid end character'\u001b[0m,\n", - " \u001b[33mmotion_lines\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[32m'Resolved,'\u001b[0m,\n", - " \u001b[32m'That this House disagrees with the Lords in their Amendments Nos. 
1B, 1C and\u001b[0m\n", - "\u001b[32m1D.'\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33mflags\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'after_decision'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33mgid\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1055.1.3'\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m}\u001b[0m,\n", - " \u001b[32m'agreement_motions'\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1100.0.2'\u001b[0m: \u001b[1;35mMotion\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdate\u001b[0m=\u001b[32m'2020-12-10'\u001b[0m,\n", - " \u001b[33mmotion_title\u001b[0m=\u001b[32m'The Future of the High Street'\u001b[0m,\n", - " \u001b[33mmajor_heading_id\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1070.0'\u001b[0m,\n", + "\u001b[1;35mResultsHolder\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdate\u001b[0m=\u001b[32m'2024-01-08'\u001b[0m,\n", + " \u001b[33mchamber\u001b[0m=\u001b[1m<\u001b[0m\u001b[1;95mChamber.COMMONS:\u001b[0m\u001b[39m \u001b[0m\u001b[32m'house-of-commons'\u001b[0m\u001b[1m>\u001b[0m,\n", + " \u001b[33mdivision_motions\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33magreement_motions\u001b[0m=\u001b[1m[\u001b[0m\n", + " \u001b[1;35mAgreement\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdate\u001b[0m=\u001b[32m'2024-01-08'\u001b[0m,\n", + " \u001b[33mspeech_id\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2024-01-08d.132.1'\u001b[0m,\n", + " \u001b[33mmajor_heading_id\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2024-01-08d.120.0'\u001b[0m,\n", " \u001b[33mminor_heading_id\u001b[0m=\u001b[32m''\u001b[0m,\n", - " \u001b[33mmajor_heading_title\u001b[0m=\u001b[32m'The Future of the High Street'\u001b[0m,\n", - " \u001b[33mminor_heading_title\u001b[0m=\u001b[32m'None'\u001b[0m,\n", - " \u001b[33mspeech_start_pid\u001b[0m=\u001b[32m'b1070.1/1'\u001b[0m,\n", - " \u001b[33mspeech_id\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1070.1'\u001b[0m,\n", 
- " \u001b[33mfinal_speech_id\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1070.1'\u001b[0m,\n", - " \u001b[33mend_reason\u001b[0m=\u001b[32m'Valid end character'\u001b[0m,\n", - " \u001b[33mmotion_lines\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[32m'I beg to move,'\u001b[0m,\n", - " \u001b[32m'That this House has considered the future of the high street.'\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33mflags\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main_question'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33mgid\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1070.1.1'\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1106.4.2'\u001b[0m: \u001b[1;35mMotion\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdate\u001b[0m=\u001b[32m'2020-12-10'\u001b[0m,\n", - " \u001b[33mmotion_title\u001b[0m=\u001b[32m'Adjournment Debate: Encouragement of Terrorism Offences'\u001b[0m,\n", - " \u001b[33mmajor_heading_id\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1101.0'\u001b[0m,\n", - " \u001b[33mminor_heading_id\u001b[0m=\u001b[32m''\u001b[0m,\n", - " \u001b[33mmajor_heading_title\u001b[0m=\u001b[32m'Encouragement of Terrorism Offences'\u001b[0m,\n", - " \u001b[33mminor_heading_title\u001b[0m=\u001b[32m'None'\u001b[0m,\n", - " \u001b[33mspeech_start_pid\u001b[0m=\u001b[32m'b1101.1/1'\u001b[0m,\n", - " \u001b[33mspeech_id\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1101.1'\u001b[0m,\n", - " \u001b[33mfinal_speech_id\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1101.1'\u001b[0m,\n", - " \u001b[33mend_reason\u001b[0m=\u001b[32m'one line motion'\u001b[0m,\n", - " \u001b[33mmotion_lines\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[32m'Motion made, and Question proposed, That this House do now adjourn.—\u001b[0m\u001b[32m(\u001b[0m\u001b[32mJames \u001b[0m\n", - "\u001b[32mMorris.\u001b[0m\u001b[32m)\u001b[0m\u001b[32m'\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " 
\u001b[33mflags\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'one_line_motion'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33mgid\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2020-12-10b.1101.1.1'\u001b[0m\n", + " \u001b[33mparagraph_pid\u001b[0m=\u001b[32m'd132.1/3'\u001b[0m,\n", + " \u001b[33mend_reason\u001b[0m=\u001b[32m'one_line_agreement'\u001b[0m,\n", + " \u001b[33magreement_pid\u001b[0m=\u001b[32m''\u001b[0m,\n", + " \u001b[33magreed_text\u001b[0m=\u001b[32m'Question put and agreed to.'\u001b[0m,\n", + " \u001b[33mpreceeding_text\u001b[0m=\u001b[32m'I have campaigned on Gilsland for only 14 years; in \u001b[0m\n", + "\u001b[32mHerefordshire, there is Pontrilas and there are other stations—whether they were killed by Dr\u001b[0m\n", + "\u001b[32mBeeching or others down the years—that are sought as an opportunity for a reopening of our \u001b[0m\n", + "\u001b[32mrailway infrastructure. As we have seen with the Waverley line in Scotland, there is a \u001b[0m\n", + "\u001b[32mdefinite desire for such railways to be reinvigorated and for new stations to return. 
Without\u001b[0m\n", + "\u001b[32ma shadow of a doubt, I will personally take the issue up with the trains Minister.'\u001b[0m,\n", + " \u001b[33mafter_text\u001b[0m=\u001b[32m'House adjourned.'\u001b[0m,\n", + " \u001b[33mmotion\u001b[0m=\u001b[1;35mMotion\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdate\u001b[0m=\u001b[32m'2024-01-08'\u001b[0m,\n", + " \u001b[33mmotion_title\u001b[0m=\u001b[32m'Adjournment Debate: Rural Transport'\u001b[0m,\n", + " \u001b[33mmajor_heading_id\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2024-01-08d.120.0'\u001b[0m,\n", + " \u001b[33mminor_heading_id\u001b[0m=\u001b[32m''\u001b[0m,\n", + " \u001b[33mmajor_heading_title\u001b[0m=\u001b[32m'Rural Transport'\u001b[0m,\n", + " \u001b[33mminor_heading_title\u001b[0m=\u001b[32m'None'\u001b[0m,\n", + " \u001b[33mspeech_start_pid\u001b[0m=\u001b[32m'd120.1/1'\u001b[0m,\n", + " \u001b[33mspeech_id\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2024-01-08d.120.1'\u001b[0m,\n", + " \u001b[33mfinal_speech_id\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2024-01-08d.120.1'\u001b[0m,\n", + " \u001b[33mend_reason\u001b[0m=\u001b[32m'one line motion'\u001b[0m,\n", + " \u001b[33mmotion_lines\u001b[0m=\u001b[1m[\u001b[0m\n", + " \u001b[32m'Motion made, and Question proposed, That this House do now \u001b[0m\n", + "\u001b[32madjourn.—\u001b[0m\u001b[32m(\u001b[0m\u001b[32mSuzanne Webb.\u001b[0m\u001b[32m)\u001b[0m\u001b[32m'\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[33mflags\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'one_line_motion'\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33mgid\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2024-01-08d.120.1.1'\u001b[0m\n", + " \u001b[1m)\u001b[0m,\n", + " \u001b[33mmotion_assignment_reason\u001b[0m=\u001b[32m'single motion and decision'\u001b[0m,\n", + " \u001b[33mgid\u001b[0m=\u001b[32m'uk.org.publicwhip/debate/2024-01-08d.132.1.3'\u001b[0m\n", " \u001b[1m)\u001b[0m\n", - " \u001b[1m}\u001b[0m\n", - "\u001b[1m}\u001b[0m\n" + " \u001b[1m]\u001b[0m\n", + 
# NOTE: the docstrings below double as the command help text shown by the
# click CLI, so they are kept short and user-facing.


@cli.command()
def refresh_snapshot() -> None:
    """
    Refresh motion snapshots for tests
    """
    generate_all_snapshots()


@cli.command()
def process_current_year() -> None:
    """
    Update data for current year
    """
    render_latest(data_dir)
    move_to_package(data_dir)


@cli.command()
def process_historical() -> None:
    """
    Regenerate parquets for historical information
    """
    render_historical(data_dir)
    move_to_package(data_dir)


@cli.command()
def recreate_package() -> None:
    """
    Just create the overall parquets for packages
    """
    move_to_package(data_dir)


@cli.command()
@click.argument("year", type=int)
def process_year(year: int) -> None:
    """
    Process an arbitrary year
    """
    render_year(data_dir, year=year)
    move_to_package(data_dir)


@cli.command()
def remove_current_year_parquets() -> None:
    """
    Remove all parquets for the current year (so not cached every day)
    """
    delete_current_year_parquets(data_dir)
"__main__": main() diff --git a/src/parl_motion_detector/agreements.py b/src/parl_motion_detector/agreements.py index b2c54b9..dd79604 100644 --- a/src/parl_motion_detector/agreements.py +++ b/src/parl_motion_detector/agreements.py @@ -5,6 +5,7 @@ from pathlib import Path from typing import ( Generic, + Optional, Protocol, TypeVar, ) @@ -68,6 +69,21 @@ class Agreement(HasSpeechAndDate): agreed_text: str preceeding_text: str after_text: str + motion: Optional[Motion] = None + motion_assignment_reason: str = "" + + def flat(self): + return { + "gid": self.gid, + "date": self.date, + "major_heading_id": self.major_heading_id, + "minor_heading_id": self.minor_heading_id, + "speech_id": self.speech_id, + "paragraph_pid": self.paragraph_pid, + "agreed_text": self.agreed_text, + "motion_title": self.motion.motion_title if self.motion else "", + "motion_gid": self.motion.gid if self.motion else "", + } @property def preceeding(self): @@ -77,6 +93,12 @@ def preceeding(self): def after(self): return self.after_text + def motion_speech_id(self): + if self.motion: + return self.motion.gid + else: + return "" + def construct_motion(self): if construct_reading_pass(self.after_text.lower()): motion_lines = [self.agreed_text, self.after_text] @@ -117,6 +139,8 @@ class DivisionHolder(HasSpeechAndDate): paragraph_pid: str = "" preceding_speech: str after_speech: str + motion: Optional[Motion] = None + motion_assignment_reason: str = "" @property def preceeding(self): @@ -126,6 +150,12 @@ def preceeding(self): def after(self): return self.after_speech + def motion_speech_id(self): + if self.motion: + return self.motion.gid + else: + return "" + def construct_motion(self): """ Sometimes (like for clauses) there isn't actually a perfect motion to hold onto @@ -233,7 +263,11 @@ def get_divisions(transcript: Transcript, date_str: str) -> DivisionCollection: else: previous_speech = "" try: - next_speech = str(transcript.items[index + 1]) + next_item = transcript.items[index + 1] + if 
class ResultsHolder(BaseModel):
    """
    Divisions and agreements found for one chamber, each carrying the motion
    it was assigned to (if any).

    ``date`` and ``chamber`` are used verbatim in every file name this class
    writes or reads. ``from_data_dir_composite`` treats ``date`` as a
    file-name prefix, so passing a shorter prefix than a full date merges
    every stored result whose file name starts with it.
    """

    date: str
    chamber: Chamber
    division_motions: list[DivisionHolder] = Field(default_factory=list)
    agreement_motions: list[Agreement] = Field(default_factory=list)

    def export_motions_parquet(self, output_dir: Path):
        """
        Write ``<chamber>-<date>-motions.parquet`` to *output_dir*.

        One row is written per decision that has a motion assigned, so a
        motion shared by several decisions appears several times.
        """
        decisions = chain(self.division_motions, self.agreement_motions)
        all_motions = [x.motion.flat() for x in decisions if x.motion]
        df = pd.DataFrame(all_motions)
        df["chamber"] = self.chamber
        df.to_parquet(output_dir / f"{self.chamber}-{self.date}-motions.parquet")

    def export_divison_links(self, output_dir: Path):
        """
        Write ``<chamber>-<date>-division-links.parquet`` to *output_dir*:
        one row per division, pairing its gid with ``motion_speech_id()``.
        """
        df = pd.DataFrame(
            [
                {
                    "division_gid": x.gid,
                    "motion_gid": x.motion_speech_id(),
                }
                for x in self.division_motions
            ]
        )
        df["chamber"] = self.chamber
        df.to_parquet(output_dir / f"{self.chamber}-{self.date}-division-links.parquet")

    def export_agreements(self, output_dir: Path):
        """
        Write ``<chamber>-<date>-agreements.parquet`` to *output_dir*: one
        row per agreement, using ``Agreement.flat()``.
        """
        df = pd.DataFrame([x.flat() for x in self.agreement_motions])
        df["chamber"] = self.chamber
        df.to_parquet(output_dir / f"{self.chamber}-{self.date}-agreements.parquet")

    def export(self, output_dir: Path):
        """
        Write all three parquet files to *output_dir*, creating it if needed.
        """
        # exist_ok avoids the check-then-create race of exists() + mkdir()
        output_dir.mkdir(parents=True, exist_ok=True)
        self.export_motions_parquet(output_dir)
        self.export_divison_links(output_dir)
        self.export_agreements(output_dir)

    def to_data_dir(self, data_dir: Path):
        """
        Dump this object as ``<chamber>-<date>.json`` in *data_dir*,
        creating the directory if needed.
        """
        data_dir.mkdir(parents=True, exist_ok=True)
        target = data_dir / f"{self.chamber}-{self.date}.json"
        # Explicit UTF-8: the dumped text can contain non-ASCII characters
        # and must not depend on the platform's default encoding.
        target.write_text(self.model_dump_json(indent=2), encoding="utf-8")

    @classmethod
    def from_data_dir(cls, data_dir: Path, date: str, chamber: Chamber):
        """
        Load the single result stored as ``<chamber>-<date>.json``.
        """
        source = data_dir / f"{chamber}-{date}.json"
        return cls.model_validate_json(source.read_text(encoding="utf-8"))

    @classmethod
    def from_data_dir_composite(cls, data_dir: Path, date: str, chamber: Chamber):
        """
        Merge every stored result whose file name starts with
        ``<chamber>-<date>`` into one object carrying *date* and *chamber*.

        Files are read in sorted name order so the merged lists have a
        deterministic order regardless of the filesystem.
        """
        items: list[ResultsHolder] = [
            cls.model_validate_json(file_path.read_text(encoding="utf-8"))
            for file_path in sorted(data_dir.glob(f"{chamber}-{date}*.json"))
        ]

        composite = cls(
            date=date,
            chamber=chamber,
            division_motions=list(
                chain.from_iterable(x.division_motions for x in items)
            ),
            agreement_motions=list(
                chain.from_iterable(x.agreement_motions for x in items)
            ),
        )
        return composite
int: return abs(self.speech_id_map[id_a] - self.speech_id_map[id_b]) @@ -219,18 +294,20 @@ def snapshot(self): # dictionary to use as a snapshot return { "division_motions": { - k: v.speech_id for k, v in self.division_assignments.items() + x.gid: x.motion_speech_id() for x in self.division_assignments }, "agreement_motions": { - k: v.speech_id for k, v in self.agreement_assignments.items() + x.gid: x.motion_speech_id() for x in self.agreement_assignments }, } - def export(self): - return { - "division_motions": self.division_assignments, - "agreement_motions": self.agreement_assignments, - } + def export(self) -> ResultsHolder: + return ResultsHolder( + date=self.debate_date, + chamber=self.chamber, + division_motions=self.division_assignments, + agreement_motions=self.agreement_assignments, + ) def all_items(self): def ordered_speech(gid: str) -> float: @@ -251,11 +328,13 @@ def assign_motion_decision( assignment_reason: str, ): # print(f"Assigning {motion.speech_id} to {decision.speech_id} - {assignment_reason}") + decision.motion = motion + decision.motion_assignment_reason = assignment_reason match decision: case DivisionHolder(): - self.division_assignments[decision.gid] = motion + self.division_assignments.append(decision) case Agreement(): - self.agreement_assignments[decision.gid] = motion + self.agreement_assignments.append(decision) def decision_position(self, decision: DivisionHolder | Agreement) -> int: return self.speech_id_map.get(decision.speech_id, 0) diff --git a/src/parl_motion_detector/motions.py b/src/parl_motion_detector/motions.py index af7f2a3..754c941 100644 --- a/src/parl_motion_detector/motions.py +++ b/src/parl_motion_detector/motions.py @@ -73,6 +73,15 @@ class Motion(BaseModel): motion_lines: list[str] = Field(default_factory=list) flags: list[Flag] = Field(default_factory=list) + def flat(self) -> dict[str, str]: + return { + "gid": self.gid, + "speech_id": self.speech_id, + "date": self.date, + "motion_title": self.motion_title, + 
data_dir = Path(__file__).parent.parent.parent / "data"


def _dates_in_year(year: int, today: datetime.date) -> list[str]:
    """
    Return the ISO date of every day in *year* that is not after *today*.
    """
    first_day = datetime.date(year, 1, 1)
    # Derive the length from the calendar: a fixed 365 would silently drop
    # 31 December in leap years.
    days_in_year = (datetime.date(year + 1, 1, 1) - first_day).days
    all_days = (first_day + datetime.timedelta(days=i) for i in range(days_in_year))
    return [day.isoformat() for day in all_days if day <= today]


def render_year(
    data_dir: Path, year: int | None = None, chamber: Chamber = Chamber.COMMONS
):
    """
    Render motions for a specific year

    For each date of *year* up to today, fetch the transcript, run the
    motion mapper and store its results as JSON under
    ``interim/results``. Dates for which ``get_latest_for_date`` raises
    ``FileNotFoundError`` are skipped. Finally every stored result for the
    year is merged and exported as parquet under ``processed/parquet``.

    *year* defaults to the current year.
    """
    current_date = datetime.datetime.now().date()
    if year is None:
        year = current_date.year
    # all dates in year to date
    dates_in_year = _dates_in_year(year, current_date)

    results_dir = data_dir / "interim" / "results"

    for debate_date in tqdm(dates_in_year, desc=str(year)):
        try:
            transcript_path = get_latest_for_date(
                datetime.date.fromisoformat(debate_date), download_path=data_dir
            )
        except FileNotFoundError:
            continue
        # fix 2019 error: rewrite the file in place when it contains "21 14"
        txt = transcript_path.read_text()
        if "21 14" in txt:
            txt = txt.replace("21 14", "2114")
            transcript_path.write_text(txt)
        transcript = Transcript.from_xml_path(transcript_path)

        mm = MotionMapper(
            transcript, debate_date=debate_date, data_dir=data_dir, chamber=chamber
        )

        mm.assign()
        results = mm.export()
        results.to_data_dir(results_dir)

    # The year string acts as a file-name prefix that collects every daily
    # result written above.
    rh = ResultsHolder.from_data_dir_composite(
        results_dir, date=str(year), chamber=chamber
    )
    rh.export(data_dir / "processed" / "parquet")


def render_historical(data_dir: Path):
    """
    Render motions for all historical dates

    Covers every year from 2019 up to, but not including, the current year.
    """
    current_year = datetime.datetime.now().year
    for year in range(2019, current_year):
        render_year(data_dir, year=year, chamber=Chamber.COMMONS)


def render_latest(data_dir: Path):
    """
    Render motions for the latest date

    Delegates to ``render_year`` with its defaults (the current year).
    """
    render_year(data_dir)


def delete_current_year_parquets(data_dir: Path):
    """
    Delete every file in ``processed/parquet`` whose name contains
    ``-<current year>-``.
    """
    parquet_dir = data_dir / "processed" / "parquet"
    current_year = datetime.datetime.now().year
    current_year_str = f"-{current_year}-"
    for file in parquet_dir.glob(f"*{current_year_str}*"):
        file.unlink()


def move_to_package(data_dir: Path = data_dir):
    """
    Move all processed data to the package

    For each kind of output (agreements, motions, division links) the
    per-period parquet files are concatenated, sorted by their first
    column, de-duplicated and written to the package directory.

    Raises ``ValueError`` when no parquet file exists for a kind of output,
    or when the first column still contains duplicated values after
    identical rows have been removed.
    """
    package_dir = data_dir / "packages" / "parliamentary_motions"
    parquet_dir = data_dir / "processed" / "parquet"

    file_endings = ["agreements.parquet", "motions.parquet", "division-links.parquet"]

    for file_ending in file_endings:
        # sorted() keeps the concatenation order independent of the filesystem
        dfs = [
            pd.read_parquet(file)
            for file in sorted(parquet_dir.glob(f"*-{file_ending}"))
        ]
        if not dfs:
            raise ValueError(
                f"No processed parquet files ending in {file_ending} in {parquet_dir}"
            )

        # A period without any rows is stored with a reduced set of columns.
        # If such a frame came first it would decide the column order and
        # the "first column" logic below would act on the wrong column, so
        # empty frames are left out whenever real data is available.
        non_empty = [df for df in dfs if not df.empty]

        df = pd.concat(non_empty or dfs)
        # sort by first column
        df = df.sort_values(by=df.columns[0])

        # remove duplicate rows
        df = df.drop_duplicates()

        # check there are no duplicated values in the first column

        if df[df.columns[0]].duplicated().sum() != 0:
            raise ValueError("Duplicated values in the first column")

        df.to_parquet(package_dir / file_ending)
+++ b/src/parl_motion_detector/snapshot.py @@ -38,7 +38,7 @@ def generate_mapper_snapshot(date: str): datetime.date.fromisoformat(date), download_path=debates_path ) transcript = Transcript.from_xml_path(transcript_path) - mapper = MotionMapper(transcript, date, debates_path) + mapper = MotionMapper(transcript, date, Transcript.Chamber.COMMONS, debates_path) mapper.assign() mapper.dump_test_data(tests_path / "mapper") diff --git a/tests/test_mapper.py b/tests/test_mapper.py index 81aabca..2772f95 100644 --- a/tests/test_mapper.py +++ b/tests/test_mapper.py @@ -17,7 +17,7 @@ def compare_date(debate_date: str): ) transcript = Transcript.from_xml_path(transcript_path) - mm = MotionMapper(transcript, debate_date, debates_path) + mm = MotionMapper(transcript, debate_date, Transcript.Chamber.COMMONS, debates_path) mm.assign() snapshot = mm.snapshot() with (tests_path / f"{debate_date}.json").open() as f: