Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Import canonical_data.countries through Airflow #7

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions country/airflow_upload_to_data_lake.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import argparse
import os

import wmfdata as wmf


def main():
    """Load the countries TSV file into the ``<database>.countries`` table.

    Command-line arguments:
        --database: database that holds the ``countries`` table
            (default ``canonical_data``).
        --data_file: path to the TSV file with a header row; relative paths
            are resolved against the current working directory
            (default ``countries.tsv``).
        --create_table_statement: path to the HQL file holding the
            ``CREATE TABLE`` statement
            (default ``create_canonical_data_countries_table.hql``).

    Raises:
        OSError: if the create-table statement file cannot be read.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--database",
        help="Name of the database to put the countries table into",
        default="canonical_data",
    )
    parser.add_argument("--data_file", help="TSV table", default="countries.tsv")
    parser.add_argument(
        "--create_table_statement",
        help="HQL file containing the CREATE TABLE statement",
        default="create_canonical_data_countries_table.hql",
    )
    args = parser.parse_args()

    # The parser defines no --table option, so the fully qualified table name
    # is derived here and reused for both the log message and the final write.
    table = f"{args.database}.countries"

    spark = wmf.spark.get_session(type="local")

    # abspath keeps relative paths anchored at the working directory while
    # also accepting absolute paths; the explicit file:// scheme makes Spark
    # read from the local filesystem.
    data_path = os.path.abspath(args.data_file)
    # The input is tab-separated, so the separator must be given explicitly
    # (the CSV reader defaults to a comma).
    df = spark.read.csv(f"file://{data_path}", header=True, sep="\t")
    print(f"Filling {table} with {df.count()} line(s)")

    with open(args.create_table_statement, encoding="utf-8") as hql_file:
        create_statement = hql_file.read()

    # spark.sql() parses exactly one statement per call, so the database
    # switch and the CREATE TABLE statement are issued separately.
    use_statement = f"USE {args.database}"
    print(use_statement)
    spark.sql(use_statement)
    print(create_statement)
    spark.sql(create_statement)

    # NOTE(review): every column is read as a string (no schema is supplied
    # or inferred), while the DDL declares BOOLEAN columns, and
    # mode("overwrite") with saveAsTable may replace the table definition
    # created above -- confirm the resulting schema is the intended one.
    df.write.mode("overwrite").saveAsTable(table)

# Run the import only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
23 changes: 23 additions & 0 deletions country/create_canonical_data_countries_table.hql
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
-- Create table statement for a static table about countries.
--
-- This table belongs to analytics-product
--
-- Parameters:
-- <none>
--
-- Usage
-- spark3-sql \
-- -f create_canonical_data_countries_table.hql \
-- --database canonical_data
--

-- The table name is unqualified, so the table is created in whichever
-- database is current when this statement runs. IF NOT EXISTS makes the
-- statement a no-op when the table is already present.
CREATE TABLE IF NOT EXISTS `countries` (
name STRING COMMENT 'Country name, aligned with the article on English Wikipedia',
iso_code STRING COMMENT 'ISO 3166-1 two-letter country code',
economic_region STRING COMMENT 'Global South/North, according to [[en:Global North and Global South]]',
maxmind_continent STRING COMMENT 'Continent, according to MaxMind databases',
is_protected BOOLEAN COMMENT 'Whether the country appears in [[wikitech:Country_protection_list]]',
is_eu BOOLEAN COMMENT 'Whether the country belongs to the European Union'
)
COMMENT 'Metadata information about countries we release data about.'
USING parquet;