-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_natural_questions.py
64 lines (57 loc) · 2.09 KB
/
parse_natural_questions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import dataclasses
import gzip
import json
import os
import pprint
from collections import defaultdict
from html.parser import HTMLParser
import htmlmin
import jsonlines
import numpy as np
import pandas as pd
import six
from joblib import Parallel, delayed
from lxml import etree
from lxml.html import fromstring
from tqdm import tqdm
from html_parser import TagToRemoveWithContent, get_clean_text_and_metadata
# Shards that are skipped by process_file because they were handled previously.
_ALREADY_PROCESSED = frozenset(
    {
        "nq-train-00.jsonl.gz",
        "nq-train-01.jsonl.gz",
        "nq-train-02.jsonl.gz",
        "nq-train-03.jsonl.gz",
        "nq-train-04.jsonl.gz",
        "nq-train-05.jsonl.gz",
        "nq-train-06.jsonl.gz",
    }
)


def process_file(file_name):
    """Convert one gzipped JSON-lines shard into cleaned text plus metadata.

    Reads ``<data_dir>/train/<file_name>`` line by line, parses each line as
    JSON, passes its ``"document_html"`` field to
    ``get_clean_text_and_metadata`` and writes one record per input line to
    ``<data_dir>/pre-process/<file_name>`` (gzipped JSON lines). Each output
    record has the keys ``"text"`` and ``"metadata"``; the latter is the list
    of metadata nodes converted with ``dataclasses.asdict``.

    Shards listed in ``_ALREADY_PROCESSED`` are skipped.

    Args:
        file_name: Base name of the shard inside the ``train`` directory.

    Returns:
        None. The result is the file written to the ``pre-process`` directory.

    Raises:
        KeyError: If an input record has no ``"document_html"`` field.
        OSError: If the input shard cannot be read or the output cannot be
            written.
    """
    if file_name in _ALREADY_PROCESSED:
        print(f"{file_name} already processed")
        return

    print(f"Start process {file_name}")
    # ``data_dir`` is a module-level name resolved when the function runs.
    file_path = os.path.join(data_dir, "train", file_name)
    target_path = os.path.join(data_dir, "pre-process", file_name)

    # The output directory may not exist yet; exist_ok tolerates several
    # workers trying to create it at the same time.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

    with gzip.open(file_path, "rb") as fi_init:
        with gzip.open(target_path, "w") as fi_target, jsonlines.Writer(fi_target) as writer:
            for line in tqdm(fi_init):
                doc_html = json.loads(line)["document_html"]
                # Called with its defaults: the optional start_parsing_at_tag
                # and tags_to_remove_with_content arguments are not passed.
                plain_text, metadata = get_clean_text_and_metadata(doc_html)
                writer.write(
                    {
                        "text": plain_text,
                        "metadata": [dataclasses.asdict(node) for node in metadata],
                    }
                )

    print(f"End process {file_name}")
# Maximum number of concurrent joblib jobs used for the run below.
NUM_CORES = 8

# Dataset root; process_file reads this module-level name to build its paths.
data_dir = os.path.join("data", "v1.0")

# Lower-cased names of every entry found in the "train" sub-directory.
# NOTE(review): lower-casing assumes the on-disk names are already lowercase
# (or the filesystem is case-insensitive) - confirm.
_train_dir = os.path.join(data_dir, "train")
list_dir = list(map(str.lower, os.listdir(_train_dir)))

# Dispatch one process_file call per entry, in sorted name order.
results = Parallel(n_jobs=NUM_CORES)(
    delayed(process_file)(file_name) for file_name in sorted(list_dir)
)