Skip to content

Commit

Permalink
Fix missing underscore restoration for reddit GDTB .rels files
Browse files Browse the repository at this point in the history
  * was failing when run from the root before process_reddit.py had been run, since src/dep/ was not yet restored
  * now checks whether the src/dep/ files have been restored and, if they still contain underscores (i.e. when running from get_text.py), uses the top-level dep/ instead
  * fixes #197
  • Loading branch information
amir-zeldes committed Dec 13, 2024
1 parent 3cf4208 commit a77ca34
Showing 1 changed file with 6 additions and 1 deletion.
7 changes: 6 additions & 1 deletion _build/utils/get_reddit/underscores_disrpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,12 @@ def restore_range(range_string, underscored, tid_dict):
tid2string = defaultdict(dict)

if len(dep_files) == 0: # Need token strings from conllu but only restoring rels
reddit_conllu = glob(script_dir + ".." + os.sep + ".." + os.sep + "src" + os.sep + "dep" + os.sep + "GUM_reddit*.conllu")
# Check if src/ files have content
bobby_src = open(script_dir + ".." + os.sep + ".." + os.sep + "src" + os.sep + "dep" + os.sep + "GUM_reddit_bobby.conllu").read()
if "_____" in bobby_src: # Use top level folders instead, we are running from master/get_text.py
reddit_conllu = glob(script_dir + ".." + os.sep + ".." + os.sep + ".." + os.sep + "dep" + os.sep + "GUM_reddit*.conllu")
else:
reddit_conllu = glob(script_dir + ".." + os.sep + ".." + os.sep + "src" + os.sep + "dep" + os.sep + "GUM_reddit*.conllu")
for file_ in reddit_conllu:
lines = io.open(file_,encoding="utf8").readlines()
docname = os.path.basename(file_).replace(".conllu","")
Expand Down

0 comments on commit a77ca34

Please sign in to comment.