issue #2: encapsulate extraction of data from input
I've created a separate function for each field extracted from the input
data, since each field needs to be read and normalized in its own way.
This should make errors in the data much easier to track down than
before, when everything was done in one giant main() function.
Tyler Danstrom committed Jul 18, 2017
1 parent 7161d5a commit 10643d9
Showing 273 changed files with 2,089 additions and 1,882 deletions.
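
For orientation, the call shape these files implement (a minimal sketch assuming the parts of main() not shown in this diff; the directory name is a placeholder):

    # Scan a directory of per-PDF JSON metadata, build one normalized record
    # per file with the new field extractors, then map each record to
    # pretty-printed Dublin Core-style XML.
    a_generator = read_directory("pdf_directory")
    outputs, total_files = create_input(a_generator, 0, [])
    create_output(outputs)
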
135 changes: 74 additions & 61 deletions bin/extractor.py
@@ -10,9 +10,7 @@
from xml.etree.ElementTree import tostring
from xml.dom import minidom


-import mamlukimport.parser.Parser
-import mamlukimport.mapper.Mapper
+from mamlukimport.mapper import Mapper

def read_directory(a_directory):
items = scandir(a_directory)
@@ -26,7 +24,13 @@ def read_directory(a_directory):
def expand_list_of_terms(value_string):
item_count = 0
output = {}
-    for n_term in value_string.split(';'):
+    if isinstance(value_string, list):
+        a_list = value_string
+    elif ';' in value_string:
+        a_list = value_string.split(';')
+    else:
+        a_list = [value_string]
+    for n_term in a_list:
n_term = n_term.lstrip().strip()
val = None
if n_term != "":
@@ -36,6 +40,60 @@ def expand_list_of_terms(value_string):
output[item_count] = n_term
return output
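
# Usage sketch: keys are 1-based, as the later filename[1] lookup in
# create_output() assumes (the item_count increment sits in lines not shown
# in this diff):
#   expand_list_of_terms("Egypt; Syria; Mamluk") -> {1: 'Egypt', 2: 'Syria', 3: 'Mamluk'}
#   expand_list_of_terms(["one term; unsplit"])  -> {1: 'one term; unsplit'}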

def _return_generic_string(a_string):
    # Normalize a plain string into the {index: term} dict form used for
    # every field; previously this ignored its argument entirely.
    return expand_list_of_terms(a_string)

def _force_convert_to_list(a_string):
    # Wrap the value in a list first so expand_list_of_terms keeps it as a
    # single term even when it contains semicolons (e.g. a rights statement).
    return expand_list_of_terms([a_string])

def _extract_list_of_terms(some_original_input):
    # Pass-through for now: Keywords and Subject may arrive as either a
    # string or a list, and are handed on unchanged.
    return some_original_input

def _extract_copyright(rights_statement):
    # The copyright year rides in the first whitespace-delimited token of
    # the rights statement; split that token on the UTF-8 copyright sign
    # (b'\xc2\xa9') and keep whatever follows it.
    test = rights_statement.split(' ')[0].encode('utf-8')
    test = test.split(b'\xc2\xa9')
    if len(test) == 2:
        return expand_list_of_terms(test[1].decode('utf-8'))
    else:
        return _return_generic_string("no copyright")
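
# Example: a Rights value beginning "©2017 ..." has first token "©2017",
# which splits around the sign into (b'', b'2017'), giving {1: '2017'};
# a Rights value with no © sign falls back to {1: 'no copyright'}.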

def _extract_volume_information(some_original_input):
    # Unfinished: detects an 'MSR' or 'Vol.' marker in the title, but the
    # parsing below is still commented out, so for now this only logs the
    # title and returns None.
    msr_pattern = re.compile('MSR').search(some_original_input)
    vol_pattern = re.compile('Vol.').search(some_original_input)
    print(some_original_input)

    # Draft carried over from the old main() parsing (data and output refer
    # to the caller's variables and still need rewiring to this function):
    # if msr_pattern:
    #     volume = data["Title"][data["Title"].index('MSR') + 3:].lstrip().strip()
    #     title = data["Title"][0:data["Title"].index('MSR')]
    # elif vol_pattern:
    #     volume = data["Title"][data["Title"].index('Vol.') + 4:].lstrip().strip()
    #     title = data["Title"][0:data["Title"].index('Vol.')]
    # else:
    #     volume = "none"
    # title = title.lstrip().strip()
    # first_check = title[-1]
    # if first_check == '(':
    #     title = title[0:-1].strip().lstrip()
    # second_check = title[-1]
    # if second_check == ":":
    #     title = title[0:-1].strip().lstrip()
    # if volume:
    #     volume = re.sub(r'\)', '', re.sub(r'\(', '', volume))
    # if 'MamlukStudiesReview' in data["FileName"]:
    #     output["formatof"] = expand_list_of_terms(volume)
    #     output["source"] = expand_list_of_terms("printed " + volume)
    # else:
    #     output["part"] = expand_list_of_terms(volume)
    #     output["source"] = expand_list_of_terms(volume)

def _check_for_webstatement(some_dict):
if some_dict.get("WebStatement", None):
output = some_dict.get("WebStatement")
else:
output = "http://mamluk.uchicago.edu/msr.html"
return _return_generic_string(output)

def create_input(iterable, total_files, outputs):
for n_file in iterable:
try:
@@ -44,69 +102,23 @@ def create_input(iterable, total_files, outputs):
except JSONDecodeError:
continue
output = {}
output["publisher"] = expand_list_of_terms("University of Chicago")
output["creator"] = expand_list_of_terms(data["Creator"])
output["rights"] = expand_list_of_terms(data["Rights"])
if not isinstance(data["Keywords"], list):
output["keywords"] = expand_list_of_terms(data["Keywords"])
else:
output["keywords"] = expand_list_of_terms(data["Keywords"][0])
if not isinstance(data["Subject"], list):
output["subject"] = expand_list_of_terms(data["Subjects"])
else:
output["subject"] = expand_list_of_terms(data["Subjects"][0])
output["createdate"] = expand_list_of_terms(data["CreateDate"])
output["filename"] = expand_list_of_terms(data["FileName"])
volume = data["FileName"].split('_')[2]
temp = volume.split('-')
if len(temp) >= 2:
head = [temp[0]]
tail = temp[1:]
tail = [x for x in tail if re.compile(r'\d{1,}$').match(x)]
output["copyrightdate"] = expand_list_of_terms(
'-'.join(head + tail))
else:
output["copyrightdate"] = expand_list_of_terms('-'.join([re.sub(r'[a-z]', '',
re.sub(r'\.', '', x))
for x in temp]))
msr_pattern = re.compile('MSR').search(data["Title"])
vol_pattern = re.compile('Vol.').search(data["Title"])
if msr_pattern:
volume = data["Title"][data["Title"][
1].index('MSR') + 3:].lstrip().strip()
title = {1: data["Title"][0:data["Title"].index('MSR')]}
elif vol_pattern:
volume = data["Title"][1][
data["Title"].index('Vol.') + 4:].lstrip().strip()
title = data["Title"][0:data["Title"].index('Vol.')]
else:
volume = "none"
title = expand_list_of_terms(title[1].lstrip().strip())
first_check = title[1][-1]
if first_check == '(':
title = title[0:-1].strip().lstrip()
second_check = title[-1]
if second_check == ":":
title = title[0:-1].strip().lstrip()
if volume:
volume = re.sub(r'\)', '', re.sub(r'\(', '', volume))
if 'MamlukStudiesReview' in data["FileName"]:
output["formatof"] = expand_list_of_terms(volume)
else:
output["part"] = expand_list_of_terms(volume)
output["title"] = expand_list_of_terms(title)
try:
output["webstatement"] = expand_list_of_terms(
data["WebStatement"])
except KeyError:
pass
output["publisher"] = _return_generic_string("University of Chicago")
output["creator"] = _return_generic_string(data["Creator"])
output["rights"] = _force_convert_to_list(data["Rights"])
output["copyright"] = _extract_copyright(data["Rights"])
output["keywords"] = _extract_list_of_terms(data["Keywords"])
output["subjects"] = _extract_list_of_terms(data["Subject"])
output["filename"] = _return_generic_string(data["FileName"])
output["volumme"] = _extract_volume_information(data["Title"])
output["title"] = _return_generic_string(data["Title"])
output["webstatement"] = _check_for_webstatement(data)
outputs.append(output)
return outputs, total_files

def create_output(inputs):
for n_record in inputs:
filename = n_record["filename"]
-        new_mapper = mamlukimport.mapper.Mapper(n_record)
+        new_mapper = Mapper(n_record)
new_filename = re.sub(r'.pdf', '.xml', filename[1])
xml_string = tostring(new_mapper.out)
xml_string = minidom.parseString(xml_string).toprettyxml()
@@ -122,6 +134,7 @@ def main():
parser.add_argument(
"output_directory",
help="A directory to write the results of the metadata extraction")

args = parser.parse_args()
a_generator = read_directory(args.pdf_directory)
total_files = 0
4 changes: 3 additions & 1 deletion mamlukimport/mapper.py
@@ -6,7 +6,7 @@ class Mapper(object):
def __init__(self, input):
self._in = input
        self._lookup = {'title': {'element': 'title', 'qualifier': 'none'},
-                       'createdate': {'element': 'date', 'qualifier': 'copyright'},
+                       'copyright': {'element': 'date', 'qualifier': 'copyright'},
                        'creator': {'element': 'contributor', 'qualifier': 'author'},
                        'rights': {'element': 'rights', 'qualifier': 'statement'},
                        'webstatement': {'element': 'rights', 'qualifier': 'url'},
@@ -32,6 +32,8 @@ def _transform(self):
new_element = SubElement(root, "dc_value")
new_element.set("element", instructions["element"])
new_element.set("qualifier", instructions["qualifier"])
if n_value == 'subject':
print(new_element)
if isinstance(self._in[n_key], str):
new_element.text = self._in[n_key]
else:
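
Given the lookup table above, a record entry like copyright = {1: '2017'} should serialize to something along these lines (illustrative; the surrounding root element and loop live in lines not shown in this diff):

    <dc_value element="date" qualifier="copyright">2017</dc_value>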