issue #2: encapsulate extraction of data from input
I've created a separate function for each field extracted from the input
data, since each field needs to be read and normalized in its own way.
This should make errors in the data much easier to track down than
before, when everything was done in one giant main() function.
Tyler Danstrom committed Jul 18, 2017
1 parent 7161d5a commit 10643d9
Showing 273 changed files with 2,089 additions and 1,882 deletions.
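
For orientation, the call shape these files implement (a minimal sketch assuming the parts of main() not shown in this diff; the directory name is a placeholder):

    # Scan a directory of per-PDF JSON metadata, build one normalized record
    # per file with the new field extractors, then map each record to
    # pretty-printed Dublin Core-style XML.
    a_generator = read_directory("pdf_directory")
    outputs, total_files = create_input(a_generator, 0, [])
    create_output(outputs)
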
135 changes: 74 additions & 61 deletions bin/extractor.py
@@ -10,9 +10,7 @@
from xml.etree.ElementTree import tostring
from xml.dom import minidom


-import mamlukimport.parser.Parser
-import mamlukimport.mapper.Mapper
+from mamlukimport.mapper import Mapper

def read_directory(a_directory):
items = scandir(a_directory)
@@ -26,7 +24,13 @@ def read_directory(a_directory):
def expand_list_of_terms(value_string):
item_count = 0
output = {}
-    for n_term in value_string.split(';'):
+    if isinstance(value_string, list):
+        a_list = value_string
+    elif ';' in value_string:
+        a_list = value_string.split(';')
+    else:
+        a_list = [value_string]
+    for n_term in a_list:
n_term = n_term.lstrip().strip()
val = None
if n_term != "":
@@ -36,6 +40,60 @@ def expand_list_of_terms(value_string):
output[item_count] = n_term
return output
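
# Usage sketch: keys are 1-based, as the later filename[1] lookup in
# create_output() assumes (the item_count increment sits in lines not shown
# in this diff):
#   expand_list_of_terms("Egypt; Syria; Mamluk") -> {1: 'Egypt', 2: 'Syria', 3: 'Mamluk'}
#   expand_list_of_terms(["one term; unsplit"])  -> {1: 'one term; unsplit'}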

def _return_generic_string(a_string):
    # Normalize a plain string into the {index: term} dict form used for
    # every field; previously this ignored its argument entirely.
    return expand_list_of_terms(a_string)

def _force_convert_to_list(a_string):
    # Wrap the value in a list first so expand_list_of_terms keeps it as a
    # single term even when it contains semicolons (e.g. a rights statement).
    return expand_list_of_terms([a_string])

def _extract_list_of_terms(some_original_input):
    # Pass-through for now: Keywords and Subject may arrive as either a
    # string or a list, and are handed on unchanged.
    return some_original_input

def _extract_copyright(rights_statement):
    # The copyright year rides in the first whitespace-delimited token of
    # the rights statement; split that token on the UTF-8 copyright sign
    # (b'\xc2\xa9') and keep whatever follows it.
    test = rights_statement.split(' ')[0].encode('utf-8')
    test = test.split(b'\xc2\xa9')
    if len(test) == 2:
        return expand_list_of_terms(test[1].decode('utf-8'))
    else:
        return _return_generic_string("no copyright")
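
# Example: a Rights value beginning "©2017 ..." has first token "©2017",
# which splits around the sign into (b'', b'2017'), giving {1: '2017'};
# a Rights value with no © sign falls back to {1: 'no copyright'}.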

def _extract_volume_information(some_original_input):
    # Unfinished: detects an 'MSR' or 'Vol.' marker in the title, but the
    # parsing below is still commented out, so for now this only logs the
    # title and returns None.
    msr_pattern = re.compile('MSR').search(some_original_input)
    vol_pattern = re.compile('Vol.').search(some_original_input)
    print(some_original_input)

    # Draft carried over from the old main() parsing (data and output refer
    # to the caller's variables and still need rewiring to this function):
    # if msr_pattern:
    #     volume = data["Title"][data["Title"].index('MSR') + 3:].lstrip().strip()
    #     title = data["Title"][0:data["Title"].index('MSR')]
    # elif vol_pattern:
    #     volume = data["Title"][data["Title"].index('Vol.') + 4:].lstrip().strip()
    #     title = data["Title"][0:data["Title"].index('Vol.')]
    # else:
    #     volume = "none"
    # title = title.lstrip().strip()
    # first_check = title[-1]
    # if first_check == '(':
    #     title = title[0:-1].strip().lstrip()
    # second_check = title[-1]
    # if second_check == ":":
    #     title = title[0:-1].strip().lstrip()
    # if volume:
    #     volume = re.sub(r'\)', '', re.sub(r'\(', '', volume))
    # if 'MamlukStudiesReview' in data["FileName"]:
    #     output["formatof"] = expand_list_of_terms(volume)
    #     output["source"] = expand_list_of_terms("printed " + volume)
    # else:
    #     output["part"] = expand_list_of_terms(volume)
    #     output["source"] = expand_list_of_terms(volume)

def _check_for_webstatement(some_dict):
if some_dict.get("WebStatement", None):
output = some_dict.get("WebStatement")
else:
output = "http://mamluk.uchicago.edu/msr.html"
return _return_generic_string(output)

def create_input(iterable, total_files, outputs):
for n_file in iterable:
try:
@@ -44,69 +102,23 @@ def create_input(iterable, total_files, outputs):
except JSONDecodeError:
continue
output = {}
output["publisher"] = expand_list_of_terms("University of Chicago")
output["creator"] = expand_list_of_terms(data["Creator"])
output["rights"] = expand_list_of_terms(data["Rights"])
if not isinstance(data["Keywords"], list):
output["keywords"] = expand_list_of_terms(data["Keywords"])
else:
output["keywords"] = expand_list_of_terms(data["Keywords"][0])
if not isinstance(data["Subject"], list):
output["subject"] = expand_list_of_terms(data["Subjects"])
else:
output["subject"] = expand_list_of_terms(data["Subjects"][0])
output["createdate"] = expand_list_of_terms(data["CreateDate"])
output["filename"] = expand_list_of_terms(data["FileName"])
volume = data["FileName"].split('_')[2]
temp = volume.split('-')
if len(temp) >= 2:
head = [temp[0]]
tail = temp[1:]
tail = [x for x in tail if re.compile(r'\d{1,}$').match(x)]
output["copyrightdate"] = expand_list_of_terms(
'-'.join(head + tail))
else:
output["copyrightdate"] = expand_list_of_terms('-'.join([re.sub(r'[a-z]', '',
re.sub(r'\.', '', x))
for x in temp]))
msr_pattern = re.compile('MSR').search(data["Title"])
vol_pattern = re.compile('Vol.').search(data["Title"])
if msr_pattern:
volume = data["Title"][data["Title"][
1].index('MSR') + 3:].lstrip().strip()
title = {1: data["Title"][0:data["Title"].index('MSR')]}
elif vol_pattern:
volume = data["Title"][1][
data["Title"].index('Vol.') + 4:].lstrip().strip()
title = data["Title"][0:data["Title"].index('Vol.')]
else:
volume = "none"
title = expand_list_of_terms(title[1].lstrip().strip())
first_check = title[1][-1]
if first_check == '(':
title = title[0:-1].strip().lstrip()
second_check = title[-1]
if second_check == ":":
title = title[0:-1].strip().lstrip()
if volume:
volume = re.sub(r'\)', '', re.sub(r'\(', '', volume))
if 'MamlukStudiesReview' in data["FileName"]:
output["formatof"] = expand_list_of_terms(volume)
else:
output["part"] = expand_list_of_terms(volume)
output["title"] = expand_list_of_terms(title)
try:
output["webstatement"] = expand_list_of_terms(
data["WebStatement"])
except KeyError:
pass
output["publisher"] = _return_generic_string("University of Chicago")
output["creator"] = _return_generic_string(data["Creator"])
output["rights"] = _force_convert_to_list(data["Rights"])
output["copyright"] = _extract_copyright(data["Rights"])
output["keywords"] = _extract_list_of_terms(data["Keywords"])
output["subjects"] = _extract_list_of_terms(data["Subject"])
output["filename"] = _return_generic_string(data["FileName"])
output["volumme"] = _extract_volume_information(data["Title"])
output["title"] = _return_generic_string(data["Title"])
output["webstatement"] = _check_for_webstatement(data)
outputs.append(output)
return outputs, total_files

def create_output(inputs):
for n_record in inputs:
filename = n_record["filename"]
-        new_mapper = mamlukimport.mapper.Mapper(n_record)
+        new_mapper = Mapper(n_record)
new_filename = re.sub(r'.pdf', '.xml', filename[1])
xml_string = tostring(new_mapper.out)
xml_string = minidom.parseString(xml_string).toprettyxml()
@@ -122,6 +134,7 @@ def main():
parser.add_argument(
"output_directory",
help="A directory to write the results of the metadata extraction")

args = parser.parse_args()
a_generator = read_directory(args.pdf_directory)
total_files = 0
4 changes: 3 additions & 1 deletion mamlukimport/mapper.py
@@ -6,7 +6,7 @@ class Mapper(object):
def __init__(self, input):
self._in = input
        self._lookup = {'title': {'element': 'title', 'qualifier': 'none'},
-                       'createdate': {'element': 'date', 'qualifier': 'copyright'},
+                       'copyright': {'element': 'date', 'qualifier': 'copyright'},
                        'creator': {'element': 'contributor', 'qualifier': 'author'},
                        'rights': {'element': 'rights', 'qualifier': 'statement'},
                        'webstatement': {'element': 'rights', 'qualifier': 'url'},
@@ -32,6 +32,8 @@ def _transform(self):
new_element = SubElement(root, "dc_value")
new_element.set("element", instructions["element"])
new_element.set("qualifier", instructions["qualifier"])
if n_value == 'subject':
print(new_element)
if isinstance(self._in[n_key], str):
new_element.text = self._in[n_key]
else:
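
Given the lookup table above, a record entry like copyright = {1: '2017'} should serialize to something along these lines (illustrative; the surrounding root element and loop live in lines not shown in this diff):

    <dc_value element="date" qualifier="copyright">2017</dc_value>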