REL: FIDO Release Candidate 1.6 RC5

- added update signature parameter to control signature download version: - `-version` parameter that defaults to "latest", behaviour remains identical; - if `-version v104` is passed then v104 signatures will be created; - trapped regex creation exception so that sig file creation is not derailed; - PRONOM/DROID signature file now downloaded from URL rather than via SOAP service; - moved sleep between SOAP downloads so that it's only applied between actual downloads, not when processing cached results; - bumped version number to 1.6.0rc5 plus updated release date; - code style warnings: - some minor refactoring for complex methods; - factoring out string constants; - renamed some variables and methods; - removed some commented code; - tidied exit conditions; and - removed some unreachable code.
openpreserve · Aug 3, 2022 · 701fde3 · 701fde3
1 parent e0d8fd0
commit 701fde3
Show file tree

Hide file tree

Showing 12 changed files with 61,745 additions and 705 deletions.
diff --git a/RELEASENOTES.md b/RELEASENOTES.md
@@ -1,5 +1,6 @@
 RELEASE NOTES
 =============
+
 Format Identification for Digital Objects (fido).
 Copyright 2010 by Open Preservation Foundation.
 
@@ -8,12 +9,29 @@ Copyright 2010 The Open Preservation Foundation
 Fido is made available under the Apache License, Version 2.0; see the file
 LICENSE.txt for details.
 
-Fido 1.6.0rc1
+Fido 1.6.0rc5
+-------------
+
+2022-08-03
+
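+- added update signature parameter to control signature download version:
+  - `-version` parameter that defaults to "latest", behaviour remains identical;
+  - if `-version v104` is passed then v104 signatures will be created;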
+- trapped regex creation exception so that sig file creation is not derailed;
+- PRONOM/DROID signature file now downloaded from URL rather than via SOAP service;
+- moved sleep between SOAP downloads so that it's only applied between actual downloads, not when processing cached results;
+- code style warnings:
+  - some minor refactoring for complex methods;
+  - factoring out string constants;
+  - renamed some variables and methods;
+  - removed some commented code;
+  - tidied exit conditions; and
+  - removed some unreachable code.
+
+Fido 1.6.0rc4
 -------------
 
-2022-03-29
+2022-06-22
 
-New command line options for updating signatures, see
+New command line options for updating signatures
 
 - PRONOM signatures can now be updated from a web service [[#202][]].
 - PRONOM v104 support with successful signature compilation (see issue [#203][]) [[#204][]].

diff --git a/fido/__init__.py b/fido/__init__.py
@@ -14,7 +14,7 @@
 from six.moves import input as rinput
 
 
-__version__ = '1.6.0rc1'
+__version__ = '1.6.0rc5'
 
 
 CONFIG_DIR = join(abspath(dirname(__file__)), 'conf')

diff --git a/fido/conf/DROID_SignatureFile-v104.xml b/fido/conf/DROID_SignatureFile-v104.xml
diff --git a/fido/conf/DROID_SignatureFile-v107.xml b/fido/conf/DROID_SignatureFile-v107.xml
diff --git a/fido/conf/formats-v104.xml → fido/conf/formats-v107.xml b/fido/conf/formats-v104.xml → fido/conf/formats-v107.xml
diff --git a/fido/conf/pronom-xml-v104.zip → fido/conf/pronom-xml-v107.zip b/fido/conf/pronom-xml-v104.zip → fido/conf/pronom-xml-v107.zip
diff --git a/fido/conf/versions.xml b/fido/conf/versions.xml
@@ -1,9 +1,9 @@
 <?xml version='1.0' encoding='utf-8'?>
 <versions>
-	<pronomVersion>104</pronomVersion>
-	<pronomSignature>formats-v104.xml</pronomSignature>
+	<pronomVersion>107</pronomVersion>
+	<pronomSignature>formats-v107.xml</pronomSignature>
 	<pronomContainerSignature>container-signature-20200121.xml</pronomContainerSignature>
 	<fidoExtensionSignature>format_extensions.xml</fidoExtensionSignature>
-	<updateScript>1.6.0rc1</updateScript>
+	<updateScript>1.6.0rc5</updateScript>
 	<updateSite>https://fidosigs.openpreservation.org</updateSite>
 </versions>
diff --git a/fido/fido.py b/fido/fido.py
@@ -886,17 +886,16 @@ def main(args=None):
         if (not args.input) and len(args.files) == 1 and args.files[0] == '-':
             if fido.zip:
                 raise RuntimeError("Multiple content read from stdin not yet supported.")
-                sys.exit(1)
                 fido.identify_multi_object_stream(sys.stdin, extension=not args.noextension)
             else:
                 fido.identify_stream(sys.stdin, args.filename, extension=not args.noextension)
         else:
             for file in list_files(args.files, args.recurse):
                 fido.identify_file(file, extension=not args.noextension)
     except KeyboardInterrupt:
-        msg = "FIDO: Interrupt while identifying file {0}"
-        sys.stderr.write(msg.format(fido.current_file))
-        sys.exit(1)
+        sys.stdout.flush()
+        sys.stderr.flush()
+        sys.exit('FIDO: Interrupt while identifying file {0}'.format(fido.current_file))
 
     if not args.q:
         sys.stdout.flush()

diff --git a/fido/prepare.py b/fido/prepare.py
@@ -20,7 +20,7 @@
 from .versions import get_local_versions
 from .char_handler import escape
 
-
+FLG_INCOMPATIBLE = '__INCOMPATIBLE_SIG__'
 class NS:
     """
     Helper class for XML name spaces in ElementTree.
@@ -89,24 +89,29 @@ def save(self, dst=sys.stdout):
             root.append(f)
         self.indent(root)
         with open(dst, 'wb') as file_:
-            # print >>out, ET.tostring(root,encoding='utf-8')
             file_.write(ET.tostring(root))
 
     def indent(self, elem, level=0):
         """Indent output."""
-        i = "\n" + level * "  "
         if len(elem):
-            if not elem.text or not elem.text.strip():
-                elem.text = i + "  "
-            if not elem.tail or not elem.tail.strip():
-                elem.tail = i
-            for elem in elem:
-                self.indent(elem, level + 1)
-            if not elem.tail or not elem.tail.strip():
-                elem.tail = i
+            self._indent_ele(elem, level)
         else:
             if level and (not elem.tail or not elem.tail.strip()):
-                elem.tail = i
+                elem.tail = self._indent_text(level)
+
+    def _indent_ele(self, elem, level):
+        """Indent the element."""
+        if not elem.text or not elem.text.strip():
+            elem.text = self._indent_text(level) + "  "
+        if not elem.tail or not elem.tail.strip():
+            elem.tail = self._indent_text(level)
+        for elem in elem:
+            self.indent(elem, level + 1)
+        if not elem.tail or not elem.tail.strip():
+            elem.tail = self._indent_text(level)
+
+    def _indent_text(self, level):
+        return "\n" + level * "  "
 
     def load_pronom_xml(self, puid_filter=None):
         """
@@ -116,18 +121,12 @@ def load_pronom_xml(self, puid_filter=None):
         If a @param puid is specified, only that one will be loaded.
         """
         formats = []
-        # for p in self.pronom_files:
-        #    print p
-        # print self.pronom_files
-        # exit()
         try:
             zip = zipfile.ZipFile(self.pronom_files, 'r')
             for item in zip.infolist():
-                # print item.filename
                 try:
                     stream = zip.open(item)
                     # Work is done here!
-                    # if item.filename != 'github/fido/fido/conf/pronom-xml/puid.fmt.11.xml':
                     format_ = self.parse_pronom_xml(stream, puid_filter)
                     if format_ is not None:
                         formats.append(format_)
@@ -144,7 +143,7 @@ def load_pronom_xml(self, puid_filter=None):
             id_map = {}
             for element in formats:
                 puid = element.find('puid').text
-                # print "working on puid:",puid
+                # print('working on puid:{}'.format(puid))
                 pronom_id = element.find('pronom_id').text
                 id_map[pronom_id] = puid
             for element in formats:
@@ -207,17 +206,23 @@ def parse_pronom_xml(self, source, puid_filter=None):
             # There are some funny chars in the notes, which caused me trouble and it is a unicode string,
             ET.SubElement(fido_sig, 'note').text = get_text_tna(pronom_sig, 'SignatureNote')
             for pronom_pat in pronom_sig.findall(TNA('ByteSequence')):
+                # print('Parsing ID:{}'.format(puid))
                 fido_pat = ET.SubElement(fido_sig, 'pattern')
                 pos = fido_position(get_text_tna(pronom_pat, 'PositionType'))
-                bytes = get_text_tna(pronom_pat, 'ByteSequenceValue')
+                byte_seq = get_text_tna(pronom_pat, 'ByteSequenceValue')
                 offset = get_text_tna(pronom_pat, 'Offset')
                 max_offset = get_text_tna(pronom_pat, 'MaxOffset')
                 if not max_offset:
                     pass
                 # print "working on puid:", puid, ", position: ", pos, "with offset, maxoffset: ", offset, ",", max_offset
-                regex = convert_to_regex(bytes, 'Little', pos, offset, max_offset)
+                try:
+                    regex = convert_to_regex(byte_seq, 'Little', pos, offset, max_offset)
+                except ValueError as ve:
+                    print('ValueError converting PUID {} signature to regex: {}'.format(puid, ve), file=sys.stderr)
+                    regex = FLG_INCOMPATIBLE
+
                 # print "done puid", puid
-                if regex == "__INCOMPATIBLE_SIG__":
+                if regex == FLG_INCOMPATIBLE:
                     print("Error: incompatible PRONOM signature found for puid {} skipping...".format(puid), file=sys.stderr)
                     # remove the empty 'signature' nodes
                     # now that the signature is not compatible and thus "regex" is empty
@@ -226,7 +231,7 @@ def parse_pronom_xml(self, source, puid_filter=None):
                         fido_format.remove(r)
                     continue
                 ET.SubElement(fido_pat, 'position').text = pos
-                ET.SubElement(fido_pat, 'pronom_pattern').text = bytes
+                ET.SubElement(fido_pat, 'pronom_pattern').text = byte_seq
                 ET.SubElement(fido_pat, 'regex').text = regex
         # Get the format details
         fido_details = ET.SubElement(fido_format, 'details')
@@ -372,7 +377,7 @@ def _convert_err_msg(msg, c, i, chars, buf):
     return "Conversion: {0}: char='{1}', at pos {2} in \n  {3}\n  {4}^\nBuffer = {5}".format(msg, c, i, chars, i * ' ', buf.getvalue())
 
 
-def doByte(chars, i, littleendian, esc=True):
+def do_byte(chars, i, littleendian, esc=True):
     """
     Convert two chars[i] and chars[i+1] into a byte.
 
@@ -473,7 +478,7 @@ def do_any_all_bitmasks(chars, i, predicate, littleendian):
     See https://github.com/nishihatapalmer/byteseek/wiki/Regular-Expression-Syntax#all-bitmasks
     and https://github.com/nishihatapalmer/byteseek/wiki/Regular-Expression-Syntax#any-bitmasks
     """
-    byt, inc = doByte(chars, i + 1, littleendian, esc=False)
+    byt, inc = do_byte(chars, i + 1, littleendian, esc=False)
     bitmask = ord(byt)
     regex = '({})'.format(
         '|'.join(['\\x' + hex(byte)[2:].zfill(2) for byte in range(0x100)
@@ -534,9 +539,9 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
             elif chars[i] in '*+?':
                 state = 'specials'
             else:
-                raise Exception(_convert_err_msg('Illegal character in start', chars[i], i, chars, buf))
+                raise ValueError(_convert_err_msg('Illegal character in start', chars[i], i, chars, buf))
         elif state == 'bytes':
-            (byt, inc) = doByte(chars, i, littleendian)
+            (byt, inc) = do_byte(chars, i, littleendian)
             buf.write(byt)
             i += inc
             state = 'start'
@@ -555,7 +560,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
             i += 2
             while True:
                 if chars[i].isalnum():
-                    (byt, inc) = doByte(chars, i, littleendian)
+                    (byt, inc) = do_byte(chars, i, littleendian)
                     buf.write(byt)
                     i += inc
                 elif chars[i] == '&':
@@ -578,15 +583,15 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
             try:
                 buf.write('[')
                 i += 1
-                (byt, inc) = doByte(chars, i, littleendian)
+                (byt, inc) = do_byte(chars, i, littleendian)
                 buf.write(byt)
                 i += inc
                 # assert(chars[i] == ':')
                 if chars[i] != ':':
                     return "__INCOMPATIBLE_SIG__"
                 buf.write('-')
                 i += 1
-                (byt, inc) = doByte(chars, i, littleendian)
+                (byt, inc) = do_byte(chars, i, littleendian)
                 buf.write(byt)
                 i += inc
                 # assert(chars[i] == ']')
@@ -606,7 +611,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
             i += 1
             while True:
                 if chars[i].isalnum():
-                    (byt, inc) = doByte(chars, i, littleendian)
+                    (byt, inc) = do_byte(chars, i, littleendian)
                     buf.write(byt)
                     i += inc
                 elif chars[i] == '|':
@@ -618,15 +623,15 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
                 elif chars[i] == '[':
                     buf.write('[')
                     i += 1
-                    (byt, inc) = doByte(chars, i, littleendian)
+                    (byt, inc) = do_byte(chars, i, littleendian)
                     buf.write(byt)
                     i += inc
                     # assert(chars[i] == ':')
                     if chars[i] != ':':
                         return "__INCOMPATIBLE_SIG__"
                     buf.write('-')
                     i += 1
-                    (byt, inc) = doByte(chars, i, littleendian)
+                    (byt, inc) = do_byte(chars, i, littleendian)
                     buf.write(byt)
                     i += inc
 

diff --git a/fido/pronom/soap.py b/fido/pronom/soap.py
@@ -21,6 +21,7 @@
 """
 import sys
 import tempfile
+from urllib.error import HTTPError, URLError
 import xml.etree.ElementTree as ET
 from six.moves import urllib
 
@@ -58,30 +59,24 @@ def get_pronom_sig_version():
     ver_ele = tree.find('.//pronom:Version/pronom:Version', NS)
     return int(ver_ele.text)
 
-
-def get_pronom_signature():
+def get_droid_signatures(version):
     """
-    Get PRONOM signature.
+    Get a DROID signature file by version.
 
-    Return a tuple comprising the latest signature XML file as string and a count
-    of the FileFormat elements contained as an integer.
+    Return a tuple comprising the requested signature XML file as string
+    and a count of the FileFormat elements contained as an integer.
     Upon error, write to `stderr` and return the tuple [], False.
     """
-    tree = _get_soap_ele_tree('getSignatureFileV1')
-    for prefix, uri in NS.items():
-        ET.register_namespace(prefix, uri)
-    sigfile_ele = ET.ElementTree(tree.find('.//pronom:SignatureFile', NS))
-    format_ele_len = len(sigfile_ele.findall('.//sig:FileFormat', NS))
-    if format_ele_len < 1:
-        sys.stderr.write("get_pronom_signature(): could not parse XML from SOAP response: file")
-        return [], False
-    # proc_inst = ET.ProcessingInstruction('xml', 'version="1.0" encoding="UTF-8"')
-    with tempfile.TemporaryFile() as fp:
-        sigfile_ele.write(fp, encoding='utf-8', xml_declaration=True)
-        fp.seek(0)
-        xml = fp.read()
-    return xml, format_ele_len
-
+    xml = []
+    format_count = False
+    try:
+        with urllib.request.urlopen('https://www.nationalarchives.gov.uk/documents/DROID_SignatureFile_V{}.xml'.format(version)) as f:
+            xml = f.read().decode('utf-8')
+            root_ele = ET.fromstring(xml)
+            format_count = len(root_ele.findall('FileFormat'))
+    except HTTPError as httpe:
+        sys.stderr.write("get_droid_signatures(): could not download signature file v{} due to exception: {}\n".format(version, httpe))    
+    return xml, format_count
 
 def _get_soap_ele_tree(soap_action):
     soap_string = '{}<soap:Envelope xmlns:xsi="{}" xmlns:xsd="{}" xmlns:soap="{}"><soap:Body><{} xmlns="{}" /></soap:Body></soap:Envelope>'.format(XML_PROC, NS.get('xsi'), NS.get('xsd'), NS.get('soap'), soap_action, PRONOM_NS).encode(ENCODING)
@@ -93,7 +88,12 @@ def _get_soap_ele_tree(soap_action):
 
 
 def _get_soap_response(soap_action, soap_string):
-    req = urllib.request.Request('http://{}/pronom/service.asmx'.format(PRONOM_HOST), data=soap_string)
+    try:
+        req = urllib.request.Request('http://{}/pronom/service.asmx'.format(PRONOM_HOST), data=soap_string)
+    except URLError:
+        print('There was a problem contacting the PRONOM service at http://{}/pronom/service.asmx.'.format(PRONOM_HOST))
+        print('Please check your network connection and try again.')
+        sys.exit(1)
     for key, value in HEADERS.items():
         req.add_header(key, value)
     req.add_header('Content-length', '%d' % len(soap_string))