Skip to content

Commit

Permalink
REL: FIDO Release Candidate 1.6 RC5
Browse files Browse the repository at this point in the history
- added update signature parameter to control signature download version:
  - `-version` parameter that defaults to "latest", behaviour remains identical;
  - if `-version v104` is passed then v104 signatures will be created;
- trapped regex creation exception so that sig file creation is not derailed;
- PRONOM/DROID signature file now downloaded from URL rather than via SOAP service;
- moved sleep between SOAP downloads so that it's only applied between actual downloads, not when processing cached results;
- bumped version number to 1.6.0rc5 plus updated release date;
- code style warnings:
  - some minor refactoring for complex methods;
  - factoring out string constants;
  - renamed some variables and methods;
  - removed some commented code;
  - tidied exit conditions; and
  - removed some unreachable code.
  • Loading branch information
carlwilson committed Aug 3, 2022
1 parent e0d8fd0 commit 701fde3
Show file tree
Hide file tree
Showing 12 changed files with 61,745 additions and 705 deletions.
24 changes: 21 additions & 3 deletions RELEASENOTES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
RELEASE NOTES
=============

Format Identification for Digital Objects (fido).
Copyright 2010 by Open Preservation Foundation.

Expand All @@ -8,12 +9,29 @@ Copyright 2010 The Open Preservation Foundation
Fido is made available under the Apache License, Version 2.0; see the file
LICENSE.txt for details.

Fido 1.6.0rc1
Fido 1.6.0rc5
-------------

2022-08-03

- added update signature parameter to control signature download version:
- trapped regex creation exception so that sig file creation is not derailed;
- PRONOM/DROID signature file now downloaded from URL rather than via SOAP service;
- moved sleep between SOAP downloads so that it's only applied between actual downloads, not when processing cached results;
- code style warnings:
- some minor refactoring for complex methods;
- factoring out string constants;
- renamed some variables and methods;
- removed some commented code;
- tidied exit conditions; and
- removed some unreachable code.

Fido 1.6.0rc4
-------------

2022-03-29
2022-06-22

New command line options for updating signatures, see
New command line options for updating signatures

- PRONOM signatures can now be updated from a web service [[#202][]].
- PRONOM v104 support with successful signature compilation (see issue [#203][]) [[#204][]].
Expand Down
2 changes: 1 addition & 1 deletion fido/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from six.moves import input as rinput


__version__ = '1.6.0rc1'
__version__ = '1.6.0rc5'


CONFIG_DIR = join(abspath(dirname(__file__)), 'conf')
Expand Down
2 changes: 0 additions & 2 deletions fido/conf/DROID_SignatureFile-v104.xml

This file was deleted.

58,198 changes: 58,198 additions & 0 deletions fido/conf/DROID_SignatureFile-v107.xml

Large diffs are not rendered by default.

3,906 changes: 3,348 additions & 558 deletions fido/conf/formats-v104.xml → fido/conf/formats-v107.xml

Large diffs are not rendered by default.

Binary file not shown.
6 changes: 3 additions & 3 deletions fido/conf/versions.xml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
<?xml version='1.0' encoding='utf-8'?>
<versions>
<pronomVersion>104</pronomVersion>
<pronomSignature>formats-v104.xml</pronomSignature>
<pronomVersion>107</pronomVersion>
<pronomSignature>formats-v107.xml</pronomSignature>
<pronomContainerSignature>container-signature-20200121.xml</pronomContainerSignature>
<fidoExtensionSignature>format_extensions.xml</fidoExtensionSignature>
<updateScript>1.6.0rc1</updateScript>
<updateScript>1.6.0rc5</updateScript>
<updateSite>https://fidosigs.openpreservation.org</updateSite>
</versions>
7 changes: 3 additions & 4 deletions fido/fido.py
Original file line number Diff line number Diff line change
Expand Up @@ -886,17 +886,16 @@ def main(args=None):
if (not args.input) and len(args.files) == 1 and args.files[0] == '-':
if fido.zip:
raise RuntimeError("Multiple content read from stdin not yet supported.")
sys.exit(1)
fido.identify_multi_object_stream(sys.stdin, extension=not args.noextension)
else:
fido.identify_stream(sys.stdin, args.filename, extension=not args.noextension)
else:
for file in list_files(args.files, args.recurse):
fido.identify_file(file, extension=not args.noextension)
except KeyboardInterrupt:
msg = "FIDO: Interrupt while identifying file {0}"
sys.stderr.write(msg.format(fido.current_file))
sys.exit(1)
sys.stdout.flush()
sys.stderr.flush()
sys.exit('FIDO: Interrupt while identifying file {0}'.format(fido.current_file))

if not args.q:
sys.stdout.flush()
Expand Down
71 changes: 38 additions & 33 deletions fido/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from .versions import get_local_versions
from .char_handler import escape


FLG_INCOMPATIBLE = '__INCOMPATIBLE_SIG__'
class NS:
"""
Helper class for XML name spaces in ElementTree.
Expand Down Expand Up @@ -89,24 +89,29 @@ def save(self, dst=sys.stdout):
root.append(f)
self.indent(root)
with open(dst, 'wb') as file_:
# print >>out, ET.tostring(root,encoding='utf-8')
file_.write(ET.tostring(root))

def indent(self, elem, level=0):
"""Indent output."""
i = "\n" + level * " "
if len(elem):
if not elem.text or not elem.text.strip():
elem.text = i + " "
if not elem.tail or not elem.tail.strip():
elem.tail = i
for elem in elem:
self.indent(elem, level + 1)
if not elem.tail or not elem.tail.strip():
elem.tail = i
self._indent_ele(elem, level)
else:
if level and (not elem.tail or not elem.tail.strip()):
elem.tail = i
elem.tail = self._indent_text(level)

def _indent_ele(self, elem, level):
"""Indent the element."""
if not elem.text or not elem.text.strip():
elem.text = self._indent_text(level) + " "
if not elem.tail or not elem.tail.strip():
elem.tail = self._indent_text(level)
for elem in elem:
self.indent(elem, level + 1)
if not elem.tail or not elem.tail.strip():
elem.tail = self._indent_text(level)

def _indent_text(self, level):
return "\n" + level * " "

def load_pronom_xml(self, puid_filter=None):
"""
Expand All @@ -116,18 +121,12 @@ def load_pronom_xml(self, puid_filter=None):
If a @param puid is specified, only that one will be loaded.
"""
formats = []
# for p in self.pronom_files:
# print p
# print self.pronom_files
# exit()
try:
zip = zipfile.ZipFile(self.pronom_files, 'r')
for item in zip.infolist():
# print item.filename
try:
stream = zip.open(item)
# Work is done here!
# if item.filename != 'github/fido/fido/conf/pronom-xml/puid.fmt.11.xml':
format_ = self.parse_pronom_xml(stream, puid_filter)
if format_ is not None:
formats.append(format_)
Expand All @@ -144,7 +143,7 @@ def load_pronom_xml(self, puid_filter=None):
id_map = {}
for element in formats:
puid = element.find('puid').text
# print "working on puid:",puid
# print('working on puid:{}'.format(puid))
pronom_id = element.find('pronom_id').text
id_map[pronom_id] = puid
for element in formats:
Expand Down Expand Up @@ -207,17 +206,23 @@ def parse_pronom_xml(self, source, puid_filter=None):
# There are some funny chars in the notes, which caused me trouble and it is a unicode string,
ET.SubElement(fido_sig, 'note').text = get_text_tna(pronom_sig, 'SignatureNote')
for pronom_pat in pronom_sig.findall(TNA('ByteSequence')):
# print('Parsing ID:{}'.format(puid))
fido_pat = ET.SubElement(fido_sig, 'pattern')
pos = fido_position(get_text_tna(pronom_pat, 'PositionType'))
bytes = get_text_tna(pronom_pat, 'ByteSequenceValue')
byte_seq = get_text_tna(pronom_pat, 'ByteSequenceValue')
offset = get_text_tna(pronom_pat, 'Offset')
max_offset = get_text_tna(pronom_pat, 'MaxOffset')
if not max_offset:
pass
# print "working on puid:", puid, ", position: ", pos, "with offset, maxoffset: ", offset, ",", max_offset
regex = convert_to_regex(bytes, 'Little', pos, offset, max_offset)
try:
regex = convert_to_regex(byte_seq, 'Little', pos, offset, max_offset)
except ValueError as ve:
print('ValueError converting PUID {} signature to regex: {}'.format(puid, ve), file=sys.stderr)
regex = FLG_INCOMPATIBLE

# print "done puid", puid
if regex == "__INCOMPATIBLE_SIG__":
if regex == FLG_INCOMPATIBLE:
print("Error: incompatible PRONOM signature found for puid {} skipping...".format(puid), file=sys.stderr)
# remove the empty 'signature' nodes
# now that the signature is not compatible and thus "regex" is empty
Expand All @@ -226,7 +231,7 @@ def parse_pronom_xml(self, source, puid_filter=None):
fido_format.remove(r)
continue
ET.SubElement(fido_pat, 'position').text = pos
ET.SubElement(fido_pat, 'pronom_pattern').text = bytes
ET.SubElement(fido_pat, 'pronom_pattern').text = byte_seq
ET.SubElement(fido_pat, 'regex').text = regex
# Get the format details
fido_details = ET.SubElement(fido_format, 'details')
Expand Down Expand Up @@ -372,7 +377,7 @@ def _convert_err_msg(msg, c, i, chars, buf):
return "Conversion: {0}: char='{1}', at pos {2} in \n {3}\n {4}^\nBuffer = {5}".format(msg, c, i, chars, i * ' ', buf.getvalue())


def doByte(chars, i, littleendian, esc=True):
def do_byte(chars, i, littleendian, esc=True):
"""
Convert two chars[i] and chars[i+1] into a byte.
Expand Down Expand Up @@ -473,7 +478,7 @@ def do_any_all_bitmasks(chars, i, predicate, littleendian):
See https://github.com/nishihatapalmer/byteseek/wiki/Regular-Expression-Syntax#all-bitmasks
and https://github.com/nishihatapalmer/byteseek/wiki/Regular-Expression-Syntax#any-bitmasks
"""
byt, inc = doByte(chars, i + 1, littleendian, esc=False)
byt, inc = do_byte(chars, i + 1, littleendian, esc=False)
bitmask = ord(byt)
regex = '({})'.format(
'|'.join(['\\x' + hex(byte)[2:].zfill(2) for byte in range(0x100)
Expand Down Expand Up @@ -534,9 +539,9 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
elif chars[i] in '*+?':
state = 'specials'
else:
raise Exception(_convert_err_msg('Illegal character in start', chars[i], i, chars, buf))
raise ValueError(_convert_err_msg('Illegal character in start', chars[i], i, chars, buf))
elif state == 'bytes':
(byt, inc) = doByte(chars, i, littleendian)
(byt, inc) = do_byte(chars, i, littleendian)
buf.write(byt)
i += inc
state = 'start'
Expand All @@ -555,7 +560,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
i += 2
while True:
if chars[i].isalnum():
(byt, inc) = doByte(chars, i, littleendian)
(byt, inc) = do_byte(chars, i, littleendian)
buf.write(byt)
i += inc
elif chars[i] == '&':
Expand All @@ -578,15 +583,15 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
try:
buf.write('[')
i += 1
(byt, inc) = doByte(chars, i, littleendian)
(byt, inc) = do_byte(chars, i, littleendian)
buf.write(byt)
i += inc
# assert(chars[i] == ':')
if chars[i] != ':':
return "__INCOMPATIBLE_SIG__"
buf.write('-')
i += 1
(byt, inc) = doByte(chars, i, littleendian)
(byt, inc) = do_byte(chars, i, littleendian)
buf.write(byt)
i += inc
# assert(chars[i] == ']')
Expand All @@ -606,7 +611,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
i += 1
while True:
if chars[i].isalnum():
(byt, inc) = doByte(chars, i, littleendian)
(byt, inc) = do_byte(chars, i, littleendian)
buf.write(byt)
i += inc
elif chars[i] == '|':
Expand All @@ -618,15 +623,15 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
elif chars[i] == '[':
buf.write('[')
i += 1
(byt, inc) = doByte(chars, i, littleendian)
(byt, inc) = do_byte(chars, i, littleendian)
buf.write(byt)
i += inc
# assert(chars[i] == ':')
if chars[i] != ':':
return "__INCOMPATIBLE_SIG__"
buf.write('-')
i += 1
(byt, inc) = doByte(chars, i, littleendian)
(byt, inc) = do_byte(chars, i, littleendian)
buf.write(byt)
i += inc

Expand Down
42 changes: 21 additions & 21 deletions fido/pronom/soap.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"""
import sys
import tempfile
from urllib.error import HTTPError, URLError
import xml.etree.ElementTree as ET
from six.moves import urllib

Expand Down Expand Up @@ -58,30 +59,24 @@ def get_pronom_sig_version():
ver_ele = tree.find('.//pronom:Version/pronom:Version', NS)
return int(ver_ele.text)


def get_pronom_signature():
def get_droid_signatures(version):
"""
Get PRONOM signature.
Get a DROID signature file by version.
Return a tuple comprising the latest signature XML file as string and a count
of the FileFormat elements contained as an integer.
Return a tuple comprising the requested signature XML file as string
and a count of the FileFormat elements contained as an integer.
Upon error, write to `stderr` and return the tuple [], False.
"""
tree = _get_soap_ele_tree('getSignatureFileV1')
for prefix, uri in NS.items():
ET.register_namespace(prefix, uri)
sigfile_ele = ET.ElementTree(tree.find('.//pronom:SignatureFile', NS))
format_ele_len = len(sigfile_ele.findall('.//sig:FileFormat', NS))
if format_ele_len < 1:
sys.stderr.write("get_pronom_signature(): could not parse XML from SOAP response: file")
return [], False
# proc_inst = ET.ProcessingInstruction('xml', 'version="1.0" encoding="UTF-8"')
with tempfile.TemporaryFile() as fp:
sigfile_ele.write(fp, encoding='utf-8', xml_declaration=True)
fp.seek(0)
xml = fp.read()
return xml, format_ele_len

xml = []
format_count = False
try:
with urllib.request.urlopen('https://www.nationalarchives.gov.uk/documents/DROID_SignatureFile_V{}.xml'.format(version)) as f:
xml = f.read().decode('utf-8')
root_ele = ET.fromstring(xml)
format_count = len(root_ele.findall('FileFormat'))
except HTTPError as httpe:
sys.stderr.write("get_droid_signatures(): could not download signature file v{} due to exception: {}\n".format(version, httpe))
return xml, format_count

def _get_soap_ele_tree(soap_action):
soap_string = '{}<soap:Envelope xmlns:xsi="{}" xmlns:xsd="{}" xmlns:soap="{}"><soap:Body><{} xmlns="{}" /></soap:Body></soap:Envelope>'.format(XML_PROC, NS.get('xsi'), NS.get('xsd'), NS.get('soap'), soap_action, PRONOM_NS).encode(ENCODING)
Expand All @@ -93,7 +88,12 @@ def _get_soap_ele_tree(soap_action):


def _get_soap_response(soap_action, soap_string):
req = urllib.request.Request('http://{}/pronom/service.asmx'.format(PRONOM_HOST), data=soap_string)
try:
req = urllib.request.Request('http://{}/pronom/service.asmx'.format(PRONOM_HOST), data=soap_string)
except URLError:
print('There was a problem contacting the PRONOM service at http://{}/pronom/service.asmx.'.format(PRONOM_HOST))
print('Please check your network connection and try again.')
sys.exit(1)
for key, value in HEADERS.items():
req.add_header(key, value)
req.add_header('Content-length', '%d' % len(soap_string))
Expand Down
Loading

0 comments on commit 701fde3

Please sign in to comment.