Skip to content

Commit

Permalink
Merge pull request #130 from hathitrust/HTREPO-178
Browse files Browse the repository at this point in the history
HTREPO-178: handle UTF16 checksum files
  • Loading branch information
aelkiss authored Mar 6, 2019
2 parents f8a9811 + 15dfcb9 commit e513d12
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 5 deletions.
23 changes: 21 additions & 2 deletions lib/ht_sip_validator/sip/checksums.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,37 @@ class Checksums
# #each_line) that contains a list of checksums and files
def initialize(checksum_file)
  # checksum_file is the raw text of the checksum file. It is passed through
  # check_for_bom first so that content starting with a UTF-16 byte-order
  # mark is transcoded before it is split into lines.
  @checksums = {}

  check_for_bom(checksum_file).each_line do |line|
    # An entry is a 32-hex-digit digest, whitespace, an optional "*", and
    # then the file name; lines that do not match are ignored.
    line.strip.match(/^([a-fA-F0-9]{32})(\s+\*?)(\S.*)/) do |m|
      (checksum, _, filename) = m.captures
      # Handle windows-style paths
      filename.tr!('\\', "/")
      # Entries are keyed by lowercased base name; the digest is stored
      # lowercased as well.
      @checksums[File.basename(filename).downcase] = checksum.downcase
    end
  end
end

# Looks up the checksum stored under +filename+.
#
# The lookup is an exact key match against the parsed entries; nil is
# returned when no entry exists for the given name.
def checksum_for(filename)
  @checksums.fetch(filename, nil)
end

private

# Normalizes the raw contents of a checksum file according to its leading
# byte-order mark (BOM), if any.
#
# * UTF-16LE / UTF-16BE BOM: the BOM is dropped and the remainder is
#   transcoded to UTF-8, so file names outside ASCII survive.
# * UTF-8 BOM: the three BOM bytes are dropped; the encoding is unchanged.
# * No BOM: the very same string object is returned untouched.
#
# The argument itself is never modified, so frozen strings are accepted.
#
# @param checksum_file [String] raw contents of the checksum file
# @return [String] the text to be split into checksum lines
# @raise [EncodingError] if UTF-16 content is malformed and cannot be transcoded
def check_for_bom(checksum_file)
  # Only the first bytes matter; byteslice avoids materializing an Integer
  # array for the entire file the way String#bytes would.
  prefix = checksum_file.byteslice(0, 3).bytes

  encoding =
    case prefix.first(2)
    when [0xFF, 0xFE] then Encoding::UTF_16LE
    when [0xFE, 0xFF] then Encoding::UTF_16BE
    end

  if encoding
    # force_encoding works in place, so operate on a copy to leave the
    # caller's string alone. [1..-1] drops the BOM character itself.
    checksum_file.dup.force_encoding(encoding)[1..-1].encode(Encoding::UTF_8)
  elsif prefix == [0xEF, 0xBB, 0xBF]
    # A UTF-8 BOM would otherwise stay glued to the first line and keep it
    # from matching a pattern anchored at the start of the line.
    checksum_file.byteslice(3..-1)
  else
    checksum_file
  end
end
end
end
4 changes: 2 additions & 2 deletions lib/ht_sip_validator/sip/sip.rb
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,10 @@ def metadata
# Lazily builds and memoizes the Checksums for this SIP.
#
# When the SIP lists CHECKSUM_FILE, its complete contents are read and
# handed to Checksums as a string; otherwise an empty Checksums is built
# from an empty string.
# NOTE(review): presumably file_in_zip returns the value of its block, so
# the memoized object is the Checksums instance - confirm against file_in_zip.
def checksums
  @checksums ||= if files.include?(CHECKSUM_FILE)
    file_in_zip(CHECKSUM_FILE) do |file|
      Checksums.new(file.read)
    end
  else
    Checksums.new("")
  end
end

Expand Down
Binary file added spec/fixtures/powershell_checksum.md5
Binary file not shown.
Binary file added spec/fixtures/sips/powershell_checksums.zip
Binary file not shown.
7 changes: 6 additions & 1 deletion spec/sip/checksums_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,11 @@ module HathiTrust::SIP
let(:path_sample) { "#{foo_md5} /home/foo/bar/some/long/path/foo" }
let(:windows_sample) { foo_md5 + ' *C:\Users\My Name\with\spaces \path\foo' }
let(:uppercase_sample) { "#{foo_md5} Foo" }
# Raw bytes of the UTF-16 fixture; File.binread reads in binary mode and closes the file itself.
let(:powershell_sample) { File.binread(File.join(File.dirname(__FILE__), "..", "fixtures", "powershell_checksum.md5")) }

include_context "with default zip"
# Contents of the first checksum.md5 entry in the default zip, read into a string.
let(:zip_stream) do
  Zip::File.new(zip_file).glob("**/checksum.md5").first.get_input_stream.read
end

it "accepts a string" do
Expand All @@ -44,6 +45,10 @@ module HathiTrust::SIP
it "accepts an input stream from a zip file" do
expect(described_class.new(zip_stream).checksums).to eql(zip_checksums)
end

it "can read a checksum file created with powershell (utf-16)" do
expect(described_class.new(powershell_sample).checksums).to include("00000001.html" => "602c5866bb2da48d7301322d3758f6c3")
end
end

describe "#checksum_for" do
Expand Down
11 changes: 11 additions & 0 deletions spec/validate_sip_command_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,17 @@ module HathiTrust
end
end

context "sip with checksums from powershell" do
let(:zip_file) { File.join fixtures_path, "sips", "powershell_checksums.zip" }
let(:argv) { [zip_file] }

it "has no warnings or errors" do
expect do
described_class.new(argv).exec
end.to output(/Success: 0 error\(s\), 0 warning\(s\)/).to_stdout
end
end

context "invalid sip" do
let(:zip_file) { File.join fixtures_path, "sips", "bad_ocr.zip" }
let(:argv) { [zip_file] }
Expand Down

0 comments on commit e513d12

Please sign in to comment.