Skip to content

Commit

Permalink
Read BagIt Profile identifiers from bag-info.txt and validate
Browse files Browse the repository at this point in the history
Additional tests

Handle long lines without a space to break at
  • Loading branch information
whikloj committed Apr 24, 2024
1 parent 4a4671e commit 564df32
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 25 deletions.
47 changes: 29 additions & 18 deletions src/Bag.php
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,17 @@ class Bag
'.zip',
];

/**
* All the extensions in one array.
*/
private const PACKAGE_EXTENSIONS = [
'.tar',
'.tgz',
'.tar.gz',
'.tar.bz2',
'.zip',
];

/**
* Length we start trying to wrap at.
*/
Expand All @@ -134,13 +145,6 @@ class Bag
'.zip' => 'application/zip',
];

/**
* All the extensions in one array.
*
* @var array<string>
*/
private array $packageExtensions;

/**
* Array of current bag version with keys 'major' and 'minor'.
*
Expand Down Expand Up @@ -280,7 +284,6 @@ class Bag
*/
private function __construct(string $rootPath, bool $new = true, ?string $extension = null)
{
$this->packageExtensions = array_merge(self::TAR_EXTENSIONS, self::ZIP_EXTENSIONS);
// Define valid hash algorithms our PHP supports.
$this->validHashAlgorithms = array_filter(
hash_algos(),
Expand Down Expand Up @@ -434,10 +437,10 @@ public function finalize(): void
*/
public function package(string $filepath): void
{
if (!self::hasExtension(self::getExtension($filepath), $this->packageExtensions)) {
if (!self::hasExtension(self::getExtension($filepath), self::PACKAGE_EXTENSIONS)) {
throw new BagItException(
"Unknown archive type ($filepath), the file extension must be one of (" .
implode(", ", $this->packageExtensions) . ")"
implode(", ", self::PACKAGE_EXTENSIONS) . ")"
);
}
$this->finalize();
Expand Down Expand Up @@ -1494,17 +1497,20 @@ private function loadBagInfo(): bool
}
$line = $this->decodeText($line) . PHP_EOL;
$lineLength = strlen($line);
if (str_starts_with($line, " ") || $line[0] == "\t") {
if (str_starts_with($line, " ") || str_starts_with($line, "\t")) {
// Continuation of a line
if (count($bagData) > 0) {
$previousValue = $bagData[count($bagData) - 1]['value'];
// Add a space only if the previous character was not a line break.
$lastChar = substr($previousValue, -1);
$lastCharIsNewline = str_ends_with($previousValue, "\n") ||
str_ends_with($previousValue, "\r");
if ($lineLength >= Bag::BAGINFO_AUTOWRAP_GUESS_LENGTH) {
// Line is max length or longer, should be autowrapped
$previousValue = rtrim($previousValue, "\r\n");
}
$previousValue .= ($lastChar != "\r" && $lastChar != "\n" ? " " : "");
// If the line was too long but had no spaces, it would end up with a previous value of nothing.
// That would cause a space to be added to the beginning of the next line.
$previousValue .= ($lastCharIsNewline || $previousValue == "") ? "" : " ";
$previousValue .= Bag::trimSpacesOnly($line);
$bagData[count($bagData) - 1]['value'] = $previousValue;
} else {
Expand Down Expand Up @@ -1534,7 +1540,7 @@ private function loadBagInfo(): bool
);
}
$value = $matches[4];
if ($lineLength < Bag::BAGINFO_AUTOWRAP_GUESS_LENGTH) {
if ($lineLength < Bag::BAGINFO_AUTOWRAP_GUESS_LENGTH && $value !== "") {
// Shorter line, re-add the newline removed by the preg_match.
$value .= PHP_EOL;
}
Expand All @@ -1553,6 +1559,11 @@ private function loadBagInfo(): bool
$this->bagInfoData = $bagData;

$this->updateBagInfoIndex();
if ($this->hasBagInfoTag(BagItProfile::BAGIT_PROFILE_IDENTIFIER)) {
foreach ($this->getBagInfoByTag(BagItProfile::BAGIT_PROFILE_IDENTIFIER) as $profile) {
$this->addBagProfileByURL($profile);
}
}
return true;
}

Expand Down Expand Up @@ -2610,17 +2621,17 @@ private function mergeWarnings(array $newWarnings): void

/**
* Determine the serialization mimetype from the extension.
* @param string $extension The extension.
* @param string $fileExtension The extension.
* @return string The serialization mimetype.
* @throws BagItException If the serialization mimetype cannot be determined.
*/
private function determineSerializationMimetype(string $extension): string
private function determineSerializationMimetype(string $fileExtension): string
{
foreach (self::SERIALIZATION_MAPPING as $extension => $mimetype) {
if (str_ends_with($extension, $extension)) {
if (str_ends_with($fileExtension, $extension)) {
return $mimetype;
}
}
throw new BagItException("Unable to determine serialization mimetype for extension ($extension).");
throw new BagItException("Unable to determine serialization mimetype for extension ($fileExtension).");
}
}
22 changes: 16 additions & 6 deletions src/Profiles/BagItProfile.php
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@
*/
class BagItProfile
{
/**
* @var string The tag for the profile identifier and resolvable URI.
*/
public const BAGIT_PROFILE_IDENTIFIER = "BagIt-Profile-Identifier";

/**
* @var string
* The identifier (and resolvable URI) of the BagItProfile.
Expand Down Expand Up @@ -950,12 +955,17 @@ public function validateBag(Bag $bag): bool
}
if ($this->getTagFilesRequired() !== []) {
// Grab the first tag manifest, they should all be the same
$manifests = $bag->getTagManifests()[0];
$tag_files = array_keys($manifests->getHashes());
$diff = array_diff($this->getTagFilesRequired(), $tag_files);
if ($diff !== []) {
$errors[] = "Profile requires tag files(s) which are missing from the bag (" .
implode(", ", $diff) . ")";
$manifests = $bag->getTagManifests();
if (count($manifests) === 0) {
$errors[] = "Profile requires tag files but the bag has no tag manifests";
} else {
$manifest = reset($manifests);
$tag_files = array_keys($manifest->getHashes());
$diff = array_diff($this->getTagFilesRequired(), $tag_files);
if ($diff !== []) {
$errors[] = "Profile requires tag files(s) which are missing from the bag (" .
implode(", ", $diff) . ")";
}
}
}
if ($this->getTagFilesAllowed() !== []) {
Expand Down
76 changes: 75 additions & 1 deletion tests/Profiles/ProfileWebTests.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
namespace whikloj\BagItTools\Test\Profiles;

use whikloj\BagItTools\Bag;
use whikloj\BagItTools\Profiles\BagItProfile;
use whikloj\BagItTools\Exceptions\BagItException;
use whikloj\BagItTools\Profiles\ProfileFactory;
use whikloj\BagItTools\Test\BagItWebserverFramework;

Expand Down Expand Up @@ -54,10 +54,19 @@ public static function setUpBeforeClass(): void
'content' => $profileJson,
'path' => 'bagit-test-profile.json',
],
'test-profile-bag.json' => [
'filename' => self::TEST_RESOURCES . '/profiles/test-profile-bag.json',
],
];
parent::setUpBeforeClass();
}

/**
* @group Profiles
* @covers \whikloj\BagItTools\Bag::addBagProfileByURL
* @covers \whikloj\BagItTools\Bag::addBagProfileInternal
* @covers \whikloj\BagItTools\Profiles\BagItProfile::validateBag
*/
public function testAddProfileToBagUri(): void
{
$profile = ProfileFactory::generateProfileFromUri(self::$remote_urls[0]);
Expand Down Expand Up @@ -100,4 +109,69 @@ public function testAddSameProfileTwiceByUri(): void
$bag->removeBagProfile("http://www.library.yale.edu/mssa/bagitprofiles/disk_images.json");
$this->assertCount(0, $bag->getBagProfiles());
}

/**
 * A packaged bag whose serialization the profile does not accept must fail validation on load.
 *
 * The bag names its profile through the BagIt-Profile-Identifier tag and is packaged as a .tgz
 * archive. The expected error below shows the profile only allows application/zip, so loading
 * the archive must yield exactly one error, attributed to the profile.
 *
 * @group Profiles
 * @covers \whikloj\BagItTools\Profiles\BagItProfile::validateBag
 */
public function testBagDoesntSupportSerialization(): void
{
    $bag = Bag::create($this->tmpdir);
    // NOTE(review): remote_urls[1] is presumably the mocked URL serving test-profile-bag.json - confirm
    // against setUpBeforeClass.
    $bag->addBagInfoTag('BagIt-Profile-Identifier', trim(self::$remote_urls[1]));
    $bag->addBagInfoTag('Contact-Name', 'Some Person');
    $bag->addBagInfoTag('Contact-Phone', '555-555-5555');
    // Placeholder address on a reserved example domain; the previous value was an
    // email-obfuscation artifact ("[email protected]") rather than an address.
    $bag->addBagInfoTag('Contact-Email', 'some.person@example.org');
    $bag->addBagInfoTag('Contact-Address', '1234 Some Street, Some City, Some State, 12345');
    $bag->addBagInfoTag('Source-Organization', 'BagItTools');

    // Add a custom tag file under tagFiles/.
    $tmpfile = $this->getTempName();
    file_put_contents($tmpfile, "CUSTOM-TAG-ID: 1234\nCUSTOM-TAG-ORG: 5678\n");
    $bag->addTagFile($tmpfile, 'tagFiles/special-tags.txt');

    $bag->createFile(
        "This is an example test file in the TestProfileBag. It is used to test the\n" .
        "validation of a profile.",
        "example-file.txt"
    );
    $bag->addAlgorithm('sha1');

    // Package as gzip'd tar.
    $tmpPackage = $this->getTempName() . ".tgz";
    $bag->package($tmpPackage);
    $this->assertFileExists($tmpPackage);

    // Reload from the archive and check the single expected error.
    $new_bag = Bag::load($tmpPackage);
    $this->assertFalse($new_bag->isValid());
    $this->assertCount(1, $new_bag->getErrors());
    $error = $new_bag->getErrors()[0];
    $this->assertEquals(
        [
            "file" => "http://example.org/example/test-profile-bag.json",
            "message" => "Profile allows for serialization MIME type (application/zip) but the bag has MIME " .
                "type (application/gzip)"
        ],
        $error
    );
}

/**
 * Packaging must be refused with a BagItException for this bag.
 *
 * NOTE(review): presumably the bag is invalid because Contact-Email, which the test profile
 * marks as required, is never added - confirm against the profile served at remote_urls[1].
 *
 * @group Profiles
 * @covers \whikloj\BagItTools\Profiles\BagItProfile::validateBag
 */
public function testProfileMissingRequiredTag(): void
{
    $bag = Bag::create($this->tmpdir);

    // bag-info.txt tags, added in this order.
    $infoTags = [
        'BagIt-Profile-Identifier' => trim(self::$remote_urls[1]),
        'Contact-Name' => 'Some Person',
        'Source-Organization' => 'BagItTools',
    ];
    foreach ($infoTags as $tagName => $tagValue) {
        $bag->addBagInfoTag($tagName, $tagValue);
    }

    // Custom tag file stored under tagFiles/.
    $tagFileSource = $this->getTempName();
    file_put_contents($tagFileSource, "CUSTOM-TAG-ID: 1234\nCUSTOM-TAG-ORG: 5678\n");
    $bag->addTagFile($tagFileSource, 'tagFiles/special-tags.txt');

    $payloadText = "This is an example test file in the TestProfileBag. It is used to test the\n" .
        "validation of a profile.";
    $bag->createFile($payloadText, "example-file.txt");
    $bag->addAlgorithm('sha1');

    $archivePath = $this->getTempName() . ".zip";

    // Expectations are registered before the call that is expected to throw.
    $this->expectException(BagItException::class);
    $this->expectExceptionMessage("Bag is not valid, cannot package.");
    $bag->package($archivePath);
}
}
35 changes: 35 additions & 0 deletions tests/resources/profiles/test-profile-bag.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"BagIt-Profile-Info":{
"BagIt-Profile-Identifier":"http://example.org/example/test-profile-bag.json",
"BagIt-Profile-Version": "1.4.0",
"Source-Organization":"BagItTools",
"Contact-Name":"BagItTools Developers",
"External-Description":"BagIt Profile for testing loading of a bag with a profile",
"Version":"1.2"
},
"Bag-Info": {
"Contact-Name": {
"required": true
},
"Contact-Email": {
"required": true
},
"Source-Organization": {
"required": true,
"values": [
"BagItTools"
]
}
},
"Manifests-Required":[
"sha1"
],
"Accept-Serialization":[
"application/zip"
],
"Tag-Files-Required":[
"bagit.txt",
"bag-info.txt",
"tagFiles/special-tags.txt"
]
}

0 comments on commit 564df32

Please sign in to comment.