Export DSpace AIPs as zip and repackage them as tar before building the bagit.

dspace_aip_archiver.py:
  * import ZipFile.
  * exportAipFromDSpaceToStorageFolder: write <handle>.zip and pass "-t",
    "AIP", "-e", "-i" to the packager as separate argv entries.
  * add unZipFile(): extract the export into <sourceFilePath>/temp/ and
    delete the zip.
  * createTarFile(): drop the `title` parameter.
  * main flow: look for the zip export, unzip it, tar temp/ into
    export_location, then clean and remove temp/.
tests/test_dspace_aip_archiver.py:
  * add test_createTarFile, test_saveToTargetFile, test_createBagit.

NOTE(review): this patch was recovered from a whitespace-collapsed copy.
  Hunk line counts match the @@ headers, but nesting depth in the
  main-script hunks and in the `with` block of test_saveToTargetFile, the
  position of blank context lines in the -252, -318 and -351 hunks, and the
  line split of the uploadFileToS3(...) call are inferred -- verify against
  the repository before applying.
NOTE(review): test_createTarFile builds validateFolderPath from targetFolder,
  not validateFolder, so the archive is extracted next to itself (hence
  'test.tar' in `expected`) -- confirm this is intended.
NOTE(review): the new tests assertListEqual on raw os.listdir() output, whose
  order is not guaranteed; consider sorting both sides.

diff --git a/dspace_aip_archiver.py b/dspace_aip_archiver.py
index 5155eac..f0079f1 100644
--- a/dspace_aip_archiver.py
+++ b/dspace_aip_archiver.py
@@ -22,6 +22,7 @@
 from sqlite3 import Error
 from subprocess import run
 from time import strftime
+from zipfile import ZipFile
 
 
 class ProgressPercentage(object):
@@ -102,11 +103,19 @@ def getRecordsWithValues(oaiRecords):
 
 def exportAipFromDSpaceToStorageFolder(handle, configData):
     cli = configData["dspace"]["DSPACE_CLI"]
-    eperson = "-e " + configData["dspace"]["DSPACE_EPERSON"]
-    item = "-i " + handle
-    file_name = handle.replace("/", "-") + ".tar"
+    eperson = configData["dspace"]["DSPACE_EPERSON"]
+    item = handle
+    file_name = handle.replace("/", "-") + ".zip"
     destination = join(configData["dspace"]["EXPORT_LOCATION"], file_name)
-    run([cli, "packager", "-d", "-t AIP", eperson, item, destination])
+    run([cli, "packager", "-d", "-t", "AIP", "-e", eperson, "-i", item, destination])
+
+
+def unZipFile(fileName, sourceFilePath):
+
+    with ZipFile(sourceFilePath + "/" + fileName, 'r') as zipObj:
+        zipObj.extractall(sourceFilePath + "/temp/")
+
+    os.remove(sourceFilePath + "/" + fileName)
 
 
 def getHandleId(record):
@@ -252,7 +261,7 @@ def saveToTargetFile(fileName, content, path):
         file.write(content)
 
 
-def createTarFile(fileName, sourceFilePath, targetFilePath, title):
+def createTarFile(fileName, sourceFilePath, targetFilePath):
 
     allItem = os.listdir(sourceFilePath)
     with tarfile.open(os.path.join(targetFilePath, fileName), 'w:gz') as tar:
@@ -318,15 +327,23 @@ def cleanFolder(folderPath):
         title = getValueFromField(record, "title")
         desc = getValueFromField(record, "description")
         identifier = getHandleId(record.getField("identifier"))
+        dspaceExportFileName = identifier.replace("/", "-") + ".zip"
         bagitFileName = identifier.replace("/", "-") + ".tar"
-
         logging.info(
             "Handle %s: Start export handle file and create APTrust bagit",
             identifier)
 
         exportAipFromDSpaceToStorageFolder(
             identifier, configData)
-        if os.path.exists(export_location + bagitFileName):
+        if os.path.exists(export_location + dspaceExportFileName):
+            unZipFile(dspaceExportFileName, export_location)
+            createTarFile(
+                bagitFileName,
+                export_location +
+                "/temp/",
+                export_location)
+            cleanFolder(export_location + "/temp/")
+            os.rmdir(export_location + "/temp/")
             noid = getNoidFromDB(conn, identifier, noid_template)
             fileCount = [1, 1]
             bagitInfo = createBagitInfo(configData, noid, fileCount)
@@ -339,8 +356,7 @@ def cleanFolder(folderPath):
             createTarFile(
                 bagitFileName,
                 export_location,
-                storage_location,
-                bagitFileName)
+                storage_location)
             uploadFileToS3(
                 storage_location + bagitFileName,
                 configData["s3"]["bucket_name"], bagitFileName)
@@ -351,7 +367,7 @@ def cleanFolder(folderPath):
                 "Handle %s: APTrust bagit uploaded to s3",
                 identifier)
         else:
-            logging.info("Handle %s file not found", bagitFileName)
+            logging.info("Handle %s file not found", dspaceExportFileName)
 
 
     conn.close()
diff --git a/tests/test_dspace_aip_archiver.py b/tests/test_dspace_aip_archiver.py
index 4187822..559db53 100644
--- a/tests/test_dspace_aip_archiver.py
+++ b/tests/test_dspace_aip_archiver.py
@@ -1,5 +1,10 @@
-from unittest import TestCase
+import os
+import tarfile
+import tempfile
+import zipfile
+
 from dspace_aip_archiver import *
+from unittest import TestCase
 
 
 class TestSuite(TestCase):
@@ -83,3 +88,87 @@ def test_getRecordsWithValues(self):
         testOAIRecords = []
         outputRecords = getRecordsWithValues(testOAIRecords)
         self.assertEqual(outputRecords, [])
+
+    def test_createTarFile(self):
+
+        # Prepare files in the source folder
+        testFile1 = "test1.txt"
+        testFile2 = "test2.txt"
+        content = "content2"
+        tempPath = tempfile.TemporaryDirectory().name
+        sourceFolder = tempfile.mkdtemp()
+        sourceFolderPath = sourceFolder + "/"
+
+        saveToTargetFile(testFile1, content, sourceFolderPath)
+        saveToTargetFile(testFile2, content, sourceFolderPath)
+
+        # Create a test tar file in the target folder
+        targetFolder = tempfile.mkdtemp()
+        targetFolderPath = targetFolder + "/"
+        fileName = "test.tar"
+        createTarFile(fileName, sourceFolderPath, targetFolderPath)
+
+        self.assertTrue(tarfile.is_tarfile(targetFolderPath + fileName))
+
+        # Untar tarfile
+        validateFolder = tempfile.mkdtemp()
+        validateFolderPath = targetFolder + "/"
+        tarTempFile = tarfile.open(targetFolderPath + fileName)
+        tarTempFile.extractall(validateFolderPath)
+        targetFolderFiles = os.listdir(validateFolderPath)
+        expected = ['test.tar', 'test1.txt', 'test2.txt']
+
+        self.assertListEqual(targetFolderFiles, expected)
+
+    def test_saveToTargetFile(self):
+
+        fileName = "test.txt"
+        content = "content"
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            tempPath = tmpdirname
+            saveToTargetFile(fileName, content, tempPath)
+
+            f = open(tmpdirname + fileName, "r")
+            self.assertEqual(f.read(), "content")
+
+    def test_createBagit(self):
+
+        # Prepare files in the source folder
+        testFile1 = "test1.txt"
+        testFile2 = "test2.txt"
+        content = "content2"
+        tempPath = tempfile.TemporaryDirectory().name
+        sourceFolder = tempfile.mkdtemp()
+        sourceFolderPath = sourceFolder + "/"
+
+        saveToTargetFile(testFile1, content, sourceFolderPath)
+        saveToTargetFile(testFile2, content, sourceFolderPath)
+
+        testConfigData = {
+            'aptrust': {
+                'organization': 'VT',
+                'group_id': '12345',
+                'desc': 'Desc'}}
+        testNoid = "zxcvb"
+        testfileCount = [1, 1]
+
+        bagitInfo = createBagitInfo(testConfigData, testNoid, testfileCount)
+        checksum = ["md5", "sha256"]
+
+        createBagit(sourceFolderPath, bagitInfo, checksum)
+
+        sourceFolderFiles = os.listdir(sourceFolderPath)
+        expected = [
+            'tagmanifest-md5.txt',
+            'bagit.txt',
+            'bag-info.txt',
+            'tagmanifest-sha256.txt',
+            'manifest-md5.txt',
+            'data',
+            'manifest-sha256.txt']
+        self.assertListEqual(sourceFolderFiles, expected)
+
+        sourceFolderDataFiles = os.listdir(sourceFolderPath + "data/")
+        expected = ['test1.txt', 'test2.txt']
+        self.assertListEqual(sourceFolderDataFiles, expected)