Skip to content

Commit

Permalink
#20 - use fixityAlgorithm from Dataverse (5.14+)
Browse files Browse the repository at this point in the history
  • Loading branch information
qqmyers committed Aug 15, 2023
1 parent b8bf8da commit b5d85a7
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 19 deletions.
86 changes: 72 additions & 14 deletions src/main/java/org/sead/uploader/dataverse/DVUploader.java
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ public class DVUploader extends AbstractUploader {
private static boolean directUpload = true;
private static boolean trustCerts = false;
private static boolean singleFile = false;

private String fixityAlgorithm = "MD5";

private int timeout = 1200;
private int httpConcurrency = 4;
Expand Down Expand Up @@ -225,7 +227,54 @@ public boolean parseCustomArg(String arg) {
}
return false;
}


/**
 * Asks the Dataverse server which fixity (checksum) algorithm it is configured
 * to use and then delegates to the superclass to process the requests.
 *
 * The lookup is best-effort: {@code fixityAlgorithm} is only replaced when the
 * server answers 200 with an algorithm name that {@link MessageDigest} can
 * provide. In every other case (404, other status codes, an unparsable body,
 * an unknown algorithm, an I/O failure) a message is printed, the current
 * default is kept, and processing still continues with
 * {@code super.processRequests()}.
 */
@Override
public void processRequests() {
    httpclient = getSharedHttpClient();

    try {
        // This api call will check for the fixityAlgorithm. Before v5.14, Dataverse servers should respond with a 404 and we'll use the default.
        // http://$SERVER/api/files/fixityAlgorithm
        String serviceUrl = server + "/api/files/fixityAlgorithm";
        HttpGet httpget = new HttpGet(serviceUrl);

        // try-with-resources closes the response on every path, including exceptions
        try (CloseableHttpResponse response = httpclient.execute(httpget, getLocalContext())) {
            int status = response.getStatusLine().getStatusCode();
            switch (status) {
                case 200:
                    HttpEntity resEntity = response.getEntity();
                    if (resEntity != null) {
                        String res = EntityUtils.toString(resEntity);
                        String alg;
                        try {
                            alg = (new JSONObject(res)).getJSONObject("data").getString("message");
                        } catch (RuntimeException e) {
                            // JSON parsing problems surface as unchecked exceptions (presumably
                            // org.json's JSONException). Letting one escape would skip
                            // super.processRequests(), so fall back to the default instead.
                            println("Unable to parse fixityAlgorithm response (" + e.getMessage()
                                    + "), using the default: " + fixityAlgorithm);
                            break;
                        }
                        try {
                            // Only adopt the server's choice if this JVM can actually compute it
                            MessageDigest.getInstance(alg);
                            fixityAlgorithm = alg;
                            println("Using FixityAlgorithm configured for this Dataverse: " + fixityAlgorithm);
                        } catch (NoSuchAlgorithmException e) {
                            println("Unknown FixityAlgorithm requested by this Dataverse: " + alg + ", using the default: " + fixityAlgorithm);
                        }
                    }
                    break;
                case 404:
                    println("FixityAlgorithm API call not available, using the default: " + fixityAlgorithm);
                    break;
                default:
                    // Report unexpected responses and keep the default algorithm
                    println("Error response when checking for fixityAlgorithm: "
                            + status + " " + response.getStatusLine().getReasonPhrase());
                    break;
            }
        }
    } catch (IOException e) {
        println("Error processing fixityAlgorithm API request: " + e.getMessage());
    }
    super.processRequests();
}

private ZipFile zf = null;

@Override
Expand Down Expand Up @@ -951,7 +1000,7 @@ private String multipartDirectFileUpload(Resource file, String path, int retries

httpput.addHeader("x-amz-tagging", "dv-state=temp");
try {
MessageDigest messageDigest = MessageDigest.getInstance("MD5");
MessageDigest messageDigest = MessageDigest.getInstance(fixityAlgorithm);

try (InputStream inStream = file.getInputStream(); DigestInputStream digestInputStream = new DigestInputStream(inStream, messageDigest)) {
// This is the new form for requests - keeping the example but won't update until we can change all
Expand All @@ -978,7 +1027,10 @@ private String multipartDirectFileUpload(Resource file, String path, int retries
jsonData.put("storageIdentifier", storageIdentifier);
jsonData.put("fileName", file.getName());
jsonData.put("mimeType", file.getMimeType());
jsonData.put("md5Hash", localchecksum);
JSONObject inputChecksumObject = new JSONObject();
inputChecksumObject.put("@type", fixityAlgorithm);
inputChecksumObject.put("@value", localchecksum);
jsonData.put("checksum", inputChecksumObject);
jsonData.put("fileSize", file.length());
if (recurse) {
// Dataverse takes paths without an initial / and ending without a /
Expand All @@ -991,7 +1043,7 @@ private String multipartDirectFileUpload(Resource file, String path, int retries
}
}
file.setMetadata(jsonData);
dataId = "md5:" + localchecksum;
dataId = fixityAlgorithm + ":" + localchecksum;
}
if (dataId != null) {
retries = 0;
Expand All @@ -1007,7 +1059,7 @@ private String multipartDirectFileUpload(Resource file, String path, int retries
}

} catch (NoSuchAlgorithmException nsae) {
println("MD5 algorithm not found: " + nsae.getMessage());
println("Fixity algorithm not found: " + nsae.getMessage());
}
} else {

Expand All @@ -1027,11 +1079,11 @@ private String multipartDirectFileUpload(Resource file, String path, int retries
HttpPartUploadJob.setHttpClientContext(getLocalContext());
HttpPartUploadJob.setPartSize(maxPartSize);

//Create a map to store the eTags from the parts and the md5 calculated for the whole file
//Create a map to store the eTags from the parts and the fixityAlg calculated for the whole file
Map<String, String> mpUploadInfoMap = new HashMap<String, String>(uploadUrls.length() + 1);
//Setup a job to calculate the md5 hash of the file
//Setup a job to calculate the fixityAlg hash of the file
//Probably helpful to have it run in parallel, but it could be a pre or post step as well. If the network is fast relative to disk, we may want the executor to use one extra thread for this
MD5Job mjob = new MD5Job(file, mpUploadInfoMap);
DigestJob mjob = new DigestJob(file, mpUploadInfoMap, fixityAlgorithm);
executor.execute(mjob);

//Now set up upload jobs for each part
Expand Down Expand Up @@ -1075,15 +1127,15 @@ private String multipartDirectFileUpload(Resource file, String path, int retries
break;
}
}
//Technically, the uploads to S3 could succeed and only the md5 fails, but in this case we still want to abort the MP Upload, not complete it.
if (!mpUploadInfoMap.containsKey("md5")) {
//Technically, the uploads to S3 could succeed and only the fixityAlg fails, but in this case we still want to abort the MP Upload, not complete it.
if (!mpUploadInfoMap.containsKey(fixityAlgorithm)) {
fileUploadComplete = false;
}
if (fileUploadComplete) {
println("Part uploads Completed for " + storageIdentifier);
HttpPut completeUpload = new HttpPut(server + completeUrl + "&key=" + apiKey);
JSONObject eTags = new JSONObject();
((Set<String>) mpUploadInfoMap.keySet()).stream().filter(partNo -> (!partNo.equals("md5"))).forEachOrdered(partNo -> {
((Set<String>) mpUploadInfoMap.keySet()).stream().filter(partNo -> (!partNo.equals(fixityAlgorithm))).forEachOrdered(partNo -> {
eTags.put(partNo, mpUploadInfoMap.get(partNo));
});
StringEntity body = new StringEntity(eTags.toString());
Expand All @@ -1097,13 +1149,16 @@ private String multipartDirectFileUpload(Resource file, String path, int retries
if (status == 200) {
println("Successful upload of " + file.getAbsolutePath());
if (singleFile) {
dataId = registerFileWithDataverse(file, path, storageIdentifier, mpUploadInfoMap.get("md5"), retries);
dataId = registerFileWithDataverse(file, path, storageIdentifier, mpUploadInfoMap.get(fixityAlgorithm), retries);
} else {
JSONObject jsonData = new JSONObject();
jsonData.put("storageIdentifier", storageIdentifier);
jsonData.put("fileName", file.getName());
jsonData.put("mimeType", file.getMimeType());
jsonData.put("md5Hash", mpUploadInfoMap.get("md5"));
JSONObject inputChecksumObject = new JSONObject();
inputChecksumObject.put("@type", fixityAlgorithm);
inputChecksumObject.put("@value", mpUploadInfoMap.get(fixityAlgorithm));
jsonData.put("checksum", inputChecksumObject);
jsonData.put("fileSize", file.length());
if (recurse) {
// Dataverse takes paths without an initial / and ending without a /
Expand All @@ -1116,7 +1171,7 @@ private String multipartDirectFileUpload(Resource file, String path, int retries
}
}
file.setMetadata(jsonData);
dataId = "md5:" + mpUploadInfoMap.get("md5");
dataId = fixityAlgorithm + ":" + mpUploadInfoMap.get(fixityAlgorithm);
}
} else {
println("Partial upload of " + file.getAbsolutePath() + ", complete upload failed with status: " + status);
Expand Down Expand Up @@ -1171,7 +1226,10 @@ private String registerFileWithDataverse(Resource file, String path, String stor
jsonData.put("storageIdentifier", storageIdentifier);
jsonData.put("fileName", file.getName());
jsonData.put("mimeType", file.getMimeType());
jsonData.put("md5Hash", checksum);
JSONObject inputChecksumObject = new JSONObject();
inputChecksumObject.put("@type", fixityAlgorithm);
inputChecksumObject.put("@value", checksum);
jsonData.put("checksum", inputChecksumObject);
jsonData.put("fileSize", file.length());
if (recurse) {
// Dataverse takes paths without an initial / and ending without a /
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,16 @@
*
* @author Jim
*/
public class MD5Job implements Runnable {
public class DigestJob implements Runnable {

Resource file;
Map infoMap;
final String alg;

public MD5Job(Resource file, Map infoMap) throws IllegalStateException {
public DigestJob(Resource file, Map infoMap, String alg) throws IllegalStateException {
this.file = file;
this.infoMap = infoMap;
this.alg = alg;
}

/*
Expand All @@ -37,21 +39,21 @@ public MD5Job(Resource file, Map infoMap) throws IllegalStateException {
@Override
public void run() {
try {
MessageDigest messageDigest = MessageDigest.getInstance("MD5");
MessageDigest messageDigest = MessageDigest.getInstance(alg);

try (InputStream inStream = file.getInputStream(); DigestInputStream digestInputStream = new DigestInputStream(inStream, messageDigest)) {
byte[] bytes;
bytes = new byte[64*1024];
while(digestInputStream.read(bytes) >= 0) {
}
String checksum = Hex.encodeHexString(digestInputStream.getMessageDigest().digest());
infoMap.put("md5", checksum);
infoMap.put(alg, checksum);
} catch (IOException e) {
e.printStackTrace(System.out);
println("Error calculating digest for: " + file.getAbsolutePath() + " : " + e.getMessage());
}
} catch (NoSuchAlgorithmException nsae) {
println("MD5 algorithm not found: " + nsae.getMessage());
println("Fixity algorithm not found: " + nsae.getMessage());
}
}
}

0 comments on commit b5d85a7

Please sign in to comment.