diff --git a/CSV2Metadata.java b/CSV2Metadata.java index b1e98b1..409aa82 100644 --- a/CSV2Metadata.java +++ b/CSV2Metadata.java @@ -15,17 +15,34 @@ */ - import org.apache.commons.cli.*; import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVParser; import org.apache.commons.csv.CSVRecord; - +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.http.HttpStatus; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpPut; +import org.apache.http.conn.HttpClientConnectionManager; +import org.apache.http.entity.StringEntity; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; +import org.apache.http.impl.conn.BasicHttpClientConnectionManager; +import org.apache.http.util.EntityUtils; +import org.w3c.dom.*; + +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; import java.io.*; +import java.nio.charset.Charset; +import java.util.Base64; import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; +import java.util.Properties; /** * Class to read a CSV file with headers and create an XML file @@ -35,6 +52,26 @@ * */ public class CSV2Metadata { + + private Log log = LogFactory.getLog(getClass()); + + + private HttpClientConnectionManager cm = new BasicHttpClientConnectionManager(); + private CloseableHttpClient httpclient; + + private javax.xml.parsers.DocumentBuilderFactory factory = javax.xml.parsers.DocumentBuilderFactory.newInstance(); + + private TransformerFactory transformerFactory = TransformerFactory.newInstance(); + + private static final String XIP_NS = "http://www.tessella.com/XIP/v4"; + + private Properties userDetails; + + public 
CSV2Metadata(Properties userDetails) { + factory.setNamespaceAware(true); + this.userDetails = userDetails; + } + /** * The java Main entry point for executing the class * @@ -51,6 +88,7 @@ public static void main(String[] args) { options.addOption( "r", "root", true, "the root element of the dublin core xml, defaults to dc" ); options.addOption( "n", "namespace", true, "the root element namespace, defaults to http://purl.org/dc/elements/1.1/" ); options.addOption( "p", "prefix", true, "the root element namespace prefix, defaults to dc" ); + options.addOption( "u", "user", true, "the property file with Preservica username & password" ); options.addOption( "h", "help", false, "print this message" ); HelpFormatter formatter = new HelpFormatter(); @@ -68,6 +106,8 @@ public static void main(String[] args) { File inputFile = null; File outputDir = null; + Properties userDetails = new Properties(); + try { // parse the command line arguments CommandLine line = parser.parse( options, args ); @@ -83,6 +123,11 @@ public static void main(String[] args) { rootElement = DEFAULT_ROOT_ELEMENT; } + if ( line.hasOption( "u" ) ) { + String properties = line.getOptionValue( "u" ); + userDetails.load(new FileInputStream(properties)); + } + if ( line.hasOption( "p" ) ) { rootPrefix = line.getOptionValue( "p" ); } else { @@ -116,6 +161,7 @@ public static void main(String[] args) { if ( line.hasOption( "o" ) ) { String outputFolder = line.getOptionValue( "o" ); outputDir = new File(outputFolder); + outputDir.mkdirs(); if ( (!outputDir.exists()) || (!outputDir.isDirectory()) ) { System.out.println(String.format("The output directory %s does not exist", outputFolder)); System.exit(1); @@ -125,8 +171,10 @@ public static void main(String[] args) { System.exit(1); } + + try { - CSV2Metadata metadata = new CSV2Metadata(); + CSV2Metadata metadata = new CSV2Metadata(userDetails); int files = metadata.parse(inputFile, outputDir, fileColumn, rootElement, rootPrefix, rootNamespace); 
System.out.println(String.format("Created %d XML files in %s", files, outputDir.getName())); } catch (Exception e) { @@ -138,6 +186,10 @@ public static void main(String[] args) { catch( ParseException exp ) { System.out.println(exp.getMessage()); formatter.printHelp( cmdLine, options ); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); } } @@ -219,6 +271,8 @@ private int parse(File csvDocument, File folder, String filenameColumn, String r osw.write(">"); osw.write(System.getProperty("line.separator")); + String filerefId = null; + for (int i = 0; i < headerCount; i++) { String header = headers[i]; boolean isDublinCore = (header.startsWith("dc:") || header.startsWith("dcterms:")); @@ -232,6 +286,9 @@ private int parse(File csvDocument, File folder, String filenameColumn, String r } osw.write(System.getProperty("line.separator")); } + if (header.toLowerCase().trim().startsWith("fileref")) { + filerefId = record.get(i).trim(); + } } osw.write(String.format("", rootPrefix, rootElement)); osw.flush(); @@ -239,8 +296,225 @@ private int parse(File csvDocument, File folder, String filenameColumn, String r fos.close(); numFiles++; + + // if the entity does not have descriptive metadata with the required + // namespace then add it. + + if (filerefId != null && (filerefId.length() > 0) ) { + if ((userDetails != null) && (!userDetails.isEmpty())) { + Document xipDocument = getEntity(filerefId); + if (xipDocument != null) { + if (!hasDublinCore(xipDocument, rootNamespace)) { + org.w3c.dom.Document dublinCoreDocument = getDocumentFromFile(xmlFile); + xipDocument = addDublinCore(dublinCoreDocument, xipDocument, rootNamespace); + updateEntity(xipDocument, filerefId); + } else { + System.out.println("Entity: " + filerefId + " already has Dublin Core metadata. 
Ignoring...."); + } + } else { + System.out.println("Failed to find a Preservica entity with ID: " + filerefId); + } + } else { + System.out.println("Create a preservica.properties file with username and password"); + System.out.println("to update entries"); + } + } } return numFiles; } + + /** + * Update the Preservica File entity with the dublin core metadata + * + * + * @param document + * @param entityRef + */ + private void updateEntity(Document document, String entityRef) { + CloseableHttpClient client = getClient(); + CloseableHttpResponse response = null; + try { + + String domain = userDetails.getProperty("preservica.domain"); + + HttpPut putRequest = new HttpPut(String.format("https://%s/api/entity/digitalFiles/%s", domain, entityRef.trim())); + putRequest.setHeader("Authorization", getHeader()); + + document.normalize(); + + DOMSource domSource = new DOMSource(document); + StringWriter writer = new StringWriter(); + StreamResult result = new StreamResult(writer); + Transformer transformer = transformerFactory.newTransformer(); + transformer.transform(domSource, result); + + StringEntity se = new StringEntity(writer.toString(), "UTF-8"); + putRequest.setEntity(se); + response = client.execute(putRequest); + if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { + log.info("Updated object: " + entityRef); + } + if (response.getStatusLine().getStatusCode() != HttpStatus.SC_OK) { + log.error("Failed to update entity"); + log.error(response.getStatusLine().toString()); + } + } catch (Exception ex) { + log.error(ex.getMessage()); + throw new RuntimeException(ex); + } finally { + EntityUtils.consumeQuietly(response.getEntity()); + IOUtils.closeQuietly(response); + } + return; + } + + /** + * Add dublin core metadata to an existing file entity + * + * @param dublinCore + * @param xipDocument + * @return Document + */ + private Document addDublinCore(Document dublinCore, Document xipDocument, String namespace) { + + // Create a new Metadata element + 
Element metadataElement = xipDocument.createElement("Metadata"); + metadataElement.setAttribute("schemaURI", namespace); + + // add the dublin core to it. + Node dublinCoreNode = xipDocument.importNode(dublinCore.getDocumentElement(), true); + metadataElement.appendChild(dublinCoreNode); + + // metadata goes after the "Directory" element; + NodeList elements = xipDocument.getDocumentElement().getElementsByTagName("Directory"); + if (elements.getLength() == 1) { + Element elem = (Element)elements.item(0); + elem.getParentNode().insertBefore(metadataElement, elem.getNextSibling()); + } + + return xipDocument; + } + + + /** + * Create a org.w3c.dom.Document from the dublin Core Metadata file + * + * @param xmlFile + * @return Document + */ + private Document getDocumentFromFile(File xmlFile) { + org.w3c.dom.Document document = null; + try { + javax.xml.parsers.DocumentBuilder builder = factory.newDocumentBuilder(); + document = builder.parse(xmlFile); + } catch (Exception ex) { + log.error(ex); + } + return document; + } + + + /** + * Check that the current document does not have generic metadata already + * with the same namespace. 
+ * This makes it safe to re-run the program + * + * @param document + * @param namespace + * @return true if an XIP Metadata element with a schemaURI equal to the namespace exists, false otherwise + */ + private boolean hasDublinCore(Document document, String namespace) { + + NodeList list = document.getElementsByTagNameNS(XIP_NS, "Metadata"); + for (int i = 0; i < list.getLength(); i++) { + Node node = list.item(i); + NamedNodeMap namedNodeMap = node.getAttributes(); + if (namedNodeMap != null) { + Node attribute = namedNodeMap.getNamedItem("schemaURI"); + if (attribute != null) { + if (attribute.getNodeValue().equals(namespace)) { + return true; + } + } + } + } + return false; + } + + private String getHeader() { + byte[] bytes = Base64.getEncoder().encode(String.format("%s:%s", userDetails.getProperty("preservica.username"), userDetails.getProperty("preservica.password")).getBytes()); + return String.format("Basic %s", new String(bytes, Charset.forName("UTF-8"))); + } + + /** + * Get a Preservica entity by its reference + * + * @param entityRef + * @return org.w3c.dom Document of XIP XML + */ + private Document getEntity(String entityRef) { + + String domain = userDetails.getProperty("preservica.domain"); + + CloseableHttpClient client = getClient(); + CloseableHttpResponse response = null; + try { + HttpGet httpGet = new HttpGet(String.format("https://%s/api/entity/entities/%s", domain, entityRef.trim())); + httpGet.setHeader("Authorization", getHeader()); + response = client.execute(httpGet); + if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { + return getDocument(response); + } + if (response.getStatusLine().getStatusCode() != HttpStatus.SC_OK) { + log.error("Failed to create get entity"); + log.error(response.getStatusLine().toString()); + } + } catch (Exception ex) { + log.error(ex.getMessage()); + throw new RuntimeException(ex); + } finally { + EntityUtils.consumeQuietly(response.getEntity()); + IOUtils.closeQuietly(response); + } + return null; + } + + /** + * Get the http client for the REST calls. 
+ * + * @return HttpClient + */ + private CloseableHttpClient getClient() { + if (httpclient == null) { + httpclient = HttpClients.custom().setConnectionManager(cm).build(); + } + return httpclient; + } + + /** + * Create a document from a http response + * + * @param response + * @return Document + * + * @throws Exception + */ + private Document getDocument(CloseableHttpResponse response) throws Exception { + org.w3c.dom.Document document; + StringWriter sw = new StringWriter(); + IOUtils.copy(response.getEntity().getContent(), sw); + javax.xml.parsers.DocumentBuilder builder = factory.newDocumentBuilder(); + InputStream is = null; + try { + is = new java.io.ByteArrayInputStream(sw.toString().getBytes(Charset.forName("UTF-8"))); + document = builder.parse(is); + } finally { + IOUtils.closeQuietly(is); + EntityUtils.consumeQuietly(response.getEntity()); + IOUtils.closeQuietly(response); + } + return document; + } + } diff --git a/README.md b/README.md index 34831b8..16cdb45 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,9 @@ Convert a spreadsheet (CSV) file into [Dublin Core](http://dublincore.org/) XML files The XML files conform to the [Preservica](http://preservica.com/) naming convention linking them to digital files which they describe. +Use this program if you have local digital files which have not yet been ingested into Preservica. + + Usage: `csv2dc.cmd -i file.csv -o output [-c "file name column"] [-r root] [-p prefix] [-n namespace]` @@ -79,3 +82,37 @@ Then use the following command line arguments: `csv2dc.cmd -i file.csv -o output -c filename -r dc -p oai_dc -n http://www.openarchives.org/OAI/2.0/oai_dc/` +# csv2preservica + +This program will convert a spreadsheet (CSV) file into [Dublin Core](http://dublincore.org/) XML files +and then add these XML dublin core metadata files to the digital files within a [Preservica](http://preservica.com/) system using the API. 
+The spreadsheet has the same format as above, but must contain a column containing the Preservica fileRef of the digital object +the metadata will be attached to. + +Use this program if the digital files have already been ingested into Preservica and you now want to +add additional descriptive metadata held in a spreadsheet. + +The additional column in the spreadsheet must be called "fileref" and contain the UUID of the Preservica digital file. + +e.g. + +filename | fileref | dc:description | dc:identifier | dc:title | dc:subject | dcterms:provenance +-------- | -------- | ------------- | ------------- | -------- | ----------- | ----------- +LC-USZ62-20901.tiff | 8283edc6-8016-4100-a94c-3db90b0e4a75 | Picture of a plane | LC-USZ62-20901 | Photo Title | Plane | LOC +LC-USZ62-43601.tiff | 9183edc6-2115-2912-b8ad-5ef3013c7b21 | Picture of a Car | LC-USZ62-43601 | Photo Title2 | Car | LOC + + +To use the web service API to update metadata in Preservica you will need to create +a properties file called `preservica.properties` in the program directory +and set the following values. + +* preservica.domain={us.preservica.com,eu.preservica.com,au.preservica.com,ca.preservica.com} +* preservica.username=jo@example.com +* preservica.password=xxxxxxx + +The command line arguments for controlling the dublin core metadata are the same: + +Usage: + +`csv2preservica.cmd -i file.csv -o output [-c "file name column"] [-r root] [-p prefix] [-n namespace]` + diff --git a/csv2preservica.cmd b/csv2preservica.cmd new file mode 100644 index 0000000..7f3e1d1 --- /dev/null +++ b/csv2preservica.cmd @@ -0,0 +1,121 @@ +@ECHO OFF +:: ------------------------------------------------------------------ +:: JavaHome.bat - search for and set JAVA_HOME +:: 1. If JAVA_HOME is set in system environment, do nothing else. +:: 2. If javahome.txt already exists, use that value +:: 3. If those fail, search parent directories for java.exe +:: 4. 
Finally, try looking in the registry for other Java installations +:: Note- Script does not add trailing slash to JAVA_HOME variable +:: Note- JBINARY var can be set to JRE or JDK detection +:: ------------------------------------------------------------------ +SET TITLE=JavaHome.bat +TITLE=%TITLE% +SETLOCAL ENABLEDELAYEDEXPANSION +SET JDKBIN=\bin +SET JREBIN=\jre\bin +SET JBINARY=%JREBIN%\java.exe +SET SCRIPTDIR=%~dp0 +SET SCRIPTDIR=%SCRIPTDIR:~0,-1% +IF EXIST "%SCRIPTDIR%\javahome.txt" ( + ECHO. + GOTO :LOCKFILE +) +:: search environment section +IF DEFINED JAVA_HOME ( + ECHO. + CALL :STRIP "!JAVA_HOME!">"!SCRIPTDIR!\javahome.txt" + GOTO :END +) +SET "dir=%~f0" +:DIRLOOP +CALL :FGETDIR "%dir%" +IF EXIST "%dir%\%JBINARY%" ( + ECHO Parent directory search found JAVA_HOME at %dir% + GOTO :SEARCHSET +) +IF "%dir:~-1%" == ":" ( + ECHO Parent directory search reached root and "%JBINARY%" was not found. + GOTO :REGISTRY +) +GOTO :DIRLOOP +:SEARCHSET +SET JAVA_HOME=%dir% +ECHO %JAVA_HOME%>javahome.txt +GOTO :END +:REGISTRY +:: registry search section +:: runs only when JAVA_HOME not set, file search fails, and javahome.txt doesn't exist +ECHO Searching registry for JAVA_HOME... +ECHO. 2>merged.txt +ECHO. 2>list.txt +ECHO. 2>uniquelist.txt +IF NOT EXIST reg32.txt ECHO. 2>reg32.txt +IF NOT EXIST reg64.txt ECHO. 
2>reg64.txt +START /w REGEDIT /e reg32.txt "HKEY_LOCAL_MACHINE\SOFTWARE\WOW6432Node\JavaSoft\Java Development Kit" +TYPE reg32.txt | FIND "JavaHome" > merged.txt +START /w REGEDIT /e reg64.txt "HKEY_LOCAL_MACHINE\SOFTWARE\JavaSoft\Java Development Kit" +TYPE reg64.txt | FIND "JavaHome" >> merged.txt +FOR /f "tokens=2 delims==" %%x IN (merged.txt) DO ( + CALL :STRIP "%%~x" >>list.txt +) +FOR /F "tokens=* delims= " %%a IN (list.txt) DO ( + SET str=%%a + FIND /I ^"!str!^" list.txt>nul + FIND /I ^"!str!^" uniquelist.txt>nul + IF ERRORLEVEL 1 ECHO !str!>>uniquelist.txt +) +:PROMPT +ECHO Select a JDK from the list: +SET /A COUNT=0 +FOR /f "tokens=1,* delims=" %%y IN (uniquelist.txt) DO ( + SET /A COUNT += 1 + ECHO !COUNT!: %%~y +) +SET /P NUMBER=Type a number here: +IF "%NUMBER%" GTR "%COUNT%" GOTO :PROMPT +SET /A COUNT=0 +FOR /f "tokens=1,* delims=" %%z IN (uniquelist.txt) DO ( + SET /A COUNT += 1 + IF !COUNT!==!NUMBER! ( + SET JAVA_HOME=%%~z + ) +) +ECHO %JAVA_HOME%>javahome.txt +GOTO CLEANUP + +:: batch functions section +:FGETDIR +SET "dir=%~dp1" +SET "dir=%dir:~0,-1%" +EXIT /B 0 +:STRIP +REM Strip quotes and extra backslash from string +SET n=%~1 +SET n=%n:\\=\% +SET n=%n:"=% +IF NOT "%n%"=="" ECHO %n% +GOTO :EOF +:: cleanup and end +:CLEANUP +REM cleanup of registry search +DEL /Q merged.txt +DEL /Q list.txt +DEL /Q uniquelist.txt +DEL /Q reg32.txt +DEL /Q reg64.txt +GOTO :LOCKFILE +:: if all fails +:FAILED +IF NOT DEFINED JAVA_HOME ( + ECHO Error: JAVA_HOME not set in system vars, file search failed, && javahome.txt didn't exist. + GOTO :END +) +:LOCKFILE +ECHO. 
+SET /P JAVA_HOME=<"%SCRIPTDIR%\javahome.txt" +:END + +"%JAVA_HOME%\bin\java" -cp .;lib/* CSV2Metadata -u preservica.properties %* + + + diff --git a/csv2preservica.sh b/csv2preservica.sh new file mode 100644 index 0000000..2a45833 --- /dev/null +++ b/csv2preservica.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +if type -p java; then + java -cp .:lib/* CSV2Metadata $@ +elif [[ -n "$JAVA_HOME" ]] && [[ -x "$JAVA_HOME/bin/java" ]]; then + $JAVA_HOME/bin/java -cp .:lib/* CSV2Metadata -u preservica.properties $@ +else + echo "No Java found in Path" +fi + + diff --git a/lib/commons-io-2.4.jar b/lib/commons-io-2.4.jar new file mode 100644 index 0000000..90035a4 Binary files /dev/null and b/lib/commons-io-2.4.jar differ diff --git a/lib/commons-lang3-3.7.jar b/lib/commons-lang3-3.7.jar new file mode 100644 index 0000000..f37ded6 Binary files /dev/null and b/lib/commons-lang3-3.7.jar differ diff --git a/lib/commons-logging-1.2.jar b/lib/commons-logging-1.2.jar new file mode 100644 index 0000000..93a3b9f Binary files /dev/null and b/lib/commons-logging-1.2.jar differ diff --git a/lib/httpclient-4.5.5.jar b/lib/httpclient-4.5.5.jar new file mode 100644 index 0000000..7796b0e Binary files /dev/null and b/lib/httpclient-4.5.5.jar differ diff --git a/lib/httpcore-4.4.9.jar b/lib/httpcore-4.4.9.jar new file mode 100644 index 0000000..e2d3cbf Binary files /dev/null and b/lib/httpcore-4.4.9.jar differ diff --git a/preservica.properties b/preservica.properties new file mode 100644 index 0000000..71f19a5 --- /dev/null +++ b/preservica.properties @@ -0,0 +1,14 @@ + + +## The domain of your preservica system +## Can be: us.preservica.com eu.preservica.com au.preservica.com ca.preservica.com + +preservica.domain=us.preservica.com + +## A Preservica username and password which have +## permission to write metadata to the digital objects. + +preservica.username=jo@example.com +preservica.password=xxx + +