Skip to content

Commit

Permalink
Merge pull request DSpace#1160 from atmire/DS-2876
Browse files Browse the repository at this point in the history
DS-2876 Framework for importing external metadata
  • Loading branch information
mwoodiupui committed Jan 27, 2016
2 parents 38a4def + 42f057a commit 2833907
Show file tree
Hide file tree
Showing 22 changed files with 1,476 additions and 0 deletions.
12 changes: 12 additions & 0 deletions dspace-api/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -670,6 +670,18 @@
<version>1</version>
<type>jar</type>
</dependency>

<dependency>
<groupId>org.apache.ws.commons.axiom</groupId>
<artifactId>axiom-impl</artifactId>
<version>1.2.14</version>
</dependency>

<dependency>
<groupId>org.apache.ws.commons.axiom</groupId>
<artifactId>axiom-api</artifactId>
<version>1.2.14</version>
</dependency>
<!-- S3 -->
<dependency>
<groupId>com.amazonaws</groupId>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/

package org.dspace.importer.external;

/**
 * Represents a problem with the input source: e.g. cannot connect to the source.
 * <p>
 * This is a checked exception, so callers must either handle or declare it.
 * <p>
 * Created by Roeland Dillen (roeland at atmire dot com)
 * Date: 19/09/12
 * Time: 13:17
 */
public class MetadataSourceException extends Exception {

    /**
     * Creates an exception wrapping the given cause.
     *
     * @param throwable the underlying cause; may be {@code null}
     */
    public MetadataSourceException(Throwable throwable) {
        super(throwable);
    }

    /**
     * Creates an exception with a detail message and an underlying cause.
     *
     * @param s         the detail message
     * @param throwable the underlying cause; may be {@code null}
     */
    public MetadataSourceException(String s, Throwable throwable) {
        super(s, throwable);
    }

    /**
     * Creates an exception with a detail message and no cause.
     *
     * @param s the detail message
     */
    public MetadataSourceException(String s) {
        super(s);
    }

    /**
     * Creates an exception with neither a detail message nor a cause.
     */
    public MetadataSourceException() {
        super();
    }
}
55 changes: 55 additions & 0 deletions dspace-api/src/main/java/org/dspace/importer/external/Query.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/

package org.dspace.importer.external;

import org.apache.commons.collections.map.MultiValueMap;

import java.util.Collection;

/**
 * Holds the parameters of a query as a multi-valued map: every key can be
 * associated with any number of values.
 * <p>
 * Created by Roeland Dillen (roeland at atmire dot com)
 * Date: 27/09/12
 * Time: 15:26
 */
public class Query {
    private MultiValueMap parameters = new MultiValueMap();

    /**
     * Returns the backing parameter map itself (not a copy), so changes made
     * through the returned map are visible to this query.
     *
     * @return the map holding all parameters of this query
     */
    public MultiValueMap getParameters() {
        return parameters;
    }

    /**
     * Adds a value under the given key, keeping any values already stored
     * for that key.
     *
     * @param key   the parameter name
     * @param value the value to add
     */
    public void addParameter(String key, Object value) {
        parameters.put(key, value);
    }

    /**
     * Stores a value under the given key after discarding every value
     * previously stored for that key, so the key ends up with exactly one value.
     *
     * @param key   the parameter name
     * @param value the only value the key should hold afterwards
     */
    protected void addSingletonParameter(String key, Object value) {
        parameters.remove(key);
        parameters.put(key, value);
    }

    /**
     * Returns the first value stored for the key, provided it is an instance
     * of the requested class. Only the first value is inspected; further
     * values stored under the same key are ignored.
     *
     * @param key   the parameter name
     * @param clazz the type the value is expected to have
     * @param <T>   the requested type
     * @return the first value cast to {@code T}, or {@code null} when the key
     *         has no values, or when the first value is {@code null} or not
     *         an instance of {@code clazz}
     */
    public <T> T getParameterAsClass(String key, Class<T> clazz) {
        Collection<?> values = parameters.getCollection(key);
        if (values == null || values.isEmpty()) {
            return null;
        }
        Object first = values.iterator().next();
        // isInstance() is false for null, so a stored null value yields null
        // instead of the NullPointerException that first.getClass() would throw.
        // cast() keeps the conversion checked rather than an unchecked (T) cast.
        return clazz.isInstance(first) ? clazz.cast(first) : null;
    }

    /**
     * Returns all values stored for the key.
     *
     * @param key the parameter name
     * @return whatever the backing map's {@code getCollection(key)} returns
     */
    public Collection getParameter(String key) {
        return parameters.getCollection(key);
    }

    /**
     * Replaces the backing parameter map. The given map is used directly,
     * not copied.
     *
     * @param parameters the new parameter map
     */
    public void setParameters(MultiValueMap parameters) {
        this.parameters = parameters;
    }
}
147 changes: 147 additions & 0 deletions dspace-api/src/main/java/org/dspace/importer/external/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
- [Introduction](#Introduction)
- [Features](#Features)
- [Abstraction of input format](#Abstraction-input-format)
- [Transformation to DSpace item](#transformation)
- [Relation with BTE](#bte)
- [Implementation of an import source](#Example-implementation)
- [Inherited methods](#Inherited-methods)
- [Metadata mapping](#Mapping)


# Introduction <a name="Introduction"></a> #

This documentation explains the features and the usage of the importer framework.

## Features <a name="Features"></a> ##

- lookup publications from remote sources
- Support for multiple implementations

## Abstraction of input format <a name="Abstraction-input-format"></a> ##

The importer framework does not enforce a specific input format. Each importer implementation defines which input format it expects from a remote source.
The import framework uses generics to achieve this. Each importer implementation will have a type set of the record type it receives from the remote source's response.
This type set will also be used by the framework to use the correct MetadataFieldMapping for a certain implementation. Read [Implementation of an import source](#Example-implementation) for more information.

## Transformation to DSpace item <a name="transformation"></a> ##

The framework produces an 'ImportRecord' that is completely decoupled from DSpace. It contains a set of metadata DTOs that contain the notion of schema, element and qualifier. The specific implementation is responsible for populating this set. It is then very simple to create a DSpace item from this list.

## Relation with BTE <a name="bte"></a> ##

While there is some overlap between this framework and BTE, this framework supports some features that are hard to implement using the BTE. It has explicit support to deal with network failure and throttling imposed by the data source. It also has explicit support for distinguishing between network caused errors and invalid requests to the source.
Furthermore the framework doesn't impose any restrictions on the format in which the data is retrieved. It uses java generics to support different source record types. A reference implementation of using XML records is provided for which a set of metadata can be generated from any xpath expression (or composite of xpath expressions).
Unless 'advanced' processing is necessary (e.g. lookup of authors in an LDAP directory) this metadata mapping can be simply configured using spring. No code changes necessary. A mixture of advanced and simple (xpath) mapping is also possible.

This design is also in line with the roadmap to create a Modular Framework as detailed in [https://wiki.duraspace.org/display/DSPACE/Design+-+Module+Framework+and+Registry](https://wiki.duraspace.org/display/DSPACE/Design+-+Module+Framework+and+Registry)
This modular design also allows it to be completely independent of the user interface layer, be it JSPUI, XMLUI, command line or the result of the new UI projects: [https://wiki.duraspace.org/display/DSPACE/Design+-+Single+UI+Project](https://wiki.duraspace.org/display/DSPACE/Design+-+Single+UI+Project)

# Implementation of an import source <a name="Example-implementation"></a> #

Each importer implementation must at least implement interface *org.dspace.importer.external.service.other.Imports* and implement the inherited methods.

One can also choose to extend class *org.dspace.importer.external.service.other.Source* in addition to implementing the Imports interface. This class contains functionality to handle request timeouts and to retry requests.

A third option is to extend class *org.dspace.importer.external.service.AbstractImportSourceService*. This class already implements both the Imports interface and Source class. AbstractImportSourceService has a generic type set 'RecordType'. In the importer implementation this type set should be the class of the records received from the remote source's response (e.g. when using axiom to get the records from the remote source's XML response, the importer implementation's type set is *org.apache.axiom.om.OMElement*).

Extending the AbstractImportSourceService allows the importer implementation to use the framework's built-in support to transform a record received from the remote source to an object of class *org.dspace.importer.external.datamodel.ImportRecord* containing DSpace metadata fields, as explained here: [Metadata mapping](#Mapping).

## Inherited methods <a name="Inherited-methods"></a> ##

Method getImportSource() should return a unique identifier. Importer implementations should not be called directly, but class *org.dspace.importer.external.service.ImportService* should be called instead. This class contains the same methods as the importer implementations, but with an extra parameter 'url'. This url parameter should contain the same identifier that is returned by the getImportSource() method of the importer implementation you want to use.

The other inherited methods are used to query the remote source.

## Metadata mapping <a name="Mapping"></a> ##

When using an implementation of AbstractImportSourceService, a mapping of remote record fields to DSpace metadata fields can be created.

First create an implementation of class AbstractMetadataFieldMapping with the same type set used for the importer implementation.

Then create a spring configuration file in [dspace.dir]/config/spring/api.

Each DSpace metadata field that will be used for the mapping must first be configured as a spring bean of class *org.dspace.importer.external.metadatamapping.MetadataFieldConfig*.

```xml
<bean id="dc.title" class="org.dspace.importer.external.metadatamapping.MetadataFieldConfig">
<constructor-arg value="dc.title"/>
</bean>
```

Now this metadata field can be used to create a mapping. To add a mapping for the "dc.title" field declared above, a new spring bean configuration of a class implementing the interface *org.dspace.importer.external.metadatamapping.contributor.MetadataContributor* needs to be added. This interface contains a type argument.
The type needs to match the type used in the implementation of AbstractImportSourceService. The responsibility of each MetadataContributor implementation is to generate a set of metadata from the retrieved document. How it does that is completely opaque to the AbstractImportSourceService but it is assumed that only one entity (i.e. item) is fed to the metadatum contributor.


For example `SimpleXpathMetadatumContributor implements MetadataContributor<OMElement>` can parse a fragment of xml and generate one or more metadata values.


This bean expects 2 property values:

- field: A reference to the configured spring bean of the DSpace metadata field. e.g. the "dc.title" bean declared above.
- query: The xpath expression used to select the record value returned by the remote source.

```xml
<bean id="titleContrib" class="org.dspace.importer.external.metadatamapping.contributor.SimpleXpathMetadatumContributor">
<property name="field" ref="dc.title"/>
<property name="query" value="dc:title"/>
</bean>
```

Multiple record fields can also be combined into one value. To implement a combined mapping first create a *SimpleXpathMetadatumContributor* as explained above for each part of the field.

```xml
<bean id="lastNameContrib" class="org.dspace.importer.external.metadatamapping.contributor.SimpleXpathMetadatumContributor">
<property name="field" ref="dc.contributor.author"/>
<property name="query" value="x:authors/x:author/x:surname"/>
</bean>
<bean id="firstNameContrib" class="org.dspace.importer.external.metadatamapping.contributor.SimpleXpathMetadatumContributor">
<property name="field" ref="dc.contributor.author"/>
<property name="query" value="x:authors/x:author/x:given-name"/>
</bean>
```

Note that namespace prefixes used in the xpath queries are configured in bean "FullprefixMapping" in the same spring file.

```xml
<util:map id="FullprefixMapping" key-type="java.lang.String" value-type="java.lang.String">
<description>Defines the namespace mapping for the SimpleXpathMetadatum contributors</description>
<entry key="http://purl.org/dc/elements/1.1/" value="dc"/>
<entry key="http://www.w3.org/2005/Atom" value="x"/>
</util:map>
```

Then create a new list in the spring configuration containing references to all *SimpleXpathMetadatumContributor* beans that need to be combined.

```xml
<util:list id="combinedauthorList" value-type="org.dspace.importer.external.metadatamapping.contributor.MetadataContributor" list-class="java.util.LinkedList">
<ref bean="lastNameContrib"/>
<ref bean="firstNameContrib"/>
</util:list>
```

Finally create a spring bean configuration of class *org.dspace.importer.external.metadatamapping.contributor.CombinedMetadatumContributor*. This bean expects 3 values:

- field: A reference to the configured spring bean of the DSpace metadata field. e.g. the "dc.title" bean declared above.
- metadatumContributors: A reference to the list containing all the single record field mappings that need to be combined.
- separator: These characters will be added between each record field value when they are combined into one field.

```xml
<bean id="authorContrib" class="org.dspace.importer.external.metadatamapping.contributor.CombinedMetadatumContributor">
<property name="separator" value=", "/>
<property name="metadatumContributors" ref="combinedauthorList"/>
<property name="field" ref="dc.contributor.author"/>
</bean>
```

Each contributor must also be added to the "MetadataFieldMap" used by the *MetadataFieldMapping* implementation. Each entry of this map maps a metadata field bean to a contributor. For the contributors created above this results in the following configuration:

```xml
<util:map id="MetadataFieldMap" key-type="org.dspace.importer.external.metadatamapping.MetadataFieldConfig"
value-type="org.dspace.importer.external.metadatamapping.contributor.MetadataContributor">
<entry key-ref="dc.title" value-ref="titleContrib"/>
<entry key-ref="dc.contributor.author" value-ref="authorContrib"/>
</util:map>
```

Note that the single field mappings used for the combined author mapping are not added to this map.

Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/

package org.dspace.importer.external;

import org.dspace.importer.external.service.other.MetadataSource;

/**
 * Callback that is handed a {@link MetadataSource} to act upon.
 * NOTE(review): judging by the name, this is presumably invoked when the
 * source raised an exception; confirm against the callers.
 * <p>
 * Created by: Antoine Snyers (antoine at atmire dot com)
 * Date: 27 Oct 2014
 *
 * @param <T> the kind of {@link MetadataSource} this handler accepts
 */
public interface SourceExceptionHandler<T extends MetadataSource> {

    /**
     * Handles the given source.
     *
     * @param source the source to act upon
     */
    void handle(T source);

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.importer.external.datamodel;

import org.dspace.importer.external.metadatamapping.MetadatumDTO;

import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;

/**
 * A record holding a list of metadata values ({@link MetadatumDTO}), each
 * described by schema, element, qualifier and value.
 * <p>
 * Created by Roeland Dillen (roeland at atmire dot com)
 * Date: 17/09/12
 * Time: 14:03
 */
public class ImportRecord {
    private final List<MetadatumDTO> valueList;

    /**
     * Returns a read-only view of the values of this record. The view is
     * backed by the internal list, so values added later through
     * {@link #addValue(MetadatumDTO)} show up in it.
     *
     * @return an unmodifiable view of the metadata values
     */
    public List<MetadatumDTO> getValueList() {
        return Collections.unmodifiableList(valueList);
    }

    /**
     * Creates a record holding a copy of the given values.
     *
     * @param valueList the initial metadata values; must not be {@code null}
     */
    public ImportRecord(List<MetadatumDTO> valueList) {
        // Copy so the caller's list is never altered and the list type is under our control.
        this.valueList = new LinkedList<MetadatumDTO>(valueList);
    }

    /**
     * Renders every value as <code>{schema; element; qualifier; value; }</code>
     * on its own line, wrapped in <code>Record{valueList=...}</code>.
     *
     * @return a textual dump of this record
     */
    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder("Record");
        sb.append("{valueList=");
        for (MetadatumDTO val : valueList) {
            sb.append("{")
              .append(val.getSchema()).append("; ")
              .append(val.getElement()).append("; ")
              .append(val.getQualifier()).append("; ")
              .append(val.getValue()).append("; ")
              .append("}\n");
        }
        sb.append("}\n");
        return sb.toString();
    }

    /**
     * Collects all values whose schema, element and qualifier equal the given
     * ones. A {@code null} argument only matches a value whose corresponding
     * field is {@code null} as well.
     *
     * @param schema    the schema to match
     * @param element   the element to match
     * @param qualifier the qualifier to match; {@code null} selects unqualified values
     * @return a new collection with the matching values, empty when nothing matches
     */
    public Collection<MetadatumDTO> getValue(String schema, String element, String qualifier) {
        List<MetadatumDTO> values = new LinkedList<MetadatumDTO>();
        for (MetadatumDTO value : valueList) {
            // Null-safe on every field: a value with a null schema or element
            // is simply skipped (unless null was asked for) instead of
            // triggering a NullPointerException.
            if (nullSafeEquals(value.getSchema(), schema)
                    && nullSafeEquals(value.getElement(), element)
                    && nullSafeEquals(value.getQualifier(), qualifier)) {
                values.add(value);
            }
        }
        return values;
    }

    /**
     * Appends a value to this record.
     *
     * @param value the metadata value to add
     */
    public void addValue(MetadatumDTO value) {
        this.valueList.add(value);
    }

    /** Compares two possibly-null objects: two nulls are equal, null never equals non-null. */
    private static boolean nullSafeEquals(Object left, Object right) {
        return left == null ? right == null : left.equals(right);
    }
}
Loading

0 comments on commit 2833907

Please sign in to comment.