Commit 4a93169a authored by Mark Jordan's avatar Mark Jordan
Browse files

Added plugins.

parent dd8e345f
# Plugin that renames OBJ.jpe to OBJ.jpg. This is needed because the
# extension assigned to JPEG files by the mimetypes package is '.jpe'.
def execute(dir, pid, cmodel, label):
    """Rename OBJ.jpe to OBJ.jpg inside dir, if OBJ.jpe is present.

    dir is the object's output directory. pid, cmodel and label are part
    of the common plugin signature and are not used by this plugin.
    Returns None.
    """
    from os import path, rename
    jpe_path = path.join(dir, 'OBJ.jpe')
    # Nothing to do for objects that were not saved with the .jpe extension.
    if not path.exists(jpe_path):
        return
    rename(jpe_path, path.join(dir, 'OBJ.jpg'))
# Plugin to write out a CSV file listing the PID of each object, its
# content model, and its label.
def execute(dir, pid, cmodel, label):
    """Append one row (pid, cmodel, label) to manifest.csv.

    dir is the object's output directory; the manifest is written to
    os.path.dirname(dir), i.e. the directory that contains it, so every
    object saved under the same output directory shares one manifest.
    Returns None.
    """
    import os
    import csv
    manifest_path = os.path.join(os.path.dirname(dir), 'manifest.csv')
    # The with block closes the file even if writerow() raises, so a
    # failed write does not leak the open handle.
    with open(manifest_path, 'a') as manifest_file:
        csv.writer(manifest_file).writerow([pid, cmodel, label])
# Fetch Islandora Collections
Command-line tool for downloading collections from Islandora.
Command-line tool for downloading collections from Islandora. Currently only works with single-file content types.
## Requirements
* On the target Islandora instance
* [Islandora REST](https://github.com/discoverygarden/islandora_rest)
* On the system where the script is run
* Python 2.7. Will not run on Python 3.
## Installation
......@@ -19,7 +20,32 @@ Command-line tool for downloading collections from Islandora.
`python fetch_collection.py`
Select a collection, specify an output directory, and hit enter.
Agree to the terms of use, select a collection, specify an output directory, and hit enter. Output will look like this (assuming you specified `output` as your output directory):
```
output/
├── islandora%3A1
│   ├── DC.xml
│   └── OBJ.jpg
├── islandora%3A10
│   ├── DC.xml
│   └── OBJ.jpg
├── islandora%3A9
│   ├── DC.xml
│   └── OBJ.jpg
└── manifest.csv
```
Each Islandora object will be saved in its own subdirectory named after the object's PID; the directory name is encoded to make the `:` in the PID filesystem safe. In the output directory there will be a CSV file named `manifest.csv` listing for each object its PID, content model, and label.
### Plugins
Plugins are simple Python modules that act on the retrieved object just after it is saved to disk. Plugins are not registered; if a file ending in `.py` is in the `CollectionFetcherPlugins` subdirectory, its `execute` function will be executed, with the following as arguments:
* `dir`: the object's output directory
* `pid`: the object's PID
* `cmodel`: the object's content model, in the form `info:fedora/islandora:sp_basic_image`
* `label`: the object's label
## Maintainer
......
......@@ -4,21 +4,44 @@
site_base_url = 'http://digital.lib.sfu.ca/'
exclude_collections = [
'islandora:root',
'scrbpreservation:collection',
'sfudigitalcollections:1',
'test:collection',
'bcbookworld:collection',
'ubc:collection',
'ubcjap:collection',
'ubcscot:collection',
'choddar:collection',
'art:collection',
'sfulibr:collection',
'islandora:bookCollection',
'cubanovels:collection'
]
import sys
import os
import urllib
import urllib2
import json
import mimetypes
import fnmatch
reload(sys)
sys.setdefaultencoding('utf8')
mimetypes.init()
sys.path.append(os.path.join(os.getcwd(), 'CollectionFetcherPlugins'))
# For debugging purposes.
import pprint
pp = pprint.PrettyPrinter(indent=4)
tos_response_ok = ['y', 'Y']
tos_response = raw_input("I agree to use the downloaded content for research and study purposes only, and to not publish or redistribute it as a data set. (y/n): ")
if tos_response not in tos_response_ok:
exit()
model = 'islandora:collectionCModel'
collections_query = 'RELS_EXT_hasModel_uri_t:"' + model + '"?sort=fgs_label_s+asc&fl=PID,fgs_label_t&rows=1000000'
......@@ -30,21 +53,22 @@ def writeContentFile(path, content):
file.write(content)
file.close()
def writeManifestFile(path, content):
    """Append content followed by a newline to the file at path.

    The file is created if it does not exist. Returns None.
    """
    # The with block closes the file even if write() raises.
    with open(path, 'a') as manifest_file:
        manifest_file.write(content + '\n')
# Get list of collections, with the ones identified in exclude_collections filtered out.
collections_response_body = urllib2.urlopen(collections_request_url).read()
collections_response_body_dict = json.loads(collections_response_body)
collections = collections_response_body_dict['response']['docs']
# Drop every collection whose PID is listed in exclude_collections.
# The filtered list is built first and then assigned back into the same
# list object: calling remove() on the list while iterating over it shifts
# the remaining items, so the entry following each removed one was skipped
# and some excluded collections (e.g. islandora:root) were left in.
collections[:] = [collection for collection in collections if collection['PID'] not in exclude_collections]
# Print the collection menu.
# Entries are numbered starting at 1; collections_menu maps each number to
# its collection dict.
index = 0
collections_menu = {}
for collection in collections:
index += 1
collections_menu[index] = collection
# NOTE(review): the two print statements below emit the same entry in two
# formats, which looks like an old and a new version of one line - confirm
# that only one of them is intended.
print str(index) + ' - ' + collection['fgs_label_t']
print str(index) + '\t' + collection['fgs_label_t'] + ' - ' + collection['PID']
collection_index = raw_input("Enter the number of the collection you want to download: ")
collection_index = int(collection_index)
......@@ -61,6 +85,7 @@ output_directory = raw_input("Enter the output directory where you want the coll
if not os.path.exists(output_directory):
os.makedirs(output_directory)
# Query Solr for all objects in the selected collection.
collection_pid = collections_menu[collection_index]['PID'];
objects_query = 'RELS_EXT_isMemberOfCollection_uri_t:"' + collection_pid + '"?sort=fgs_label_s+asc&fl=PID,fgs_label_t,RELS_EXT_hasModel_uri_t&rows=1000000'
objects_request_url = site_rest_url + 'solr/' + objects_query
......@@ -69,22 +94,19 @@ objects_response_body_dict = json.loads(objects_response_body)
objects = objects_response_body_dict['response']['docs']
for object in objects:
manifest_entry = object['PID'] + '\t' + object['RELS_EXT_hasModel_uri_t'] + '\t' + object['fgs_label_t']
writeManifestFile(os.path.join(output_directory, 'manifest.tsv'), manifest_entry)
print("Retrieving content from " + site_base_url + "islandora/object/" + object['PID'])
escaped_pid = urllib.quote_plus(object['PID'])
object_output_directory = os.path.join(output_directory, escaped_pid)
os.makedirs(object_output_directory)
# DC
# Write out DC.xml.
dc_url = site_base_url + 'islandora/object/' + object['PID'] + '/datastream/DC/download'
dc_xml = urllib2.urlopen(dc_url).read()
dc_xml_path = os.path.join(object_output_directory, 'DC.xml')
writeContentFile(dc_xml_path, dc_xml)
# We will want to be able to control on a per-cmodel basis which datasterams
# get downloaded, e.g. for large image, it's the JPG, not the OBJ.
# @todo: We will want to be able to control on a per-cmodel basis which
# datastreams get downloaded, e.g. for large image, it's the JPG, not the OBJ.
single_file_cmodels = [
'info:fedora/islandora:sp_basic_image',
'info:fedora/islandora:sp_large_image_cmodel',
......@@ -101,4 +123,11 @@ for object in objects:
obj_content_path = os.path.join(object_output_directory, 'OBJ' + extension)
writeContentFile(obj_content_path, obj_content)
# Apply plugins.
# Every file matching *.py in ./CollectionFetcherPlugins is treated as a
# plugin; the path is relative to the current working directory.
plugins = fnmatch.filter(os.listdir('./CollectionFetcherPlugins'), '*.py')
for plugin_filename in plugins:
# Strip the extension to get the module name to import.
plugin_name, ext = os.path.splitext(plugin_filename)
plugin = __import__(plugin_name)
# Each plugin's execute() receives the object's output directory, PID,
# content model and label.
plugin.execute(object_output_directory, object['PID'], object['RELS_EXT_hasModel_uri_t'], object['fgs_label_t'])
print("Output is in " + os.path.abspath(output_directory))
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment