Commit 272bc90d authored by Alex Garnett

port to python 3

parent 32c52b02
@@ -5,7 +5,6 @@ def execute(dir, pid, cmodel, label):
     import csv
     manifest_entry = [pid, cmodel, label]
     path = os.path.dirname(dir)
-    manifest_file = open(os.path.join(path, 'manifest.csv'), 'a')
-    manifest_writer = csv.writer(manifest_file)
-    manifest_writer.writerow(manifest_entry)
-    manifest_file.close()
+    with open(os.path.join(path, 'manifest.csv'), 'a', encoding="utf-8") as manifest_file:
+        manifest_writer = csv.writer(manifest_file)
+        manifest_writer.writerow(manifest_entry)
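One further refinement this hunk could take (it is not part of the commit): in Python 3, `csv.writer` expects the underlying file to be opened with `newline=''`, otherwise appended rows can be separated by blank lines on Windows. A minimal sketch of the ported helper with that flag added:

```python
import csv
import os

def execute(dir, pid, cmodel, label):
    """Append one manifest row; newline='' keeps csv from inserting blank lines."""
    manifest_entry = [pid, cmodel, label]
    path = os.path.dirname(dir)
    with open(os.path.join(path, 'manifest.csv'), 'a', newline='', encoding="utf-8") as manifest_file:
        csv.writer(manifest_file).writerow(manifest_entry)
```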
@@ -9,7 +9,7 @@ Currently only works with single-file content types.
 * On the target Islandora instance
   * [Islandora REST](https://github.com/discoverygarden/islandora_rest)
 * On the system where the script is run
-  * Python 2.7. Will not run on Python 3.
+  * Python 3
 ## Installation
@@ -22,14 +22,13 @@ exclude_collections = [
 import sys
 import os
-import urllib
-import urllib2
+import urllib.request
+import requests
+import urllib.parse
 import json
 import mimetypes
 import fnmatch
-reload(sys)
-sys.setdefaultencoding('utf8')
 mimetypes.init()
 sys.path.append(os.path.join(os.getcwd(), 'CollectionFetcherPlugins'))
@@ -38,7 +37,7 @@ import pprint
 pp = pprint.PrettyPrinter(indent=4)
 tos_response_ok = ['y', 'Y']
-tos_response = raw_input("I agree to use the downloaded content for research and study purposes only, and to not publish or redistribute it as a data set. (y/n): ")
+tos_response = input("I agree to use the downloaded content for research and study purposes only, and to not publish or redistribute it as a data set. (y/n): ")
 if tos_response not in tos_response_ok:
     exit()
@@ -48,13 +47,8 @@ collections_query = 'RELS_EXT_hasModel_uri_t:"' + model + '"?sort=fgs_label_s+as
 site_rest_url = site_base_url + 'islandora/rest/v1/'
 collections_request_url = site_rest_url + 'solr/' + collections_query
-def writeContentFile(path, content):
-    file = open(path, 'w')
-    file.write(content)
-    file.close()
-
 # Get list of collections, with the ones identified in exclude_collections filtered out.
-collections_response_body = urllib2.urlopen(collections_request_url).read()
+collections_response_body = urllib.request.urlopen(collections_request_url).read().decode('utf-8')
 collections_response_body_dict = json.loads(collections_response_body)
 collections = collections_response_body_dict['response']['docs']
 # todo: Some collections in exclude_collections are not being removed, e.g., islandora:root.
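A sketch of one possible fix for the todo in this hunk, assuming `exclude_collections` holds bare PIDs such as `'islandora:root'`. One plausible (but unconfirmed) cause of the failed exclusion is that the Solr result carries the URI form `info:fedora/islandora:root` rather than the bare PID, so the sketch normalizes before comparing:

```python
# Hypothetical filter: strip the 'info:fedora/' prefix the Solr field may
# include before comparing against the bare PIDs in exclude_collections.
collections = [
    c for c in collections
    if c['PID'].replace('info:fedora/', '') not in exclude_collections
]
```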
@@ -68,20 +62,20 @@ collections_menu = {}
 for collection in collections:
     index += 1
     collections_menu[index] = collection
-    print str(index) + '\t' + collection['fgs_label_t'] + ' - ' + collection['PID']
+    print(str(index) + '\t' + collection['fgs_label_t'] + ' - ' + collection['PID'])
-collection_index = raw_input("Enter the number of the collection you want to download: ")
+collection_index = input("Enter the number of the collection you want to download: ")
 collection_index = int(collection_index)
 indexes = list(collections_menu.keys())
 # @todo: Validate user input to be an integer.
 if collection_index in indexes:
-    print collections_menu[collection_index]['fgs_label_t']
+    print(collections_menu[collection_index]['fgs_label_t'])
 else:
-    print "That isn't a valid collection_index. Bye for now."
+    print("That isn't a valid collection_index. Bye for now.")
     exit()
-output_directory = raw_input("Enter the output directory where you want the collection saved: ")
+output_directory = input("Enter the output directory where you want the collection saved: ")
 if not os.path.exists(output_directory):
     os.makedirs(output_directory)
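On the `@todo` about validating user input in this hunk: as written, a non-numeric entry makes `int(collection_index)` raise `ValueError` before the membership check ever runs. A minimal re-prompt loop, sketched against the script's own variable names, would cover it:

```python
# Hypothetical validation loop for the @todo above: keep asking until the
# input is an integer that maps to an entry in collections_menu.
while True:
    collection_index = input("Enter the number of the collection you want to download: ")
    try:
        collection_index = int(collection_index)
    except ValueError:
        print("Please enter one of the numbers listed above.")
        continue
    if collection_index in collections_menu:
        break
    print("That isn't a valid collection_index.")
```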
@@ -89,21 +83,22 @@ if not os.path.exists(output_directory):
 collection_pid = collections_menu[collection_index]['PID']
 objects_query = 'RELS_EXT_isMemberOfCollection_uri_t:"' + collection_pid + '"?sort=fgs_label_s+asc&fl=PID,fgs_label_t,RELS_EXT_hasModel_uri_t&rows=1000000'
 objects_request_url = site_rest_url + 'solr/' + objects_query
-objects_response_body = urllib2.urlopen(objects_request_url).read()
+objects_response_body = urllib.request.urlopen(objects_request_url).read().decode('utf-8')
 objects_response_body_dict = json.loads(objects_response_body)
 objects = objects_response_body_dict['response']['docs']
 for object in objects:
     print("Retrieving content from " + site_base_url + "islandora/object/" + object['PID'])
-    escaped_pid = urllib.quote_plus(object['PID'])
+    escaped_pid = urllib.parse.quote_plus(object['PID'])
     object_output_directory = os.path.join(output_directory, escaped_pid)
     os.makedirs(object_output_directory)
     # Write out DC.xml.
     dc_url = site_base_url + 'islandora/object/' + object['PID'] + '/datastream/DC/download'
-    dc_xml = urllib2.urlopen(dc_url).read()
+    dc_xml = urllib.request.urlopen(dc_url).read().decode('utf-8')
     dc_xml_path = os.path.join(object_output_directory, 'DC.xml')
-    writeContentFile(dc_xml_path, dc_xml)
+    with open(dc_xml_path, 'w', encoding="utf-8") as file:
+        file.write(dc_xml)
     # @todo: We will want to be able to control on a per-cmodel basis which
     # datastreams get downloaded, e.g. for large image, it's the JPG, not the OBJ.
@@ -116,12 +111,13 @@ for object in objects:
     ]
     if object['RELS_EXT_hasModel_uri_t'] in single_file_cmodels:
         obj_url = site_base_url + 'islandora/object/' + object['PID'] + '/datastream/OBJ/download'
-        obj_content_request = urllib2.urlopen(obj_url)
-        obj_content = obj_content_request.read()
-        mimetype = obj_content_request.info().getheader('Content-Type')
+        # Ported this routine from urllib to requests because it's much easier to work with in Python 3; the others can be ported as well.
+        obj = requests.get(obj_url)
+        mimetype = obj.headers["content-type"]
         extension = mimetypes.guess_extension(mimetype)
         obj_content_path = os.path.join(object_output_directory, 'OBJ' + extension)
-        writeContentFile(obj_content_path, obj_content)
+        with open(obj_content_path, 'wb') as file:
+            file.write(obj.content)
 # Apply plugins.
 plugins = fnmatch.filter(os.listdir('./CollectionFetcherPlugins'), '*.py')
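Following the comment in this hunk, the remaining `urllib.request.urlopen()` calls could be ported to `requests` the same way. A sketch against the script's existing variable names, in which `.json()` and `.text` replace the manual `read().decode('utf-8')` and `json.loads()` steps:

```python
# Solr queries: requests can decode the JSON bodies directly.
collections_response_body_dict = requests.get(collections_request_url).json()
objects_response_body_dict = requests.get(objects_request_url).json()

# DC.xml: .text yields an already-decoded string, so the write stays the same.
dc_xml = requests.get(dc_url).text
with open(dc_xml_path, 'w', encoding="utf-8") as file:
    file.write(dc_xml)
```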