-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimdi2untl.py
172 lines (149 loc) · 8.18 KB
/
imdi2untl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
##################################################
#
# Creator: [email protected]
# Date: July 2018
#
# Converts .imdi xml files generated by SayMore
# to UNTL xml for UNT Dept of Linguistics.
# Needs SayMore v3.1.4 and lower.
# Some code should already work with summer '18 SayMore release.
#
# Dependencies: sys, os, xmltodict (https://github.com/martinblech/xmltodict)
#
# From command line:
# python3 imdi2untl.py <path/to/IMDIPackages/sessionID> [<path/to/destination/folder>]  (destination defaults to <path/to/IMDIPackages/sessionID>)
#
#################################################
import xmltodict
import sys
import os
def convertDuration(ISOformat):
    """Convert an hh:mm:ss duration string to UNT style.

    Each non-zero field is followed by its unit label and the labelled
    fields are separated by single spaces, e.g. '01:02:03' becomes
    '01 hr. 02 min. 03 sec.' and '00:05:30' becomes '05 min. 30 sec.'.

    Args:
        ISOformat: duration string with colon-separated fields in the
            order hours, minutes, seconds. May be empty or None.

    Returns:
        The formatted duration, or '' when the input is empty/None or
        every field is zero.

    Raises:
        IndexError: if a non-zero fourth (or later) field is present,
            since there is no unit label for it.
    """
    if not ISOformat:
        # Nothing to convert; avoids emitting a dangling ' hr.' label.
        return ''
    components = [' hr.', ' min.', ' sec.']
    unt_time = []
    for idx, time in enumerate(ISOformat.split(':')):
        # A field made only of zeros (or blank) carries no information,
        # whatever its padding ('0', '00', ...), so it is left out.
        if time.strip('0') == '':
            continue
        unt_time.append(time + components[idx])
    return ' '.join(unt_time)
def xml2file(path, data=None):
    """Serialize a UNTL dict to pretty-printed XML and write it to *path*.

    Args:
        path: destination file path; an existing file is overwritten.
        data: dict to serialize with xmltodict.unparse. When omitted, the
            module-level ``unt_dict`` is written, which keeps existing
            ``xml2file(path)`` calls working unchanged.
    """
    if data is None:
        data = unt_dict
    # xmltodict.unparse declares encoding="utf-8" in the XML prolog by
    # default, so the file is opened as UTF-8 to match, and the context
    # manager guarantees the handle is flushed and closed.
    with open(path, 'w', encoding='utf-8') as xml_file:
        xmltodict.unparse(data, output=xml_file, pretty=True)
def writeXML(resource, unt_dict, destination):
    """Fill in the per-resource UNTL fields and write one .untl file.

    The output name is the resource's base name followed by its upper-cased
    extension and '.untl' (e.g. 'session1.wav' -> 'session1WAV.untl').
    Resources whose extension is neither text (.eaf, .pdf) nor audio
    (.wav, .mp3) are ignored. Extensions are matched case-insensitively.

    Args:
        resource: dict for one IMDI resource; must contain 'ResourceLink',
            and for audio resources 'TimePosition' -> 'End'.
        unt_dict: UNTL dict whose 'metadata' entry is updated in place.
        destination: folder the .untl file is written to.

    Note:
        xml2file(newpath) is called with the path only, so it serializes
        the module-level unt_dict; the unt_dict passed here is expected to
        be that same object, as it is for the calls made by this script.
    """
    filename = os.path.basename(resource['ResourceLink'])
    resource_name, resource_ext = os.path.splitext(filename)
    newfilename = resource_name + resource_ext[1:].upper() + '.untl'
    newpath = os.path.join(destination, newfilename)

    # Work on the dict that was passed in rather than a module global.
    metadata = unt_dict['metadata']
    extension = resource_ext.lower()

    if extension in ('.eaf', '.pdf'):
        # Resource type.
        metadata['resourceType'] = 'text'
        # Format.
        metadata['format'] = 'text'
        # Physical description is left blank for text resources
        # (TODO: possibly the file size, resource['Size']).
        metadata['description'][1]['#text'] = ''
        # Primary-source flag is left blank for text resources.
        metadata['primarySource'] = ''
        # TODO: record which media file this text is based on
        # (resource['MediaResourceLink']).
        xml2file(newpath)
    elif extension in ('.wav', '.mp3'):
        # Resource type.
        metadata['resourceType'] = 'sound'
        # Format. Only 'audio', or 'audio/wav' like IMDI?
        metadata['format'] = 'audio'
        # Physical description: duration of the recording.
        metadata['description'][1]['#text'] = '1 recording (%s)' % convertDuration(
            resource['TimePosition']['End'])
        # Audio is assumed to be the primary source. 1 = True
        metadata['primarySource'] = '1'
        # TODO: record the relation to the derived text file.
        xml2file(newpath)
# ############# MAIN ##################
# Converts every .imdi file found in the source folder (first argument) and
# writes one .untl file per matching resource into the destination folder.
if len(sys.argv) < 2:
    # Fail with a usage message instead of a bare IndexError.
    sys.exit('Usage: python3 %s <path/to/IMDIPackages/sessionID> '
             '[<path/to/destination/folder>]' % sys.argv[0])

source_folder = sys.argv[1]
# Where to save UNTL files; defaults to the folder holding the .imdi files.
dest_folder = sys.argv[2] if len(sys.argv) >= 3 else source_folder

# Loop over the directory and convert each file with an .imdi extension.
for filename in os.listdir(source_folder):
    if not filename.endswith('.imdi'):
        continue
    print(filename)
    filepath = os.path.join(source_folder, filename)
    # Read raw bytes so the XML parser applies the encoding declared in the
    # file itself instead of the platform's default text encoding.
    with open(filepath, 'rb') as imdi_file:
        # IMDI xml to dict
        imdi_dict = xmltodict.parse(imdi_file.read())

    # Fresh UNTL template for every session. The names unt_dict and
    # unt_prefix stay at module level because the helper functions read them.
    unt_dict = {
        'metadata': {
            'title': {'@qualifier': 'officialtitle', '#text': ''},
            'creator': {'@qualifier': 'rth', 'type': 'per', 'name': 'Chelliah, Shobhana'},  # hard-coded
            'contributor': [{'@qualifier': 'rth', 'type': 'per', 'name': 'Chelliah, Shobhana'}],  # hard-coded
            'date': {'@qualifier': 'creation', '#text': '2008'},  # hard-coded
            'language': '',
            'description': [{'@qualifier': 'content', '#text': ''}, {'@qualifier': 'physical', '#text': ''}],
            'subject': [{'@qualifier': 'LCSH', '#text': 'Linguistics.'}, {'@qualifier': 'KWD', '#text': ''},
                        {'@qualifier': 'KWD', '#text': ''}],
            'primarySource': '',
            'source': {'@qualifier': 'other', '#text': 'Data collection'},  # hard-coded
            'collection': 'SAALT',  # hard-coded
            'institution': 'UNTCOI',  # hard-coded
            'rights': {'@qualifier': 'access', '#text': 'public'},  # hard-coded
            'resourceType': '',
            'format': '',
            'identifier': {'@qualifier': 'LOCAL-CONT-NO', '#text': ''},
            # These are hard-coded, except ark and metadataModificationDate
            'meta': [{'@qualifier': 'metadataCreator', '#text': 'htarver'},
                     {'@qualifier': 'system', '#text': 'DC'},
                     {'@qualifier': 'ark', '#text': ''},
                     {'@qualifier': 'metadataCreationDate', '#text': '2018-04-24, 12:04:05'},
                     {'@qualifier': 'metadataModifier', '#text': 'mrobinson'},
                     {'@qualifier': 'metadataModificationDate', '#text': ''},
                     {'@qualifier': 'hidden', '#text': 'False'}]
        }}
    imdi_prefix = imdi_dict['METATRANSCRIPT']['Session']
    unt_prefix = unt_dict['metadata']
    md_group = imdi_prefix['MDGroup']

    # FROM SESSION METADATA
    # Copy Title. imdi Title --> unt title
    unt_prefix['title']['#text'] = imdi_prefix['Title']

    # Creator is hard-coded for now.
    # TODO: take the researcher/recorder name from Settings ("Recorded by")
    # and add it as a contributor with qualifier="rth".

    # Copy all Actors (from People) as contributors. xmltodict yields a
    # single dict for one Actor and a list for several, so normalise first.
    actors = md_group['Actors']
    if actors is not None:
        actor_entries = actors['Actor']
        if not isinstance(actor_entries, list):
            actor_entries = [actor_entries]
        for actor in actor_entries:
            unt_prefix['contributor'].append({'type': 'per', 'name': actor['FullName']})

    # Date is hard-coded for now.
    # TODO: copy it from the SayMore custom field (imdi_prefix['Date']).

    # Copy imdi Content Language as language.
    languages = md_group['Content']['Languages']
    if languages is not None:
        language_entries = languages['Language']
        if isinstance(language_entries, list):
            # Several languages: keep the one described as the content language.
            for language in language_entries:
                if language['Description']['#text'] == 'Content Language':
                    unt_prefix['language'] = language['Id']
        else:
            # A single language is taken as-is.
            unt_prefix['language'] = language_entries['Id']

    # Copy description as unt content description
    unt_prefix['description'][0]['#text'] = imdi_prefix['Description']

    # TODO: copy subject keywords.

    # Copy session identifier/IMDI Name as UNT identifier
    unt_prefix['identifier']['#text'] = imdi_prefix['Name']

    # Create a UNTL xml for each WAV and EAF file in a SayMore session.
    # An empty <Resources/> element parses to None, so fall back to no
    # resources instead of crashing.
    resources = imdi_prefix['Resources'] or {}
    for resource in resources.values():
        # One resource of a kind is a dict, several are a list of dicts.
        entries = resource if isinstance(resource, list) else [resource]
        for entry in entries:
            if not isinstance(entry, dict):
                # Empty or text-only child elements carry no ResourceLink.
                continue
            link = entry.get('ResourceLink')
            # Only files named after the session belong to it.
            if link and link.startswith(imdi_prefix['Name']):
                writeXML(entry, unt_dict, dest_folder)