-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjournalAnalsysis.py
175 lines (142 loc) · 4.14 KB
/
journalAnalsysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
"""
A suite of programs to perform analysis on the TEI-markup Hunt Journals.
James Truitt
July 2021
"""
from glob import glob
import xml.etree.ElementTree as ET
def loadData():
print("Loading XML data...")
# Get list of TEI filenames
filenames = glob("../obf-site/src/assets/pid-tei/*.xml")
# Initialize container for xml data
xmls = []
# Loop over filenames, extracting the XML data in each & adding it to xmls
for filename in filenames:
print("Reading {}".format(filename + "..."), end="")
xml = ET.parse(filename)
xmls.append(xml)
print("\tdone")
print("XML data successfully loaded!")
return xmls
def getBoolCooccurrences(tei):
"""
For each div in the XML, returns a list of all the Ark IDs in that div
@param: tei, an XML file encoded according to the TEI standard
@return: a list of lists of Ark Ids
"""
# define namespace
ns = {'x':'http://www.tei-c.org/ns/1.0'}
# Get root element of tei
root = tei.getroot()
# Get iterable of divs
body = root[1].find("x:body", ns)
divs = body.iterfind("x:div", ns)
# Create container for entries
entries = []
# Loop over divs, extracting persName tags
for div in divs:
# Create a container for the persNames
entryNames = []
#print(div.find('{http://www.tei-c.org/ns/1.0}p'))
for para in div.iterfind('{http://www.tei-c.org/ns/1.0}p'):
for name in para.iterfind('{http://www.tei-c.org/ns/1.0}persName'):
if name.attrib["key"] in entryNames:
continue
else:
entryNames.append(name.attrib["key"])
entries.append(entryNames)
return entries
def getCountCoocurrences(tei):
return []
def main():
print("\n")
# Load the TEI data from files
teis = loadData()
print("\n")
# Initialize a holder for our data
data = []
print("Extracting Ark Id lists from TEI...\t")
# Loop over the TEI files
for journal in teis:
# From each TEI, extract set of sets of ARKids cooccurring in entries
collocs = getBoolCooccurrences(journal)
# Add that set to data holder
data += collocs
print("done!\n")
# Create a parallel list of numbers of names per entry
# lengths = []
# for entry in data:
# lengths.append(len(entry))
# print(lengths)
print("Filter out empty entries...\t")
# Filter entries without names out of data
# (loop over list backward, removing empty lists)
i = len(data)
while i > 0:
i += -1
if len(data[i]) == 0:
del data[i]
print("done!\n")
#### Calculate some statistics #####
# Loop over entries to count occurrences of each name
namecounts = {}
for entry in data:
for name in entry:
if name in namecounts.keys():
namecounts[name] += 1
else:
namecounts[name] = 1
nameList = list(namecounts.keys())
nameList.sort()
# Print list of names
for name in nameList:
print(name+"\t"+str(namecounts[name]))
## Create cooccurrence matrix ##
# # Initialize matrix
# matrix = [[0 for i in nameList] for j in nameList]
#
# # Loop over data and add to matrix
# for entry in data:
# for name1 in entry:
# mainIndex1 = nameList.index(name1)
# entryIndex = entry.index(name1)
# for name2 in entry:
# mainIndex2 = nameList.index(name2)
# matrix[mainIndex1][mainIndex2] += 1
#
# # Print matrix
# for i in nameList:
# print("_"+i, end="")
# print("")
# for i in range(len(nameList)):
# print(nameList[i], end="_")
# for k in range(len(nameList)):
# print(matrix[i][k], end="_")
# print("")
print("Analyzing coocurrence data...\t",end="")
# Create a list of name pairs
namePairs = {}
for i in nameList:
index = nameList.index(i)
for k in nameList[index+1:]:
key = i +'\t'+ k
namePairs[key] = 0
keys = list(namePairs.keys())
# keys.sort()
# Initalize the TSV that will eventually get written to a file
output = "Source\tTarget\tWeight\n"
# Loop over name-pairs, counting their occurrence in the data
for pair in keys:
for entry in data:
# Check to see if both halves of the key occur in this entry
if pair[:8] in entry and pair[-8:] in entry:
namePairs[pair] += 1
if namePairs[pair] > 0:
output += pair+'\t'+ str(namePairs[pair]) +'\n'
print("done!\n")
print("Writing data to cooccurrences.tsv...\t")
with open("cooccurrences.tsv","w") as f:
f.write(output)
print("done!\n")
main()