-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathextract.pl
320 lines (271 loc) · 10.8 KB
/
extract.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
#!/usr/bin/local/perl
# Perl script for extracting MD&A from 10-k forms.
# by Cheong Yiufung @ HKUST
# instructed by Prof. Allen Huang and with help from Xia Jingjing.
# 09/02/12
# Usage:
# perl extract.pl <filename>
# The script will then generate <filename>_mda and <filename>_quant if the corresponding item is extracted.
# Whether it successfully extracts or not, it will generate <filename>_plaintext, which contains the plain text after decoding from HTML.
# You can feed <filename>_plaintext to extract_MDA.pl to check the difference.
# Use rm *_mda *_quant *_plaintext to remove generated files.
######## Declaration of packages used #########
package HTMLStrip;
use base "HTML::Parser";
use HTML::Entities;
use utf8;
######## Declaration and Definition of functions #######
# All regular expressions use 4 modifiers: i, x, s, and g.
# i for case insensitivity, so "ITEM" is the same as "item".
# x for allowing whitespace and comments inside regular expressions, which improves readability.
# s for single-line mode, so . matches literally anything, even \n.
# g for global matching.
# Note: a ? added after + or * (such as .*? or \s+?) makes the quantifier non-greedy.
# Locate the MD&A heading (Item 7) in the global $content and replace each
# occurrence that is neither quoted nor part of a cross-reference with the
# marker "######SPLIT MDA######" (one newline before and one after it).
# Writes the globals $content and $change ($change receives the result of
# the substitution, i.e. how many headings were replaced).
# If nothing was replaced, TryMDA6() gets a chance to find the MD&A under
# Item 6. While $MDAinItem6 is 0, an Item 7 that is headed "financial
# statements" is reported with a printed message.
sub MDA {
    $change = $content =~ s/
        (?<!\")(?<!\,\ )(?<!in\ )(?<!and\ )(?<!or\ )(?<!not\ )(?<!see\ )(?<!to\ )(?<!with\ )(?<!under\ )(?<!regarding\ )(?<!by\ )(?<!the\ )(?<!caption\ )(?<!read\ )(?<!at\ )(?<!following\ )(?<!both\ )(?<!also\ )(?<!of\ )(?<!within\ )
        ###IMPORTANT COMMENT###
        # This part declare words and symbols that CANNOT appear BEFORE the item.
        # include : " , in and see to with under regarding by the caption read at following both
        # This means the item is not being quoted nor referenced, hence escape from all the quoting rules and refernce sentences.
        # Explanations on rules:
        # after ", there is no space, so use (?<!\")
        # after word, there should be one space, so use (?<!word\ )
        # Add new rules according to above patterns. Possibly syntax could be simpler but I haven't found the way, so you have to do this tedious work, sorry.
        # IMPORTANT REMINDER: also update the words in subroutine Quantitative and setStopSign. You should update 4 regex in total (2 in setStopSign).
        ###Continue of Regex###
        item\s+?7
        # item and arbitrary number of spaces.
        [^Aa]*?
        # anything in between except A, so won't match 7A.
        .{0,25}
        #anything in between for at most 25 characters. it match item 6, 7 and other situations.
        #also cause problems of overrunning.
        management.?s?.?\s+?discussions?\s+?and\s+?analysis\s+?
        # the necessary part
        (of\s+?financial\s+?conditions?|of\s+?results\s+?of\s+?operations?|or\s+?plans?\s+?of\s+?operations?)?
        # "financial conditions" and "results of operations" may come in different order.
        (\s+?and\s+?results\s+?of\s+?operations?|\s+?and\s+?financial\s+?conditions?)?
        # the latter part may be unnecessary, so ? mark is used in the last.
    /\n######SPLIT MDA######\n/gixs;

    # Nothing replaced: big chance that the MD&A is hidden in Item 6.
    if (!$change) {
        print "Try to find MDA in Item 6. \n";
        TryMDA6();
    }

    # The remaining check only applies while Item 6 has not been accepted
    # as the MD&A.
    return if $MDAinItem6 != 0;

    # Item 7 is Financial statements, however, Item 6 is NOT MDA.
    # The message below is meant to be caught by a separate python script.
    my $item7IsStatements = $content =~ m/
        item\s+?7
        [^Aa]*?
        (Consolidated)?\s+?financial\s+?statements?
    /gixs;
    print "Invalid Item 7: Financial statements found. \n" if $item7IsStatements;
}
# Fallback used by MDA() when no MD&A heading was found under Item 7:
# look for the MD&A under Item 6 instead.
# Two heading forms are accepted, and each match in the global $content is
# replaced by the marker "######SPLIT MDA######" (newline before and after):
#   1. "Item 6 ... management's discussion and analysis ..."
#   2. "Item 6 ... [management's] plan(s) of operation(s)"
# Writes the globals $content and $change, and sets $MDAinItem6 to 1 as soon
# as either form was replaced at least once. The flag is never cleared here.
sub TryMDA6 {
    # Form 1: Item 6 is the MD&A itself.
    $change = $content =~ s/
        (?<!\")(?<!\,\ )(?<!in\ )(?<!and\ )(?<!or\ )(?<!not\ )(?<!see\ )(?<!to\ )(?<!with\ )(?<!under\ )(?<!regarding\ )(?<!by\ )(?<!the\ )(?<!caption\ )(?<!read\ )(?<!at\ )(?<!following\ )(?<!both\ )(?<!also\ )(?<!of\ )(?<!within\ )
        item\s+?6
        [^Aa]*?
        .{0,25}
        management.?s?.?\s+?discussions?\s+?and\s+?analysis\s+?
        (of\s+?financial\s+?conditions?|of\s+?results\s+?of\s+?operations?|or\s+?plans?\s+?of\s+?operations?)?
        (\s+?and\s+?results\s+?of\s+?operations?|\s+?and\s+?financial\s+?conditions?)?
    /\n######SPLIT MDA######\n/gixs;
    $MDAinItem6 = 1 if $change != 0;

    # Form 2: Item 6 is a "Plan of operations", which is treated as MD&A too.
    $change = $content =~ s/
        (?<!\")(?<!\,\ )(?<!in\ )(?<!and\ )(?<!or\ )(?<!not\ )(?<!see\ )(?<!to\ )(?<!with\ )(?<!under\ )(?<!regarding\ )(?<!by\ )(?<!the\ )(?<!caption\ )(?<!read\ )(?<!at\ )(?<!following\ )(?<!both\ )(?<!also\ )(?<!of\ )(?<!within\ )
        item\s+?6
        [^Aa]*?
        .{0,25}
        (management.?s?.?\s+?)?plans?\s+?of\s+?operations?\s+?
    /\n######SPLIT MDA######\n/gixs;
    $MDAinItem6 = 1 if $change != 0;
}
# Mark the start of the market-risk section (Item 7A) in the global $content.
# Every match of "item 7a" (also "7.a" / "7 a"), followed by any text up to
# "(quantitative|qualitative) and (quantitative|qualitative|qualification)
# disclosure(s) about market risk", is replaced by a newline, the marker
# "######SPLIT QUANT######" and another newline.
# The leading lookbehinds skip occurrences that directly follow a double
# quote, ", " or one of the listed reference words plus a space, i.e.
# headings that are being quoted or referred to.
# Note: the .*? between the item number and the title is non-greedy but
# unbounded, and because of the s modifier it also crosses line breaks.
# The sub reads and writes only $content; it has no explicit return value.
sub Quantitative{
$content =~ s/
(?<!\")(?<!\,\ )(?<!in\ )(?<!and\ )(?<!or\ )(?<!not\ )(?<!see\ )(?<!to\ )(?<!with\ )(?<!under\ )(?<!regarding\ )(?<!by\ )(?<!the\ )(?<!caption\ )(?<!read\ )(?<!at\ )(?<!following\ )(?<!both\ )(?<!also\ )(?<!of\ )(?<!within\ )
item\s+?7\.?\s*a # Sometimes may be 7.A
.*?
(quantitative|qualitative)\s+?and\s+?(quantitative|qualitative|qualification)\s+?disclosures?\s+?about\s+?market\s+?risk
/
######SPLIT QUANT######
/gixs;
}
# Insert stop markers into the global $content. A stop marker is the text
# "######SPLIT STOPSIGN######" with one newline before and one after it; the
# extraction loop ends an MD&A or market-risk segment when it reaches one.
#
# Headings that are replaced by a stop marker:
#   1. "Item 8 ... financial statements"
#   2. "Item 9" or "Item 9A" ... "changes in and "
#   3. only when $MDAinItem6 is true: "Item 7 ... [Consolidated] financial
#      statement(s)", because Item 7 is then probably the financial
#      statements section rather than the MD&A.
# Patterns 1 and 2 use the same lookbehind list as MDA() and Quantitative()
# so that quoted or cross-referenced headings are ignored.
#
# Reads: $content, $MDAinItem6.  Writes: $content.
sub setStopSign {
    # 1. Item 8: financial statements.
    $content =~ s/
        (?<!\")(?<!\,\ )(?<!in\ )(?<!and\ )(?<!or\ )(?<!not\ )(?<!see\ )(?<!to\ )(?<!with\ )(?<!under\ )(?<!regarding\ )(?<!by\ )(?<!the\ )(?<!caption\ )(?<!read\ )(?<!at\ )(?<!following\ )(?<!both\ )(?<!also\ )(?<!of\ )(?<!within\ )
        item\s+?8
        .*?
        financial\s+?statements
    /\n######SPLIT STOPSIGN######\n/gixs;

    # 2. Item 9 / 9A: "changes in and ...".
    # The gap between the item number and the title must be NON-greedy.
    # With the s modifier a greedy .* runs from the first "item 9" to the
    # LAST "changes in and" of the whole document and deletes everything in
    # between, including markers that were inserted earlier.
    $content =~ s/
        (?<!\")(?<!\,\ )(?<!in\ )(?<!and\ )(?<!or\ )(?<!not\ )(?<!see\ )(?<!to\ )(?<!with\ )(?<!under\ )(?<!regarding\ )(?<!by\ )(?<!the\ )(?<!caption\ )(?<!read\ )(?<!at\ )(?<!following\ )(?<!both\ )(?<!also\ )(?<!of\ )(?<!within\ )
        item\s+?9a?
        .*?
        changes\s+?in\s+?and\s+?
    /\n######SPLIT STOPSIGN######\n/gixs;

    # 3. SPECIAL: when the MD&A was found in Item 6, "Item 7 Financial
    # Statements" is probably a stop sign as well.
    if ($MDAinItem6) {
        $content =~ s/
            item\s+?7
            [^Aa]*?
            (Consolidated)?\s+?financial\s+?statements?
        /\n######SPLIT STOPSIGN######\n/gixs;
    }
}
# Script-wide state, kept in package globals that the subs in this file read
# and write directly.
#   $debug            - 1 turns the progress messages on.
#   $MDAinItem6       - "Item 6 as MDA" mode, switched on in TryMDA6().
#                       In that situation Item 7 is probably the financial
#                       statements, so setStopSign() additionally uses
#                       "Item 7 financial statements" as a stop sign.
#   $contentInOneLine - accumulates all the text handed to text().
($debug, $MDAinItem6, $contentInOneLine) = (1, 0, "");
print "before extract.\n" if $debug == 1;
# Overrides the text method of HTML::Parser: every chunk of text the parser
# hands over is appended to the global $contentInOneLine.
#   $self - the parser object (not used here)
#   $text - the text chunk to append
sub text {
    my $self  = shift;
    my $chunk = shift;
    $contentInOneLine .= $chunk;
}
############# Start of Process #############
# Read the input file and push it through the HTML parser.
# The file name is the first command-line argument; it is kept in the global
# $filename because the output file names are derived from it later.
$filename = $ARGV[0];
die "Usage: perl extract.pl <filename>\n" unless defined $filename;

# Three-argument open with a lexical handle: the name is taken literally, so
# characters such as '>' or '|' in it cannot change the open mode.
open(my $inputHandle, '<', $filename)
    or die "$filename cannot be open: $!\n";

my $p = HTMLStrip->new;
while (my $line = <$inputHandle>) {
    # function called procedures:
    # parse($line) -> text(...) -> added into $contentInOneLine.
    $p->parse($line);
}
$p->eof;    # flush the parser.
close $inputHandle;
if($debug==1){ print "file read.\n";}
# By now the whole document text is held in one string.
# Before extracting, turn all HTML entities into the characters they stand for.
$plainTextInOneLine = HTML::Entities::decode($contentInOneLine);

# Map typographic punctuation to plain ASCII, so that the quote and word
# rules of the heading regexes see ordinary characters.
# Not a full list, but it should be enough to get rid of the strange
# characters and help the substitutions in the regexes.
#
# NOTE(review): six of the original substitutions had an EMPTY pattern in
# the reviewed copy (the literal characters seem to have been lost). They
# are written with explicit escapes here, reconstructed from the
# replacement values (' " " . - -) as the Windows-1252 punctuation code
# points 0x91-0x97 and 0x85 plus their Unicode equivalents - TODO confirm
# against the original bytes.
#
# 1) Raw UTF-8 byte sequences of undecoded input. These go first, otherwise
#    their continuation bytes would be mistaken for the single code points
#    handled in step 2.
$plainTextInOneLine =~ s/\xe2\x80[\x98\x99]/'/g;    # single quotes
$plainTextInOneLine =~ s/\xe2\x80[\x9c\x9d]/"/g;    # double quotes
$plainTextInOneLine =~ s/\xe2\x80\xa6/./g;          # ellipsis
$plainTextInOneLine =~ s/\xe2\x80[\x93\x94]/-/g;    # en dash, em dash
$plainTextInOneLine =~ s/\xc2\xa0/ /g;              # no-break space
# 2) Single code points, e.g. produced by decoding numeric entities.
$plainTextInOneLine =~ s/[\x{91}\x{92}\x{2018}\x{2019}]/'/g;
$plainTextInOneLine =~ s/[\x{93}\x{94}\x{201C}\x{201D}]/"/g;
$plainTextInOneLine =~ s/[\x{85}\x{2026}]/./g;
$plainTextInOneLine =~ s/[\x{96}\x{97}\x{2013}\x{2014}]/-/g;
$plainTextInOneLine =~ s/\xa0/ /g;

# Get rid of the remaining non-ASCII characters to avoid warnings like
# "Wide character in print". Hopefully dumping several characters (normally
# they're space or some strange characters) won't hurt the text. Comment it
# out if you wish.
$plainTextInOneLine =~ s/[^[:ascii:]]+//g;

# Start from the 5% position to skip the table of contents part.
$startPos = int(length($plainTextInOneLine) * 0.05);
$content = substr($plainTextInOneLine, $startPos);
print "before matching. \n" if $debug == 1;
# Matching and substitution: run the three marker passes over $content in a
# fixed order (MD&A headings, market-risk headings, stop signs), announcing
# the start and the end of each pass in debug mode.
for my $stage (
    [ 'MDA',      \&MDA ],
    [ 'QUANT',    \&Quantitative ],
    [ 'STOPSIGN', \&setStopSign ],
) {
    my ($label, $pass) = @{$stage};
    print "$label starts matching. \n" if $debug == 1;
    $pass->();
    print "$label fin matching. \n" if $debug == 1;
}
# Collect the text that belongs to the marker found at index $start.
#   $pieces - reference to the array produced by splitting on "######"
#   $start  - index of the marker piece ("SPLIT MDA" or "SPLIT QUANT")
#   $stopRe - compiled regex; collecting ends in front of a piece matching it
#   $note   - optional message printed once per visited piece (debug aid)
# Returns the list (collected text, index of the last piece consumed).
# The piece right after the marker is always consumed. Repeated "SPLIT MDA" /
# "SPLIT QUANT" marker pieces inside the section (e.g. "(Continued)"
# headings) are consumed but not copied, so the internal marker text does
# not leak into the output. The index never moves past the last element.
sub collectSegments {
    my ($pieces, $start, $stopRe, $note) = @_;
    my $lastIndex = $#{$pieces};
    my $collected = "";
    my $index     = $start;
    while ($index < $lastIndex) {
        my $piece = $pieces->[++$index];
        print $note if defined $note;
        $collected .= $piece unless $piece =~ m/^SPLIT (?:MDA|QUANT)$/s;
        last if $index < $lastIndex and $pieces->[$index + 1] =~ $stopRe;
    }
    return ($collected, $index);
}

# split lines into array.
@all = split /\#\#\#\#\#\#/, $content;
# prepare empty strings for output.
$outputMDA="";
$outputQuant="";
$existMDA=0;
$existQuant=0;
if($debug==1){ print "before extracting.\n";}
# now it's in such a pattern.
# sth######SPLIT MDA######Content of MDA######SPLIT QUANT######Content of QUANT
for($i = 0; $i < scalar(@all); ++$i){
    next unless $all[$i] =~ m/^SPLIT (MDA|QUANT|STOPSIGN)$/s;
    if($debug==1){ print "inside extraction.\n";}
    # now $all[$i+1] should store the string I want.
    if($all[$i] =~ m/^SPLIT MDA$/s){
        if($debug==1){ print "inside MDA.\n";}
        $existMDA=1;
        # Consecutive segments are added together: in some cases there are
        # multiple MDA (Continued) headings in the file. The MD&A ends at
        # the next market-risk marker or stop sign.
        my ($segmentText, $lastUsed) = collectSegments(
            \@all, $i, qr/^SPLIT (?:QUANT|STOPSIGN)$/s,
            ($debug==1 ? "adding MDA.\n" : undef));
        $outputMDA .= $segmentText;
        $i = $lastUsed;
    }
    elsif($all[$i] =~ m/^SPLIT QUANT$/s){
        if($debug==1){ print "inside QUANT.\n";}
        $existQuant=1;
        # The market-risk section ends only at a stop sign.
        my ($segmentText, $lastUsed) = collectSegments(
            \@all, $i, qr/^SPLIT STOPSIGN$/s);
        $outputQuant .= $segmentText;
        $i = $lastUsed;
    }
}
if($debug==1){ print "after extracting.\n";}

# Write the given chunks to $path, replacing any existing file.
# Dies with the system error message if the file cannot be opened, written
# or closed, so that success is never reported for a file that was not
# actually written.
sub writeOutputFile {
    my ($path, @chunks) = @_;
    open(my $outHandle, '>', $path)
        or die "$path cannot be open: $!\n";
    print {$outHandle} @chunks
        or die "$path cannot be written: $!\n";
    close($outHandle)
        or die "$path cannot be closed: $!\n";
    return;
}

# Output to 3 individual files.
$fileout=$filename."_plaintext";
$mdaFile=$filename."_mda";
$quantFile=$filename."_quant";

# Plain text: always written, whether or not anything was extracted.
writeOutputFile($fileout, $plainTextInOneLine);

# Item 7 (MD&A): written only when an MD&A marker was found.
if($existMDA){
    writeOutputFile($mdaFile,
        "MANAGEMENT'S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS\n\n",
        $outputMDA);
    print "$mdaFile outputted.\n";
}
else{
    print "MDA in $filename not found!\n";
}

# Item 7A (market risk): written only when a market-risk marker was found.
if($existQuant){
    writeOutputFile($quantFile,
        "QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK\n\n",
        $outputQuant);
    print "$quantFile outputted.\n";
}
else{
    print "Quantitative in $filename not found.\n";
}