This repository has been archived by the owner on Jul 24, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path99_utils-w2v_prep_data
51 lines (46 loc) · 2.02 KB
/
99_utils-w2v_prep_data
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# pre-process data to be fed to word2vec
# * commons
#
set -e
CLEAN_VERSION='1'
clean () {
cat "${1:-/dev/stdin}" | \
# adapted from word2vec/demo-train-big-model-v1.sh
awk '{if ($0 == "<s>") {} else if ($0 == "</s>") { print "" } else { printf "%s ",tolower($0)}}' | \
perl -e '
# Program to filter Wikipedia XML dumps to "clean" text consisting only of lowercase
# letters (a-z, converted from A-Z), and spaces (never consecutive)...
# All other characters are converted to spaces. Only text which normally appears.
# in the web browser is displayed. Tables are removed. Image captions are.
# preserved. Links are converted to normal text. Digits are spelled out.
# *** Modified to not spell digits or throw away non-ASCII characters ***
# Written by Matt Mahoney, June 10, 2006. This program is released to the public domain.
while (<>) {
# Remove any text not normally visible
s/<.*>//; # remove xml tags
s/&/&/g; # decode URL encoded chars
s/</</g;
s/>/>/g;
s/<ref[^<]*<\/ref>//g; # remove references <ref...> ... </ref>
s/<[^>]*>//g; # remove xhtml tags
s/\[http:[^] ]*/[/g; # remove normal url, preserve visible text
s/\|thumb//ig; # remove images links, preserve caption
s/\|left//ig;
s/\|right//ig;
s/\|\d+px//ig;
s/\[\[image:[^\[\]]*\|//ig;
s/\[\[category:([^|\]]*)[^]]*\]\]/[[$1]]/ig; # show categories without markup
s/\[\[[a-z\-]*:[^\]]*\]\]//g; # remove links to other languages
s/\[\[[^\|\]]*\|/[[/g; # remove wiki url, preserve visible text
s/{{[^}]*}}//g; # remove {{icons}} and {tables}
s/{[^}]*}//g;
s/\[//g; # remove [ and ]
s/\]//g;
s/&[^;]*;/ /g; # remove URL encoded chars
s/[0-9]/0/g;
$_=" $_ ";
chop;
print $_;
}
' | awk '{if ((length($0) > 20) && (NF > 5)) print;}' | sed -e 's/^ //'
}