-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsentences.py
68 lines (52 loc) · 1.68 KB
/
sentences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# This script reads book.txt, splits its text into sentences, filters them by
# word count and English spelling, and writes the result to sentences.txt.
import json
import os
import shutil
import time
import re
import enchant
import random
# Spell-checking dictionary for US English, used by check_en().
en_dict = enchant.Dict("en_US")
# Path of the input text file that is read and split into sentences.
book_path = 'book.txt'
# Placeholder; rebound further down once the book text has been read and split.
sentences = []
def check_en(sentence, dictionary=None):
    """Return True if every word of *sentence* is accepted by the dictionary.

    The sentence is split on whitespace.  Whitelisted punctuation characters
    are stripped from both ends of each token, and a token that consists only
    of such punctuation is accepted as-is.  Every remaining word must pass
    ``dictionary.check(word)``.

    Args:
        sentence: The text to validate.
        dictionary: Any object exposing ``check(word) -> bool``.  Defaults to
            the module-level ``en_dict``.

    Returns:
        True when all words are accepted (including for an empty sentence),
        False as soon as one word is rejected.
    """
    if dictionary is None:
        dictionary = en_dict
    # Punctuation that may surround a word without making it "non-English".
    whitelist_chars = ";:()!?,.'-"
    # Iterate over whitespace-separated tokens; iterating over the string
    # itself would yield single characters instead of words.
    for token in sentence.split():
        word = token.strip(whitelist_chars)
        if not word:
            # Token was pure punctuation: nothing to look up.
            continue
        if not dictionary.check(word):
            return False
    return True
# Inclusive word-count bounds for the sentences that are kept.
MIN_WORDS = 10
MAX_WORDS = 20

# Open the book and get the full text as one string.  The encoding is given
# explicitly so the result does not depend on the platform's locale.
with open(book_path, 'r', encoding='utf-8') as book:
    book_text = book.read()

# Join wrapped lines, then split on full stops to get candidate sentences.
sentences = book_text.replace('\n', ' ').split('.')
print(len(sentences))

# Remove empty and whitespace-only fragments and trim the surrounding
# whitespace left over from the split.
sentences = [s.strip() for s in sentences if s.strip()]
print(len(sentences))

# Split sentences further on ";" (a sentence without ";" yields itself).
new_sentences = []
for sentence in sentences:
    new_sentences.extend(part.strip() for part in sentence.split(';'))
print(len(new_sentences))

# Remove sentences with fewer than MIN_WORDS or more than MAX_WORDS words.
# str.split() without arguments ignores leading/trailing/repeated whitespace,
# so only real words are counted.
new_sentences = [
    s for s in new_sentences
    if MIN_WORDS <= len(s.split()) <= MAX_WORDS
]
print(len(new_sentences))

# Keep only the sentences whose words all pass the English spell check.
english_sentences = [s for s in new_sentences if check_en(s)]
print(len(english_sentences))

# Write all remaining sentences to a text file, one sentence per line.
with open('sentences.txt', 'w', encoding='utf-8') as outfile:
    outfile.writelines(sentence + '\n' for sentence in english_sentences)
print('DONE')