-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathextracter.py
104 lines (97 loc) · 4.22 KB
/
extracter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# -*- coding: utf-8 -*-
"""
Extract information from news using the named-entity recognition tools in nltk.
Some nltk data packages may be needed; please modify and run the snippet below if necessary:
>>> import nltk
>>> nltk.download()
"""
import pymongo
import nltk
import argparse
import os
# Parsed command-line arguments. Stays None until the `__main__` section at the
# bottom of this file assigns the namespace returned by parse_known_args().
FLAGS = None
def __read_from_mongodb__(host, port, query=None, show=False):
    """Get the query result from MongoDB.

    Reads from the ``article`` collection of the ``AI_news_tracker`` database
    and prints how many documents were read.

    Args:
        host: str. The host of MongoDB.
        port: int. The port of MongoDB.
        query: dict or None. Filter for the collection. ``None`` (the default)
            or an empty dict selects every document.
            e.g. {'title': 'Machine Learning: A Gentle Introduction. – Towards Data Science'}
        show: bool. Print the title of every document found. Default False.
    Returns:
        List of post(document) found.
    Raises:
        KeyError: if ``show`` is true and a document has no ``'title'`` field.
    """
    # Build the default per call: a ``{}`` default in the signature would be
    # one dict object shared by every call of this function.
    query = {} if query is None else query
    client = pymongo.MongoClient(host, port)
    try:
        collection = client.get_database('AI_news_tracker').get_collection('article')
        # Materialize the cursor before the client is closed.
        posts = list(collection.find(query))
    finally:
        client.close()
    print(f'Read {len(posts)} posts from database.')
    if show:
        for post in posts:
            print(post['title'])
    return posts
def __summarize__(content, show=False, top_n=5):
    """Summarize the content by keeping its highest-scoring sentences.

    Each sentence is scored as
    ``(named-entity chunks + NN/NNP tokens) / (token count + 1)``;
    the ``+ 1`` keeps the denominator non-zero. Sentences with equal scores
    keep their original order in the text.

    Args:
        content: str. The content you want to summarize.
        show: bool. Print the selected sentences or not. Default False.
        top_n: int. Maximum number of sentences to keep. Default 5.
    Returns:
        List of at most ``top_n`` sentences, best score first.
    """
    noun_tags = ("NN", "NNP")
    scored = []
    for sentence in nltk.sent_tokenize(content):
        # Tokenize and tag once per sentence and reuse the results for both
        # the noun count and the named-entity chunker.
        tokens = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(tokens)  # Part-of-Speech tagging
        no_of_nouns = sum(1 for _, pos in tagged if pos in noun_tags)
        chunks = nltk.ne_chunk(tagged, binary=False)
        # Only the chunker's subtrees expose ``label``; plain (word, tag)
        # tuples do not, so this counts the named-entity chunks.
        no_of_ners = sum(1 for chunk in chunks if hasattr(chunk, 'label'))
        score = (no_of_ners + no_of_nouns) / (len(tokens) + 1)
        scored.append((score, sentence))

    # list.sort is stable, also with reverse=True, so ties stay in text order.
    scored.sort(key=lambda item: item[0], reverse=True)
    summary = [sentence for _, sentence in scored[:max(top_n, 0)]]
    if show:
        for sentence in summary:
            print(sentence)
    return summary
def __write_summary_into_db__(post, host, port):
    """Write the summary of one post into MongoDB.

    Sets the ``summary`` field of the document in the ``article`` collection
    of the ``AI_news_tracker`` database whose ``_id`` equals ``post['_id']``.
    If ``post`` has no ``'_id'`` or no ``'summary'`` key, a warning is printed
    and nothing is written.

    Args:
        post: dict. document; its ``'_id'`` and ``'summary'`` keys are read.
        host: str. The host of MongoDB.
        port: int. The port of MongoDB.
    Returns:
        None
    """
    # Only these lookups can raise KeyError, so do them before connecting.
    try:
        post_id = post['_id']
        summary = post['summary']
    except KeyError:
        print('Warning: No summary was written.')
        return

    client = pymongo.MongoClient(host, port)
    try:
        collection = client.get_database('AI_news_tracker').get_collection('article')
        collection.update_one({'_id': post_id}, {'$set': {'summary': summary}})
    finally:
        # This function is called once per post; close the client each time
        # so connections do not pile up.
        client.close()
    print('Write into database successfully.')
    return
def main(query, host, port, show=False):
    """Summarize every post matching the query and store the summaries.

    Reads the matching posts, computes a summary from each post's
    ``'content'`` field, stores it under ``post['summary']`` and writes it
    back to the database. A post that raises ``KeyError`` while being
    processed is reported with a warning and skipped.

    This function does not return: it ends the process with ``os._exit``.

    Args:
        query: dict. Query used to select the posts.
        host: str. The host of MongoDB.
        port: int. The port of MongoDB.
        show: bool. Show the title and summary or not. Default False.
    """
    import sys  # local import: only needed to flush before the hard exit

    posts = __read_from_mongodb__(host, port, query, show)
    for post in posts:
        try:
            if show:
                print(post['title'])
            post['summary'] = __summarize__(post['content'], show)
            __write_summary_into_db__(post, host, port)
        except KeyError:
            print('Warning: No content found in{}'.format(post['_id']))
    print('Done')
    # os._exit ends the process without flushing stdio buffers, so flush
    # explicitly or the output above can be lost when redirected.
    sys.stdout.flush()
    sys.stderr.flush()
    # Reaching this point means the run completed, so report success (0).
    os._exit(0)
if __name__ == '__main__':
    def _parse_bool(value):
        """Turn a command-line string into a bool.

        ``type=bool`` would call ``bool()`` on the raw string, which is True
        for every non-empty value, including 'False'. Here the usual
        "false" spellings (and the empty string) give False; anything else
        gives True.
        """
        return str(value).strip().lower() not in ('', 'false', 'f', 'no', 'n', '0')

    # Command-line entry point: build the MongoDB query from the optional
    # --title/--author/--date filters and run the summarizer.
    parser = argparse.ArgumentParser()
    parser.add_argument('--title',type=str,default=None,help="Query for title. Default:None e.g. 'Machine Learning: A Gentle Introduction. – Towards Data Science'")
    parser.add_argument('--author',type=str,default=None,help="Query for author. Default:None e.g. 'Nvs Abhishek' ")
    parser.add_argument('--date',type=str,default=None,help="Query for date. Default:None e.g.'Sep 22' ")
    parser.add_argument('--host',type=str,default='127.0.0.1',help="The host of MongoDB")
    parser.add_argument('--port',type=int,default=27017,help="The port of MongoDB")
    # nargs='?' with const=True also accepts a bare `--show`; `--show True`
    # and `--show False` keep working, and the latter now really means False.
    parser.add_argument('--show',type=_parse_bool,nargs='?',const=True,default=False,help="Show the title and summary or not.")
    # parse_known_args tolerates unrecognized arguments instead of exiting.
    FLAGS, unparsed = parser.parse_known_args()

    # Only the filters that were actually given become part of the query; an
    # empty query selects every article.
    query = {}
    if FLAGS.title is not None:
        query['title'] = FLAGS.title
    if FLAGS.author is not None:
        query['author'] = FLAGS.author
    if FLAGS.date is not None:
        query['date'] = FLAGS.date
    main(query,FLAGS.host,FLAGS.port,FLAGS.show)