-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
117 lines (88 loc) · 3.97 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# -*- coding: utf-8 -*-
import re
from collections import Counter

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from wordcloud import WordCloud, ImageColorGenerator
def scrape(journal, vol_start, vol_end):
    """Scrape a journal's metrics and the article titles of a volume range.

    The journal page on journals.elsevier.com is fetched once to read the
    metric values, then one ScienceDirect page is fetched per volume and every
    article title found is collected. All titles are written to
    ``data/raw_data_<journal>.csv`` (the ``data`` directory must already
    exist).

    Args:
        journal: String inserted verbatim into both site URLs and into the
            output file name.
        vol_start: First volume number to fetch.
        vol_end: Volume number to stop before (exclusive, as in ``range``).

    Returns:
        dict pairing the names in ``met_names`` with the metric texts scraped
        from the journal page.
    """
    result = requests.get("https://www.journals.elsevier.com/" + journal)
    soup = BeautifulSoup(result.content, features="lxml")
    metrics = soup.find_all("span", "tooltip")
    met_names = ['CiteScore', 'ImpactFactor', '5-year Impact Factor',
                 'Norm. Impact per Paper', 'Journal Rank']
    # Only tooltips that contain a <b> child carry a value.
    data = [a.b.text for a in metrics if a.b is not None]
    # The third and fourth scraped values are discarded before pairing with
    # met_names. NOTE(review): presumably metrics that are not of interest;
    # confirm against the live page layout.
    del data[2:4]
    met_dict = dict(zip(met_names, data))

    # The request headers are identical for every volume, so build them once.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
    titles_str = []
    for volume in range(vol_start, vol_end):
        url = "https://www.sciencedirect.com/journal/" + journal + "/vol/" + str(volume) + "/suppl/C"
        r = requests.get(url, headers=headers)
        # Name the parser explicitly (same one as above) so the result does
        # not depend on which parsers happen to be installed and no
        # guessed-parser warning is emitted.
        site = BeautifulSoup(r.content, features="lxml")
        titles = site.find_all("span", class_="js-article-title")
        titles_str.extend(title.text for title in titles)

    pd.DataFrame(titles_str).to_csv('data/raw_data_' + journal + '.csv')
    return met_dict
def clean_data(journal):
    """Load the saved titles of *journal* and reduce each one to a word set.

    Reads ``data/raw_data_<journal>.csv`` and takes its second column as the
    titles (the first column is the index written alongside them). Each title
    is split on runs of non-word characters; NLTK's English stop words and the
    empty string are then removed.

    Args:
        journal: Journal name used to build the CSV file name.

    Returns:
        list with one set of remaining words per title, in file order.
    """
    frame = pd.read_csv('data/raw_data_'+journal+'.csv')
    stop_words = stopwords.words('english')

    word_sets = []
    for title in frame.values[:, 1]:
        tokens = set(re.split(r'\W+', title))
        # Stop words are matched exactly as spelled in the title.
        # NOTE(review): capitalised stop words presumably survive this step;
        # confirm whether that is intended.
        word_sets.append(tokens.difference(stop_words) - {''})
    return word_sets
def wordcloud(clean_data, max_words, max_font_size, min_font_size, background_color):
    """Build a word cloud from per-title word collections.

    All words of all titles are joined into one space-separated text, which is
    handed to ``WordCloud.generate``.

    Args:
        clean_data: Iterable of word collections, one per title.
        max_words: Passed through to ``WordCloud``.
        max_font_size: Passed through to ``WordCloud``.
        min_font_size: Passed through to ``WordCloud``.
        background_color: Passed through to ``WordCloud``.

    Returns:
        Whatever ``WordCloud(...).generate(text)`` returns.
    """
    per_title = [' '.join(words) for words in clean_data]
    text = ' '.join(per_title)
    cloud = WordCloud(
        max_words=max_words,
        max_font_size=max_font_size,
        min_font_size=min_font_size,
        background_color=background_color,
    )
    return cloud.generate(text)
def word_freq(clean_data):
    """Count how often each word occurs across all titles.

    Args:
        clean_data: Iterable of word collections, one per title (for example
            a list of sets).

    Returns:
        dict mapping each word to its total number of occurrences, ordered
        from most to least frequent. Words with equal counts keep the order in
        which they were first encountered. Empty input yields an empty dict.
    """
    # A single counting pass replaces one full list scan per distinct word.
    counts = Counter(word for words in clean_data for word in words)
    return dict(counts.most_common())
def word_trends(clean_data, num_journals, num_words, factor):
    """Track the counts of each section's top words across all sections.

    ``clean_data`` is cut into ``int(num_journals / factor)`` consecutive
    sections of ``int(len(clean_data) / num_journals) * factor`` titles each.
    For every section the first ``num_words`` entries of ``word_freq`` are
    kept; the union of those words over all sections forms the result keys.

    Args:
        clean_data: Sequence of word collections, one per title.
        num_journals: Divisor used to derive the section size and count.
        num_words: Number of leading ``word_freq`` entries kept per section.
        factor: Multiplier for the section size, divisor for the section count.

    Returns:
        dict mapping each kept word to a list with one count per section. The
        count is 0 for a section where the word is not among that section's
        kept words.
    """
    section_size = int(len(clean_data) / num_journals) * factor
    section_total = int(num_journals / factor)

    sections = {}
    top_words = set()
    for idx in range(section_total):
        start = idx * section_size
        freq = word_freq(clean_data[start:start + section_size])
        leaders = list(freq)[:num_words]
        sections["Section: " + str(idx)] = {word: freq[word] for word in leaders}
        top_words = set(top_words | set(leaders))

    ordered_words = list(top_words)
    section_graph = {word: [] for word in ordered_words}
    for counts in sections.values():
        for word in ordered_words:
            section_graph[word].append(counts.get(word, 0))
    return section_graph
def word_connections(clean_data, words, connections):
    """Find the words that most often share a title with each given word.

    Args:
        clean_data: Sequence of word collections, one per title.
        words: Words to look up; each must occur somewhere in ``clean_data``.
        connections: Ranking depth. The queried word's own entry is dropped
            from the ranking, so each result holds at most
            ``connections - 1`` words (none when ``connections <= 1``).

    Returns:
        dict mapping each queried word to a dict of
        ``{co-occurring word: number of titles shared}``, ordered from the
        highest to the lowest count. Words that never share a title can
        appear with a count of 0 when the ranking depth reaches them. Keys
        and counts are NumPy scalars taken from the underlying arrays.

    Raises:
        ValueError: If a word in ``words`` does not occur in ``clean_data``.
    """
    vocab = list({word for title in clean_data for word in title})
    column_of = {word: col for col, word in enumerate(vocab)}

    for word in words:
        if word not in column_of:
            raise ValueError("%r does not occur in clean_data" % (word,))

    # Binary title-by-word matrix: 1 where the title contains the word.
    membership = np.zeros((len(clean_data), len(vocab)))
    for row, title in enumerate(clean_data):
        for word in title:
            membership[row, column_of[word]] = 1

    # Row j sums the membership rows of every title that contains words[j],
    # i.e. how many titles each vocabulary word shares with words[j].
    word_cols = np.array([column_of[word] for word in words], dtype=int)
    shared = membership[:, word_cols].T.dot(membership)

    vocab_arr = np.array(vocab)
    keep = max(connections - 1, 0)
    conn_dict = {}
    for row, word in enumerate(words):
        ranked = shared[row].argsort()[::-1]
        # Drop the queried word by column rather than by assuming it sorts
        # first: another word that appears in all the same titles ties with it.
        ranked = ranked[ranked != word_cols[row]][:keep]
        conn_dict[word] = dict(zip(vocab_arr[ranked], shared[row, ranked]))
    return conn_dict