-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathevent_parse.py
320 lines (274 loc) · 13.2 KB
/
event_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
import requests
from bs4 import BeautifulSoup
import json
from hidden import username, password
import pandas as pd
# Module-level lookup table, read once at import time from 'facebook_pages.csv'
# (the first CSV row supplies the column names). Elsewhere in this file the
# 'unis/societies' column is listed as the university names and the remaining
# column headers are listed as the cultures; the cell values are presumably
# Facebook page URLs -- verify against the CSV.
df = pd.read_csv('facebook_pages.csv', header=0)
def selection():
    """Ask how the societies should be listed and return the menu choice.

    Keeps prompting until the user types an integer from 1 to 3.

    Returns:
        int: 1 for university, 2 for culture, 3 to quit the program.
    """
    print('First question, sort by university or by culture?')
    menu = '1 - University\n2 - Culture\n3 - Quit program\n'
    choice = None
    while choice is None:  # repeat until a valid menu entry is typed
        try:
            answer = int(input(menu))
        except ValueError:
            print('Not a number!')
            continue
        if answer in (1, 2, 3):
            choice = answer
        else:
            print('Out of range. Try again')
    return choice
def _read_menu_choice(option_count):
    """Read lines from stdin until one is an integer in 1..option_count."""
    while True:  # loops until valid input is made
        try:
            choice = int(input())
        except ValueError:
            print('Not a number!')
            continue
        if 0 < choice <= option_count:
            return choice
        print('Out of range. Try again')


def selection2(list_type):
    """Show the second-level menu and return the user's numbered pick.

    Args:
        list_type: 1 to choose among the universities (the values of the
            'unis/societies' column of the module-level ``df``), 2 to choose
            among the cultures (every column header of ``df`` except the
            first).

    Returns:
        int: the 1-based position of the chosen entry in the printed list,
        or None when ``list_type`` is neither 1 nor 2 (nothing is printed
        or read in that case).
    """
    if list_type == 1:
        # Sorted by University
        heading = 'Select a University'
        options = df['unis/societies'].to_list()
        prompt = 'Please enter the number associated with the university.'
    elif list_type == 2:
        # Sorted by Cultural society focus: the first column holds the
        # university names, so it is not a culture.
        heading = 'Select a Culture.'
        options = list(df.columns)[1:]
        prompt = 'Please enter the number associated with the culture.'
    else:
        return None

    print(heading)
    for number, option in enumerate(options, start=1):
        print(str(number) + ' - ' + str(option))
    print(prompt)
    return _read_menu_choice(len(options))
def soc_list_by_uni(uni):
    """Return the non-empty cells of one university's row as a list.

    Args:
        uni: 1-based row number of the university in the module-level
            ``df`` (the number printed next to it in the menu).

    Returns:
        list: every non-missing value in that row, excluding the
        'unis/societies' name column.
    """
    # Work on a copy so the shared table keeps its original index.
    by_university = df.copy().set_index('unis/societies')
    chosen_row = by_university.iloc[uni - 1]
    return chosen_row.dropna().to_list()
def soc_list_by_culture(culture):
    """Return the non-empty cells of one culture's column as a list.

    Args:
        culture: positional index of the column in the module-level ``df``.
            Column 0 holds the university names, so the menu number of a
            culture (1-based) is also its column position.

    Returns:
        list: every non-missing value in that column; empty when the table
        has no rows.
    """
    # Reading a single column needs neither a copy of the frame nor the
    # former rename() call, whose result was discarded and which raised
    # IndexError on a table without rows.
    return df.iloc[:, culture].dropna().to_list()
def get_society(list_of_urls):
    """List the societies behind the given pages and return the user's pick.

    Each URL is fetched, its page title (minus the ' - Events | Facebook'
    suffix) is printed next to a 1-based number, and an extra 'All' entry is
    printed after the last society.

    Args:
        list_of_urls: sized collection of page URLs to look up.

    Returns:
        int: a number from 1 to ``len(list_of_urls) + 1``; the last value
        stands for the 'All' entry.
    """
    length = len(list_of_urls)
    print('Found ' + str(length) + ' facebook pages!')
    payload = {  # facebook asks for your email and password to facebook to make requests
        'email': username,
        'password': password
    }
    # HTTPS so the credentials are not posted in clear text.
    POST_LOGIN_URL = "https://www.facebook.com/login?"
    # Log in once and fetch every page through the same session; a plain
    # requests.get() would not carry the cookies obtained by the login.
    with requests.Session() as session:
        session.post(POST_LOGIN_URL, data=payload)
        for counter, society_url in enumerate(list_of_urls, start=1):
            page = session.get(society_url)
            soup = BeautifulSoup(page.content, "html.parser")  # Html code of page in soup
            if soup.title is not None:
                title = soup.title.text.replace(' - Events | Facebook', '')
            else:
                # No <title> in the response: show the URL rather than crash.
                title = str(society_url)
            print(str(counter) + ' - ' + title)  # associated number to society
    # Added an 'All' option to list all events from all societies
    print(str(length + 1) + ' - All\nTo view events, enter a the number associated with a society or "All".')
    while True:  # loops until valid input is made
        try:
            txt = int(input(''))
        except ValueError:
            print('Not a number!')
            continue
        if 0 < txt < length + 2:
            return txt
        print('Out of range. Try again')
def get_return():
    """Ask whether the user wants to go back to the main menu.

    Returns:
        bool: False when the user typed exactly 'return', True for any
        other line (including an empty one).

    Raises:
        EOFError: if stdin is closed before a line is read. The previous
            bare ``except`` swallowed this and retried forever.
    """
    print('Enter \'return\' to go back to main menu if they are no events you have interest in. Otherwise press any key to proceed.')
    return input() != 'return'
def get_list_of_events_to_add(events_df):
    """Collect the row numbers of the events the user wants to add.

    Integers are read one per line until a non-integer line (or end of
    input) is met. The selection is accepted when it holds between 1 and
    ``row_count`` numbers, each within 1..``row_count``; otherwise the user
    is asked to start over.

    Args:
        events_df: table of events; only ``events_df.shape[0]`` (the row
            count) is read here.

    Returns:
        list: ``[chosen_numbers, events_df]`` where ``chosen_numbers`` is the
        list of ints in the order they were typed.

    Raises:
        EOFError: if input ends before a valid selection was entered
            (previously this case retried forever).
    """
    row_count = events_df.shape[0]
    print('\nWhich events do you want to add to calender? (pick number between 1-' + str(row_count) + ')\nYou can enter up to ' + str(row_count) + ' events before entering any letter to stop.\n')
    while True:
        my_list = []
        input_closed = False
        while True:
            try:
                my_list.append(int(input()))
            except ValueError:
                # Any non-integer line marks the end of the selection.
                break
            except EOFError:
                input_closed = True
                break
        # Both the number of picks and every pick must fit the table.
        if 0 < len(my_list) <= row_count and all(0 < i <= row_count for i in my_list):
            return [my_list, events_df]
        if input_closed:
            # Nothing more can be read, so asking again would never end.
            raise EOFError('input ended before a valid event selection was entered')
        print('Something went wrong, try enter again.')
# HTTPS so the credentials are not posted in clear text.
_FB_LOGIN_URL = "https://www.facebook.com/login?"
_FB_EVENT_HOST = 'https://mbasic.facebook.com'
_FB_TITLE_SUFFIX = ' - Events | Facebook'


def _login_to_facebook(session):
    """Post the stored credentials so later requests on ``session`` reuse its cookies."""
    session.post(_FB_LOGIN_URL, data={'email': username, 'password': password})


def _scrape_society_page(session, fb_url):
    """Return ``(society title, absolute event URLs)`` found on one society page."""
    page = session.get(fb_url)
    soup = BeautifulSoup(page.content, "html.parser")
    if soup.title is not None:
        title = soup.title.text.replace(_FB_TITLE_SUFFIX, '')
    else:
        # No <title> in the response: fall back to the URL rather than crash.
        title = str(fb_url)
    hrefs = [a.get('href') for a in soup.find_all('a', href=True)]
    # Links that start with '/events/' point at event pages; they are
    # site-relative, so prefix the host used for the follow-up requests.
    event_links = [
        _FB_EVENT_HOST + href
        for href in hrefs
        if isinstance(href, str) and href.startswith('/events/')
    ]
    return title, event_links


def _scrape_event_page(session, event_url):
    """Return one event's title/time/location dict, or None if any part is missing."""
    page = session.get(event_url)
    soup = BeautifulSoup(page.content, "html.parser")
    # Title is in an <h3>; the <h3> that just says 'Summary' is not a title.
    titles = [str(h3.text) for h3 in soup.find_all('h3') if str(h3.text) != 'Summary']
    times = soup.find_all('dt')
    locations = soup.find_all('dd')
    if not (titles and times and locations):
        return None
    return {
        # Exactly one title per event keeps the table columns the same length.
        'event_title': titles[0],
        'event_time': str(times[0].text),       # time is the first <dt>
        'event_loc': str(locations[-1].text),   # location is the last <dd>
    }


def _collect_event_rows(session, event_links):
    """Scrape every event URL and return the rows whose details could be read."""
    rows = []
    for event_url in event_links:
        details = _scrape_event_page(session, event_url)
        if details is not None:
            rows.append(details)
    return rows


def select_event_from_society(fb_url):
    """Show one society's events and let the user pick some of them.

    Args:
        fb_url: URL of the society's events page.

    Returns:
        0 when no event (or no readable event) is found; False when the user
        chooses to return to the main menu; otherwise the list
        ``[chosen_numbers, events_table, society_title]`` where
        ``events_table`` is a 1-indexed DataFrame with the columns
        'event_title', 'event_time' and 'event_loc'.
    """
    print('Collecting events ..')
    # One logged-in session serves every request of this call.
    with requests.Session() as session:
        _login_to_facebook(session)
        title, event_links = _scrape_society_page(session, fb_url)
        if not event_links:
            print('No events found.')
            return 0
        print(str(len(event_links)) + ' events found.')
        rows = _collect_event_rows(session, event_links)
    if not rows:
        # Links existed but none of the pages exposed title, time and place;
        # an empty table could never yield a valid selection.
        print('No event details could be read.')
        return 0
    events_table = pd.DataFrame(rows, columns=['event_title', 'event_time', 'event_loc'])
    events_table.index += 1  # number the events from 1 for the user
    print('\nEvents held by ' + str(title) + ':')
    print(events_table[['event_title', 'event_time']])
    if not get_return():
        return False
    event_details = get_list_of_events_to_add(events_table)
    event_details.append(title)  # Add the society name to the returned details
    return event_details


def select_event_from_societies(fb_urls):
    """Show the events of several societies and let the user pick some of them.

    Args:
        fb_urls: iterable of society events-page URLs.

    Returns:
        False when no event could be collected or when the user chooses to
        return to the main menu; otherwise the list
        ``[chosen_numbers, all_events]`` where ``all_events`` is a 1-indexed
        DataFrame with the columns 'event_title', 'event_time', 'event_loc'
        and 'society'.
    """
    print('Collecting events ..')
    rows = []
    # One logged-in session serves every request of this call.
    with requests.Session() as session:
        _login_to_facebook(session)
        for fb_url in fb_urls:
            title, event_links = _scrape_society_page(session, fb_url)
            if not event_links:
                print('No events found for ' + title + '\n')
                continue
            print(str(len(event_links)) + ' events found for ' + title + '\n')
            for details in _collect_event_rows(session, event_links):
                details['society'] = title
                rows.append(details)
    if not rows:
        return False
    # Build the table in one go; DataFrame.append no longer exists in pandas 2.
    all_events = pd.DataFrame(
        rows, columns=['event_title', 'event_time', 'event_loc', 'society'])
    all_events.index += 1  # number the events from 1 for the user
    print('Here is a list of all events.\n')
    for society, society_events in all_events.groupby('society'):
        print(society, "\n")
        print(society_events[['event_title', 'event_time']].to_string(header=False), "\n\n")
    if not get_return():
        return False  # return to main menu
    return get_list_of_events_to_add(all_events)  # [my_list, events_df]