-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathegauge_data.py
136 lines (124 loc) · 8.38 KB
/
egauge_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/python
import urllib2
import csv
import pandas as pd
import pytz
import time
import datetime
import os.path
import httplib
# Unix timestamp (seconds) where scraping starts for a home with no data file yet; 1498867200 is 2017-07-01 00:00:00 UTC.
# Each request covers the 2592000 s (30 days) before the current start, so scraping walks backwards in time from here.
user_input = 1498867200
'''
HOW TO RUN THIS SCRIPT
Set user_input to the Unix timestamp you want to start from, then run the script.
If a home's data file is already finished, the script should skip it. If the script is interrupted while a file is
being built, just restart it: it picks up from the last row already written and appends the new data to that file.
Known issue: if the script stops before it is done, restarting it works, but full_list.csv will contain inconsistent
entries. A fix for that is in progress.
TODO: upload the new version of the files to the server.
'''
# ----------------------------------------------------------------------------------------------------------------------
# eGauge scraper.
# For every home listed in 'eGauges-nearest-airport.csv' this walks backwards in time from user_input in 30-day
# request windows, appends each window to '<home>-data.csv', and finally logs one row for the home in 'full_list.csv'
# (its earliest scraped timestamp, or the name of the error that stopped the scrape).
# ----------------------------------------------------------------------------------------------------------------------
WINDOW_SECONDS = 2592000  # span of one request: 30 days * 86400 s
epoch0 = pytz.utc.localize(datetime.datetime(1970, 1, 1))  # timezone-aware Unix epoch used to build timestamps


def _fetch_rows(url):
    """Download url and parse the body as CSV.

    Returns a (rows, failure) pair. On success rows is a list of CSV rows and failure is None. On failure rows is
    None and failure is one of 'HTTPError', 'URLError', 'Exception' or 'IncompleteRead'.
    """
    try:
        response = urllib2.urlopen(url)
    except urllib2.HTTPError:
        return None, 'HTTPError'
    except urllib2.URLError:
        return None, 'URLError'
    except Exception:  # deliberate best effort: one bad home must not abort the whole run
        return None, 'Exception'
    try:
        # The body is only read while iterating, so a truncated download surfaces here.
        return list(csv.reader(response)), None
    except httplib.IncompleteRead:
        return None, 'IncompleteRead'
    finally:
        response.close()  # always release the connection


def _parse_row_time(text):
    """Parse the first CSV column, which is given either with or without seconds."""
    if len(text) <= 16:
        return datetime.datetime.strptime(text, '%Y-%m-%d %H:%M')
    return datetime.datetime.strptime(text, '%Y-%m-%d %H:%M:%S')


def _rows_to_frame(rows, home_id, tz_name):
    """Turn data rows (header already removed) into a frame with the columns stored in '<home>-data.csv'.

    Rows with fewer than three columns (for example blank lines) are skipped. The result is empty when no usable
    row is left.
    """
    usable = [row for row in rows if len(row) >= 3]
    # Only build the timezone when there is something to convert, as the original code did.
    local_timezone = pytz.timezone(tz_name) if usable else None
    timestamps_for_home = []
    local_time_for_home = []
    gen_kwh_for_home = []
    for row in usable:
        # NOTE(review): the request asks for Z=US/Eastern but the value is interpreted as UTC here - confirm which
        # zone the server really reports before trusting 'Timestamp' / 'Local Time'.
        utc_time = pytz.utc.localize(_parse_row_time(row[0]))
        timestamps_for_home.append(int((utc_time - epoch0).total_seconds()))
        local_time_for_home.append(utc_time.astimezone(local_timezone))
        gen_kwh_for_home.append(row[2])  # third CSV column, stored under 'Solar [kw]'
    columns = [
        pd.DataFrame(timestamps_for_home, columns=['Timestamp']),
        pd.DataFrame([home_id] * len(usable), columns=['egauge']),
        pd.DataFrame(local_time_for_home, columns=['Local Time']),
        pd.DataFrame(gen_kwh_for_home, columns=['Solar [kw]']),
    ]
    return pd.concat(columns, join='outer', axis=1)


df = pd.read_csv('eGauges-nearest-airport.csv', sep=',')  # list of homes and their nearest-airport timezone
homes_set = df['eGauge'].tolist()  # eGauge ids to loop over
timezones = df['Airport timezone'].tolist()  # timezone name per home, same order as homes_set

for home_, raw_home_id in enumerate(homes_set):
    home_id_ = str(raw_home_id)
    data_file = home_id_ + '-data.csv'
    start_ = user_input  # used until the home has rows on disk
    earliest = None  # last (oldest) timestamp known to be stored for this home
    failure = None  # name of the error that stopped the scrape, if any

    while True:
        # Resume from the last row on disk, so an interrupted run continues where it stopped.
        previously_scraped = None
        if os.path.isfile(data_file):
            previously_scraped = pd.read_csv(data_file, sep=',')
            previous_timestamps = previously_scraped['Timestamp'].tolist()
            if previous_timestamps:  # a header-only file must not crash the resume logic
                start_ = previous_timestamps[-1]
                earliest = start_
        end_ = start_ - WINDOW_SECONDS

        passto = ('http://' + home_id_ + '.egaug.es/cgi-bin/egauge-show?&c&C&h&f=' + str(start_)
                  + '&t=' + str(end_) + '&Z=US/Eastern')
        reter, failure = _fetch_rows(passto)
        if failure is not None:
            # Give up on this home. Retrying here would repeat the identical request forever.
            break

        # NOTE(review): this relies on the server not returning the row at start_ again once the history is
        # exhausted; otherwise the loop would never see an empty window - confirm against the eGauge API.
        home_data = _rows_to_frame(reter[1:], home_id_, timezones[home_])  # reter[0] is the CSV header
        if home_data.empty:
            break  # nothing older than start_: this home is finished

        if previously_scraped is not None:
            home_data = pd.concat([previously_scraped, home_data], ignore_index=True)
        home_data.to_csv(data_file, sep=',', encoding='utf-8', index=False)
        # time.sleep(6)  # re-enable to throttle requests; the author was blacklisted by the server before

    # One log row per home and run: the failure name, else the earliest stored timestamp, else 'NoData'.
    if failure is not None:
        outcome = failure
    elif earliest is not None:
        outcome = earliest
    else:
        outcome = 'NoData'
    full_list = pd.DataFrame([(home_id_, outcome)], columns=['Homes', 'Earliest Record'])
    if os.path.isfile('full_list.csv'):
        previous_full_list = pd.read_csv('full_list.csv', sep=',')
        full_list = pd.concat([previous_full_list, full_list], ignore_index=True)
    full_list.to_csv('full_list.csv', sep=',', encoding='utf-8', index=False)
    # time.sleep(60)  # optional pause between homes to avoid being blacklisted
# --------------------------------------------------Outside of loop-------------------------------------------------
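# Disabled legacy checklist export, kept for reference only: it needs the egauge_id and earliest_record lists,
# which the active code does not build.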
'''
all_egauges = pd.DataFrame(egauge_id, columns=['eGauges'])
earliest = pd.DataFrame(earliest_record, columns=['Earliest Records'])
full_list = pd.concat([all_egauges, earliest], join='outer', axis=1)
full_list.to_csv('full_list.csv', sep=',', encoding='utf-8', index=False)
'''