forked from utkarshdalal/brookings_data_scrapers
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerit_scraper.py
160 lines (133 loc) · 7.24 KB
/
merit_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import urllib.request
from bs4 import BeautifulSoup
import csv
from datetime import datetime, timedelta
import os
import logging
import pymysql
import helper_methods
def get_merit_data(conn):
meritindia_url = 'http://www.meritindia.in'
current_datetime = datetime.utcnow().replace(microsecond=0).isoformat()
page = urllib.request.urlopen(meritindia_url)
html_content = BeautifulSoup(page, 'html.parser')
column_headings = ['TIMESTAMP']
row_values = [current_datetime]
print(f'Running write-to-aws.py at {current_datetime}')
# Get data headers from website
data_types = html_content.find_all('div', 'gen_title_sec')
for data_type in data_types:
column_headings.append(str(data_type.text.strip()))
# Get current data values from website
current_values = html_content.find_all('div', 'gen_value_sec')
for current_value in current_values:
data_value = (current_value.find('span', 'counter'))
row_values.append(data_value.text.strip().replace(',', ''))
print('Writing data to rds')
try:
cursor = conn.cursor()
cursor2 = conn.cursor()
data_dict = {}
demand_met = float(row_values[1])
thermal_generation = float(row_values[2])
gas_generation = float(row_values[3])
nuclear_generation = float(row_values[4])
hydro_generation = float(row_values[5])
renewable_generation = float(row_values[6])
total_generation = thermal_generation + gas_generation + nuclear_generation + hydro_generation + renewable_generation
utc_timestamp = datetime.strptime(current_datetime, "%Y-%m-%dT%H:%M:%S")
rounded_timestamp_15 = utc_timestamp - timedelta(minutes=utc_timestamp.minute % 15,
seconds=utc_timestamp.second,
microseconds=utc_timestamp.microsecond)
rounded_timestamp_5 = utc_timestamp - timedelta(minutes=utc_timestamp.minute % 5,
seconds=utc_timestamp.second,
microseconds=utc_timestamp.microsecond)
data_dict['demand_met'] = demand_met
data_dict['thermal_generation'] = thermal_generation
data_dict['gas_generation'] = gas_generation
data_dict['nuclear_generation'] = nuclear_generation
data_dict['hydro_generation'] = hydro_generation
data_dict['renewable_generation'] = renewable_generation
data_dict['thermal_generation_corrected'] = thermal_generation
data_dict['gas_generation_corrected'] = gas_generation
data_dict['nuclear_generation_corrected'] = nuclear_generation
data_dict['hydro_generation_corrected'] = hydro_generation
data_dict['renewable_generation_corrected'] = renewable_generation
data_dict['timestamp'] = utc_timestamp
data_dict['5_min_rounded_timestamp'] = rounded_timestamp_5
data_dict['15_min_rounded_timestamp'] = rounded_timestamp_15
data_dict = calculate_corrected_data(cursor2, data_dict, demand_met, gas_generation, hydro_generation,
nuclear_generation, renewable_generation, thermal_generation,
total_generation)
helper_methods.insert_into_table('merit_india_data_rounded_corrected', data_dict, cursor, conn)
except Exception as e:
print(f'Could not write data to rds! {str(e)}')
finally:
cursor.close()
conn.close()
print('Finished writing data to rds')
def calculate_corrected_data(cursor2, data_dict, demand_met, gas_generation, hydro_generation, nuclear_generation,
renewable_generation, thermal_generation, total_generation):
if total_generation <= 0.95 * demand_met or total_generation >= 1.05 * demand_met:
cursor2.execute(
f'select timestamp, thermal_generation_corrected, gas_generation_corrected, hydro_generation_corrected, '
f'nuclear_generation_corrected, renewable_generation_corrected, demand_met from '
f'merit_india_data_rounded_corrected where timestamp < "{current_datetime}" '
f'order by timestamp desc limit 1')
for r2 in cursor2:
previous_thermal = r2[1]
previous_gas = r2[2]
previous_hydro = r2[3]
previous_nuclear = r2[4]
previous_renewable = r2[5]
previous_demand_met = r2[6]
current_thermal = thermal_generation
current_gas = gas_generation
current_hydro = hydro_generation
current_nuclear = nuclear_generation
current_renewable = renewable_generation
current_demand_met = demand_met
corrected_thermal = current_thermal
previous_thermal_ratio = previous_thermal / previous_demand_met
current_thermal_ratio = current_thermal / current_demand_met
if abs((current_thermal - previous_thermal) / previous_thermal) >= 0.1:
corrected_thermal *= previous_thermal_ratio / current_thermal_ratio
corrected_gas = current_gas
previous_gas_ratio = previous_gas / previous_demand_met
current_gas_ratio = current_gas / current_demand_met
if abs((current_gas - previous_gas) / previous_gas) >= 0.1:
corrected_gas *= previous_gas_ratio / current_gas_ratio
corrected_hydro = current_hydro
previous_hydro_ratio = previous_hydro / previous_demand_met
current_hydro_ratio = current_hydro / current_demand_met
if abs((current_hydro - previous_hydro) / previous_hydro) >= 0.1:
corrected_hydro *= previous_hydro_ratio / current_hydro_ratio
corrected_nuclear = current_nuclear
previous_nuclear_ratio = previous_nuclear / previous_demand_met
current_nuclear_ratio = current_nuclear / current_demand_met
if abs((current_nuclear - previous_nuclear) / previous_nuclear) >= 0.1:
corrected_nuclear *= previous_nuclear_ratio / current_nuclear_ratio
corrected_renewable = current_renewable
previous_renewable_ratio = previous_renewable / previous_demand_met
current_renewable_ratio = current_renewable / current_demand_met
if abs((current_renewable - previous_renewable) / previous_renewable) >= 0.1:
corrected_renewable *= previous_renewable_ratio / current_renewable_ratio
data_dict['thermal_generation_corrected'] = corrected_thermal
data_dict['gas_generation_corrected'] = corrected_gas
data_dict['nuclear_generation_corrected'] = corrected_nuclear
data_dict['hydro_generation_corrected'] = corrected_hydro
data_dict['renewable_generation_corrected'] = corrected_renewable
return data_dict
def run():
host = os.environ['HOST']
port = int(os.environ['PORT'])
dbname = os.environ['DB']
user = os.environ['USER']
password = os.environ['PASSWORD']
conn = pymysql.connect(host, user=user, port=port,
passwd=password, db=dbname)
get_merit_data(conn)
def lambda_handler(event, context):
run()
if __name__ == "__main__":
run()