# -*- coding: utf-8 -*-
"""csv-to-json.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1uMcGJKRFaEVstKb6YKnPw1l-bzRylmfN
"""

import pandas as pd
import datetime
import os
import json
import numpy as np
from urllib.request import urlopen
def process_df(url, label):
    # replace some names used in the file so that they conform to ISO standards
    # (which is what DialogFlow will give as system intents)
    replacement_dict = {
        'US': 'United States',
        'Korea, South': 'South Korea'
    }
    ## read the file corresponding to one of: confirmed, recovered, deaths
    df = pd.read_csv(url)
    ## replace any non-standard ways of referencing countries / states
    for key in replacement_dict:
        df.replace(key, replacement_dict[key], inplace=True)
    ## rename columns for easy access
    df.rename(columns={df.columns[0]: 'State', df.columns[1]: 'Country'}, inplace=True)
    ## drop the latitude and longitude columns
    df.drop(df.columns[2:4], axis=1, inplace=True)
    ## mark rows without a state as country totals, and blank out other NaNs
    df['State'].fillna('Total', inplace=True)
    df.fillna('', inplace=True)
    ## build country-level rows for countries that only have state-wise numbers;
    ## numeric_only=True keeps the date columns and drops the State strings
    df_group = df.groupby('Country').sum(numeric_only=True)
    df_group.reset_index(inplace=True)
    ## append the country-wise aggregate rows to the original frame
    ## (pd.concat replaces the deprecated DataFrame.append)
    df = pd.concat([df, df_group], sort=True)
    df['State'].fillna('Total', inplace=True)
    df.fillna('', inplace=True)
    ## remove duplicate rows, e.g. a country like India which has no state-wise info
    df.drop_duplicates(inplace=True)
    ## remove duplicate total rows in case of a mismatch in total count across states
    df.drop_duplicates(subset=['Country', 'State'], keep='last', inplace=True)
    ## convert from wide form to narrow/long form
    df = pd.melt(df, id_vars=['Country', 'State'], var_name='Date', value_name=label)
    df.drop_duplicates(inplace=True)
    ## drop rows whose value for this label is missing
    df[label].replace('', np.nan, inplace=True)
    df.dropna(subset=[label], inplace=True)
    return df
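
# A minimal sketch (not part of the original pipeline) of the wide-to-long
# reshape that closes process_df; the toy frame and its numbers below are
# assumptions for illustration only.
def _demo_wide_to_long():
    toy = pd.DataFrame({
        'State': ['Total', 'Hubei'],
        'Country': ['India', 'China'],
        '1/22/20': [0, 444],
        '1/23/20': [0, 445],
    })
    ## melt keeps (Country, State) as identifiers and turns every date column
    ## into its own row, mirroring the final step of process_df
    long_form = pd.melt(toy, id_vars=['Country', 'State'],
                        var_name='Date', value_name='confirmed')
    return long_form  ## one row per (Country, State, Date) triple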

def process_and_get_files():
    GLOBAL_CONFIRMED_URL = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
    GLOBAL_DEATHS_URL = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
    GLOBAL_RECOVERED_URL = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"
    try:
        ## process the three files
        df_confirmed = process_df(GLOBAL_CONFIRMED_URL, 'confirmed')
        df_recovered = process_df(GLOBAL_RECOVERED_URL, 'recovered')
        df_deaths = process_df(GLOBAL_DEATHS_URL, 'deaths')
        ## merge the three frames into one on the (Country, State, Date) key
        df = df_confirmed.merge(df_recovered, on=['Country', 'State', 'Date']) \
                         .merge(df_deaths, on=['Country', 'State', 'Date'])
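        ## the merged frame now holds one row per (Country, State, Date) with
        ## three cumulative count columns; a sketch with assumed values:
        ##   Country        State  Date     confirmed  recovered  deaths
        ##   United States  Total  1/22/20  1          0          0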
        ## load India data
        # os.system("curl -O https://api.rootnet.in/covid19-in/stats/daily")
        with urlopen("https://api.rootnet.in/covid19-in/stats/daily") as f:
            data = json.load(f)
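        ## shape of the rootnet payload as consumed below, inferred from the
        ## fields the loop accesses (values assumed):
        ##   {"data": [{"day": "2020-03-14",
        ##              "regional": [{"loc": "Kerala",
        ##                            "confirmedCasesIndian": 19,
        ##                            "confirmedCasesForeign": 2,
        ##                            "discharged": 3,
        ##                            "deaths": 0}, ...]}, ...]}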
        ## collect the state-wise India rows and append them in a single concat
        ## (appending dicts one at a time via DataFrame.append is deprecated)
        india_rows = []
        for item in data['data']:
            day = item['day']
            ## India-level totals are skipped here because the global files
            ## above already provide them
            for reg_item in item['regional']:
                india_rows.append({
                    'Country': 'India',
                    'State': reg_item['loc'],
                    'Date': day,
                    'confirmed': reg_item['confirmedCasesIndian'] + reg_item['confirmedCasesForeign'],
                    'recovered': reg_item['discharged'],
                    'deaths': reg_item['deaths']
                })
        df = pd.concat([df, pd.DataFrame(india_rows)], ignore_index=True)
        ## convert the date strings to datetime format
        df['Date'] = pd.to_datetime(df['Date'])
        df.sort_values(by=['Country', 'State', 'Date'], inplace=True)
        df[['confirmed', 'recovered', 'deaths']] = df[['confirmed', 'recovered', 'deaths']].fillna(0)
        df = df.rename(columns={'confirmed': 'Confirmed', 'recovered': 'Recovered', 'deaths': 'Deaths'})
        ## sanity check: print the first row, if any, that still carries a string count
        for index, row in df.iterrows():
            if isinstance(row['Confirmed'], str) or isinstance(row['Recovered'], str) or isinstance(row['Deaths'], str):
                print(row)
                break
        ## turn cumulative counts into daily counts; diff within each
        ## (Country, State) group so values never bleed across locations
        df[['Confirmed', 'Recovered', 'Deaths']] = df.groupby(['Country', 'State'])[['Confirmed', 'Recovered', 'Deaths']].diff()
        ## drop the NaN first day of each group and any negative data corrections
        df = df[(df['Confirmed'] >= 0) & (df['Recovered'] >= 0) & (df['Deaths'] >= 0)]
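        ## worked illustration (numbers assumed): a cumulative series
        ## 10, 15, 15, 20 for one (Country, State) diffs to NaN, 5, 0, 5;
        ## the NaN first entry and any negative corrections fail the filter
        ## above and are dropped, leaving non-negative daily counts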
        # df = df[df['Date'] != datetime.datetime.strptime('2020-01-22', '%Y-%m-%d')]
        ## build world-wide totals by summing the per-country 'Total' rows
        df_total = df[df['State'] == 'Total']
        df_total = df_total.groupby('Date').sum(numeric_only=True)
        df_total.reset_index(level=0, inplace=True)
        df_total['State'] = 'Total'
        df_total['Country'] = 'World'
        df_total = df_total[['Country', 'State', 'Date', 'Confirmed', 'Recovered', 'Deaths']]
        df = pd.concat([df, df_total], ignore_index=True)
        # df.to_csv('coronabot_stats_data.csv', index=False)
        ## normalize timestamps to midnight so only the calendar date remains
        df['Date'] = df['Date'].dt.normalize()
        max_date = df['Date'].max()
        return (True, "Success", df, max_date)
        # # writes output to a json file which is optimised for query;
        # # queries can be on country, then optionally state, and optionally date
        # with open('coronabot_stats_data.json', 'w') as out:
        #     prev_country = ""
        #     prev_state = ""
        #     out.write('{\n')
        #     for i in df.index:
        #         country = df['Country'][i]
        #         state = df['State'][i]
        #         if not prev_country == country:
        #             if not prev_country == "":
        #                 out.write('\n\t}\n},\n')
        #             out.write('"' + country + '": {\n')
        #             out.write('\t"' + state + '": {\n')
        #         elif not prev_state == state:
        #             out.write('\t},\n')
        #             out.write('\t"' + state + '": {\n')
        #         else:
        #             out.write(',\n')
        #         out.write('\t\t"' +
        #                   str(df['Date'][i]) +
        #                   '": {\n\t\t\t"Deaths": ' + str(df['Deaths'][i]) +
        #                   ',\n\t\t\t"Recovered": ' + str(df['Recovered'][i]) +
        #                   ',\n\t\t\t"Confirmed": ' + str(df['Confirmed'][i]) +
        #                   '\n\t\t}')
        #         prev_country = country
        #         prev_state = state
        #     out.write('\n\t}\n}\n}')
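        # # resulting document shape, derived from the writes above
        # # (values assumed):
        # #   {"India": {"Total": {"2020-03-14 00:00:00": {"Deaths": 0.0,
        # #                                                "Recovered": 1.0,
        # #                                                "Confirmed": 10.0},
        # #                        ...}, ...}, ...}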
    except Exception as ex:
        return (False, str(ex), None, None)


if __name__ == '__main__':
    (success, message, df, max_date) = process_and_get_files()
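    ## minimal usage sketch (an assumption, not in the original script): report
    ## the outcome and persist the frame; the filename follows the commented-out
    ## to_csv call inside process_and_get_files
    if success:
        df.to_csv('coronabot_stats_data.csv', index=False)
        print('processed data through', max_date.date())
    else:
        print('processing failed:', message)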