Merge pull request #1204 from resource-watch/foo_024
update script
weiqi-tori authored Jul 9, 2024
2 parents ec96ef8 + 175d8dd commit 8366646
Showing 3 changed files with 22 additions and 24 deletions.
42 changes: 20 additions & 22 deletions soc_016_conflict_protest_events/contents/src/__init__.py
@@ -65,13 +65,14 @@
             ('total', 'numeric')])
 
 # url for armed conflict location & event data
-SOURCE_URL = 'https://api.acleddata.com/acled/read/?key={key}&email={user}&event_date={date_start}|{date_end}&event_date_where=BETWEEN&page={page}'
+# SOURCE_URL = 'https://api.acleddata.com/acled/read/?key={key}&email={user}&event_date={date_start}|{date_end}&event_date_where=BETWEEN&page={page}'
+SOURCE_URL = 'https://api.acleddata.com/acled/read/?key={key}&email={user}&event_date={evt_date}'
 
 # minimum pages to process
-MIN_PAGES = 1
+# MIN_PAGES = 1
 
 # maximum pages to process
-MAX_PAGES = 800
+# MAX_PAGES = 800
 
 # Resource Watch dataset API ID
 # Important! Before testing this script:
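
The update drops ACLED's paginated BETWEEN query in favor of one request per day, so the URL template now takes a single formatted date. A minimal sketch of how the new template gets filled in; the key, email, and date below are placeholder values for illustration, not values from the repo:

    import datetime

    SOURCE_URL = 'https://api.acleddata.com/acled/read/?key={key}&email={user}&event_date={evt_date}'

    # hypothetical credentials, for illustration only
    ACLED_KEY = 'my-api-key'
    ACLED_USER = 'user@example.com'

    day = datetime.date(2024, 7, 1)
    url = SOURCE_URL.format(key=ACLED_KEY, user=ACLED_USER, evt_date=day.strftime("%Y-%m-%d"))
    # -> ...&event_date=2024-07-01
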
@@ -166,12 +167,8 @@ def fetch_data(src_url):
     '''
     src_url = SOURCE_URL
     # the dates between which we want the data
-    date_start = get_date_range()[0].strftime("%Y-%m-%d")
-    date_end = get_date_range()[1].strftime("%Y-%m-%d")
-
-    # specify the page of source url we want to pull
-    # initialize at 0 so that we can start pulling from page 1 in the loop
-    page = 0
+    date_start = get_date_range()[0]
+    date_end = get_date_range()[1]
 
     # length (number of rows) of new_data
     # initialize at 1 so that the while loop works during first step
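
Since the dates are no longer pre-formatted to strings here, get_date_range() presumably returns datetime objects that the fetch loop can step through one day at a time, formatting each date only when the request is built. A minimal sketch of that iteration pattern, with hard-coded dates standing in for get_date_range():

    import datetime

    # stand-ins for get_date_range()[0] and get_date_range()[1]
    date_start = datetime.datetime(2024, 7, 1)
    date_end = datetime.datetime(2024, 7, 3)

    while date_start <= date_end:
        print(date_start.strftime("%Y-%m-%d"))  # 2024-07-01, 2024-07-02, 2024-07-03
        date_start += datetime.timedelta(days=1)
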
@@ -180,18 +177,16 @@
     # create an empty list to store ids of data
     new_ids = []
     # create an empty dataframe to store data
-    data_df = pd.DataFrame()
-    # get and parse each page; stop when no new results or max pages
-    # process up to MIN_PAGES even if there are no new results from them
-    while page <= MIN_PAGES or new_count and page < MAX_PAGES:
+    data_df = pd.DataFrame(columns = ['event_id_cnty','event_type', 'latitude', 'longitude'])
+    # get and parse each day's data; stop when no new results or end date
+    while date_start <= date_end:
         try:
-            # increment page number in every loop
-            page += 1
-            logging.info("Fetching page {}".format(page))
+            logging.info("Fetching data for {}".format(date_start.strftime("%Y-%m-%d")))
             # create an empty list to store data
             new_rows = []
-            # generate the url and pull data for this page
-            r = requests.get(src_url.format(key=ACLED_KEY, user=ACLED_USER, date_start=date_start, date_end=date_end, page=page))
+            # generate the url and pull data for this day
+            # r = requests.get(src_url.format(key=ACLED_KEY, user=ACLED_USER, date_start=date_start, date_end=date_end, page=page))
+            r = requests.get(src_url.format(key=ACLED_KEY, user=ACLED_USER, evt_date=date_start.strftime("%Y-%m-%d")))
             # columns of the pandas dataframe
             """ cols = ["data_id", "event_date", "year", "time_precision", "event_type", "sub_event_type", "actor1", "assoc_actor_1", "inter1",
                 "actor2", "assoc_actor_2", "inter2", "interaction", "country", "iso3", "region", "admin1", "admin2", "admin3", "location",
@@ -217,13 +212,16 @@
                 # add the list of values from this row to the list of new data
                 new_rows.append(row)
 
-            # number of new rows added in this page
+            # number of new rows added on this day
             new_count = len(new_rows)
             # append the new rows to the pandas dataframe
-            data_df = data_df.append(pd.DataFrame(new_rows, columns=cols))
+            data_df = pd.concat([data_df,pd.DataFrame(new_rows, columns=cols)])
+            # data_df = data_df.append(pd.DataFrame(new_rows, columns=cols))
+            date_start += datetime.timedelta(days=1)
 
         except:
-            logging.error('Could not fetch or process page {}'.format(page))
+            logging.error(f'Could not fetch or process data for {date_start}')
+            date_start += datetime.timedelta(days=1)
     # drop duplicate records by event_id_cnty
     data_df = data_df.drop_duplicates(['event_id_cnty']).iloc[:, 1:]
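
Two things change in this hunk: DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0, is swapped for pd.concat, and the except branch now also advances date_start so a failed day cannot stall the loop forever. A minimal sketch of the concat pattern, with a made-up row for illustration:

    import pandas as pd

    cols = ['event_id_cnty', 'event_type', 'latitude', 'longitude']
    data_df = pd.DataFrame(columns=cols)
    new_rows = [['ALB1234', 'Protests', 41.33, 19.82]]  # hypothetical ACLED row

    # pd.concat returns a new frame, just as the removed data_df.append call did
    data_df = pd.concat([data_df, pd.DataFrame(new_rows, columns=cols)])
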

@@ -584,4 +582,4 @@ def main():
     # Update Resource Watch
     updateResourceWatch(num_new)
 
-    logging.info('SUCCESS')
+    logging.info('SUCCESS')
2 changes: 1 addition & 1 deletion upload_eia_data/Dockerfile
@@ -28,6 +28,6 @@ RUN useradd -r $NAME
 RUN chown -R $NAME:$NAME /opt/$NAME
 VOLUME /opt/$NAME/data
 USER $NAME
-ENV PYTHONPATH /src
+ENV PYTHONPATH=/src
 
 CMD ["python", "main.py"]
2 changes: 1 addition & 1 deletion upload_worldbank_data/Dockerfile
@@ -28,6 +28,6 @@ RUN useradd $NAME
 RUN chown -R $NAME:$NAME /opt/$NAME
 VOLUME /opt/$NAME/data
 USER $NAME
-ENV PYTHONPATH /src
+ENV PYTHONPATH=/src
 
 CMD ["python", "main.py"]
