-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBOE_Utilities.py
138 lines (93 loc) · 5.03 KB
/
BOE_Utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/env python
# coding: utf-8
# In[1]:
# -------------------------------------------------------------------------------
# Import additional Python functionality / various libraries
# -------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from scipy.stats import zscore
import pickle
# -------------------------------------------------------------------------------
# Define function that loads and combines worksheets from the source xlsx file
# -------------------------------------------------------------------------------
def create_combined_dataframe(file_name):
try:
# Obtain list of sheet names from the source Excel file.
print("Sheet names in the Excel file:")
excel_file = pd.ExcelFile(file_name)
sheet_names = excel_file.sheet_names
print(sheet_names)
dfs = [] # List to store dataframes from each worksheet
# Iterate through each sheet in the Excel file
for sheet_name in sheet_names:
print(f"Processing sheet: {sheet_name}")
# Read the worksheet into a pandas dataframe
# header = 4 (based on manual inspection of file)
df = pd.read_excel(file_name, sheet_name=sheet_name, header=4)
# Slice off a single, superfluous row from the resulting dataframe (based on manual inspection of file)
df = df[1:]
# The "time period" column shall be the index for each dataframe (based on manual inspection of file)
df.set_index('Time period and dataset code row', inplace=True)
clean_index_name = "TimePeriod"
df.index.name = clean_index_name
# Rename columns with suffix indicating the source sheet
df.columns = [f'{col} (Sheet_{sheet_name})' for col in df.columns]
# add the newly loaded dataframe to the list of dataframes ("dfs")
dfs.append(df)
print("Dataframe loaded successfully.")
# Concatenate all dataframes into a single dataframe
dfs_combined = pd.concat(dfs, axis=1, join='outer')
print(f"{len(dfs)} Dataframes combined successfully.")
return dfs_combined
except Exception as e:
print(f"Error loading Excel data: {str(e)}")
return None
# -------------------------------------------------------------------------------
# Define function that tidies up the combined dataframe
# -------------------------------------------------------------------------------
def tidy_the_dataframe(df):
try:
# Convert the format of the index to be easily recognized by Python as "datetime" format
df.index = df.index.str.replace(' ', '-') # Replace spaces with dashes
df.index = pd.to_datetime(df.index) # Convert index to datetime format
print("Index converted to datetime format successfully.")
except Exception as e:
print(f"Error occurred while converting index to datetime format: {e}")
try:
# Set datatype for all columns to floating
df = df.apply(pd.to_numeric, errors='coerce').astype(float)
print("Columns converted to floating point numbers successfully.")
except Exception as e:
print(f"Error occurred while converting columns to floating point numbers: {e}")
return df
# -------------------------------------------------------------------------------
# Define function that amends the column names of the combined tidy dataframe
# -------------------------------------------------------------------------------
def rename_columns(df, column_mapping):
try:
# Apply column renaming using a mapping provided by the user
df = df.rename(columns=column_mapping)
print("Columns renamed successfully.")
return df
except Exception as e:
print("An error occurred while renaming columns:", e)
return None
# -------------------------------------------------------------------------------
# Define function that creates duplicate dataframes that show values as % change
# -------------------------------------------------------------------------------
def create_percentage_change_df(df, shift_value):
try:
# Copy the input DataFrame
df_copy = df.copy()
# Calculate percentage changes versus the specified shift value
df_copy = ((df_copy / df_copy.shift(shift_value)) - 1) * 100
# Calculate z-score for the 'GDP_Total_MarketPrices' column
df_copy['Zscore'] = zscore(df_copy['GDP_Total_MarketPrices'], nan_policy='omit')
# Clip z-score values
df_copy['Zscore'] = np.clip(df_copy['Zscore'], -4, 4)
# Return the resulting DataFrame
return df_copy
except Exception as e:
print(f"An error occurred whilst creating percentage change df: {e}")
return None