300 change how we count functional/non-functional for the playbook (#309)
* fixed total json of pop dislocation post processing (#301) (#303)

* fixed total json of pop dislocation post processing (#301)

* changelog title change

---------

Co-authored-by: Chen Wang <[email protected]>

* Galveston Capital Shock and CGE (#280)

* First commit

* Create test_galvestoncge.py

* Update galvestoncge.py

* Update galvestoncge.py

* Draft update with new Nonethnic_CGE_Model

* Fixed whatever bug

* Remove files that shouldn't be committed

* Code to add missing sectors

* Update CHANGELOG.md

* Updated test and corrected city in code

* Catch infeasible and other wrong solver status

* Update modules.rst

* Update galvestoncge.py

---------

Co-authored-by: Chen Wang <[email protected]>

* rewrite cge post processing util (#297)

* rewrite

* changelog

* write better test

* adjust

* fix galveston

* rewrite test scripts for all 3 testbed cge

* space

* import correct galveston cge

---------

Co-authored-by: YONG WOOK KIM <[email protected]>

* add some temp test will remove later

* finally it's working....

* clean up the logic

* write proper test for joplin

* add galveston mcs

* write tests for joplin and galveston

* changelog

* fix pytest

* use the correct unique cluster or unique category

---------

Co-authored-by: Jong Lee <[email protected]>
Co-authored-by: Vismayak Mohanarajan <[email protected]>
Co-authored-by: YONG WOOK KIM <[email protected]>
4 people authored Apr 7, 2023
1 parent 073ea4f commit 7502488
Showing 3 changed files with 144 additions and 73 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -13,7 +13,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
### Fixed
- CGE output post process util function [#298](https://github.com/IN-CORE/pyincore/issues/298)
- Population Dislocation utility function arbitrarily assumes there will be dislocated and non-dislocated [#301](https://github.com/IN-CORE/pyincore/issues/301)

- Functional vs non-functional calculation now based on failure samples [#300](https://github.com/IN-CORE/pyincore/issues/300)

## [1.9.0] - 2023-03-15

140 changes: 74 additions & 66 deletions pyincore/utils/dataprocessutil.py
@@ -7,16 +7,18 @@
import geopandas as gpd
import json
import pandas as pd
import numpy as np

from pyincore import Dataset, DataService
from pyincore import Dataset, DataService, IncoreClient
from functools import reduce


class DataProcessUtil:

@staticmethod
def get_mapped_result_from_analysis(client, inventory_id: str, dmg_result_dataset,
bldg_func_dataset, archetype_mapping_id: str,
groupby_col_name: str = "max_state", arch_col='archetype'
groupby_col_name: str = "max_state", arch_col="archetype"
):
"""Use this if you want to load results directly from the output files of the analysis, than storing the results
to data service and loading from there using ids.
@@ -43,7 +45,7 @@ def get_mapped_result_from_analysis(client, inventory_id: str, dmg_result_datase
dmg_result = dmg_result_dataset.get_dataframe_from_csv()

bldg_func_df = bldg_func_dataset.get_dataframe_from_csv()
bldg_func_df.rename(columns={'building_guid': 'guid'}, inplace=True)
bldg_func_df.rename(columns={"building_guid": "guid", "samples": "failure"}, inplace=True)

arch_mapping = Dataset.from_data_service(archetype_mapping_id, DataService(client)).get_dataframe_from_csv()
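A minimal calling sketch for this variant, assuming an authenticated IncoreClient and Dataset objects already produced by the damage and Monte Carlo analyses (the ids and dataset variable names below are placeholders, not real inputs):

from pyincore import IncoreClient
from pyincore.utils.dataprocessutil import DataProcessUtil

client = IncoreClient()
# dmg_result_dataset and bldg_func_dataset stand in for the output Dataset
# objects of prior analyses; both string ids below are hypothetical.
ret = DataProcessUtil.get_mapped_result_from_analysis(
    client, "inventory-dataset-id", dmg_result_dataset,
    bldg_func_dataset, "archetype-mapping-id")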

@@ -57,7 +59,7 @@
@staticmethod
def get_mapped_result_from_dataset_id(client, inventory_id: str, dmg_result_id: str, bldg_func_id,
archetype_mapping_id: str,
groupby_col_name: str = "max_state", arch_col='archetype'):
groupby_col_name: str = "max_state", arch_col="archetype"):
"""Use this if your damage results are already stored in the data service and you have their dataset ids.
All the inputs (except groupby_col_name) are dataset ids.
@@ -84,7 +86,7 @@ def get_mapped_result_from_dataset_id(client, inventory_id: str, dmg_result_id:

bldg_func_dataset = Dataset.from_data_service(bldg_func_id, DataService(client))
bldg_func_df = bldg_func_dataset.get_dataframe_from_csv()
bldg_func_df.rename(columns={'building_guid': 'guid'}, inplace=True)
bldg_func_df.rename(columns={"building_guid": "guid", "samples": "failure"}, inplace=True)

archtype_mapping_dataset = Dataset.from_data_service(archetype_mapping_id, DataService(client))
arch_mapping = archtype_mapping_dataset.get_dataframe_from_csv()
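The dataset-id variant takes the same inputs as ids of datasets already stored in the data service; a sketch with placeholder ids:

from pyincore import IncoreClient
from pyincore.utils.dataprocessutil import DataProcessUtil

client = IncoreClient()
# All four ids are placeholders for datasets stored in the data service.
ret = DataProcessUtil.get_mapped_result_from_dataset_id(
    client, "inventory-dataset-id", "dmg-result-dataset-id",
    "bldg-func-dataset-id", "archetype-mapping-dataset-id")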
@@ -100,7 +102,7 @@ def get_mapped_result_from_dataset_id(client, inventory_id: str, dmg_result_id:
def get_mapped_result_from_path(inventory_path: str, dmg_result_path: str,
func_result_path: str,
archetype_mapping_path: str,
groupby_col_name: str, arch_col='archetype'):
groupby_col_name: str, arch_col="archetype"):
"""
Args:
@@ -121,7 +123,7 @@ def get_mapped_result_from_path(inventory_path: str, dmg_result_path: str,
inventory = pd.DataFrame(gpd.read_file("zip://" + inventory_path))
dmg_result = pd.read_csv(dmg_result_path)
bldg_func_df = pd.read_csv(func_result_path)
bldg_func_df.rename(columns={'building_guid': 'guid'}, inplace=True)
bldg_func_df.rename(columns={"building_guid": "guid", "samples": "failure"}, inplace=True)
arch_mapping = pd.read_csv(archetype_mapping_path)

max_state_df = DataProcessUtil.get_max_damage_state(dmg_result)
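The path variant works entirely from local files; a sketch assuming a zipped shapefile inventory (it is read via "zip://" + path) and CSV results on disk, with hypothetical file names:

from pyincore.utils.dataprocessutil import DataProcessUtil

ret = DataProcessUtil.get_mapped_result_from_path(
    "buildings.zip", "building_dmg.csv", "sample_failure_state.csv",
    "archetype_mapping.csv", groupby_col_name="max_state")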
@@ -134,7 +136,7 @@ def get_mapped_result_from_path(inventory_path: str, dmg_result_path: str,

@staticmethod
def create_mapped_dmg_result(inventory, dmg_result, arch_mapping, groupby_col_name="max_state",
arch_col='archetype'):
arch_col="archetype"):
"""
Args:
@@ -147,26 +149,26 @@ def create_mapped_dmg_result(inventory, dmg_result, arch_mapping, groupby_col_na
"""
dmg_states = dmg_result[groupby_col_name].unique().tolist() # get unique damage states
dmg_merged = pd.merge(inventory, dmg_result, on='guid')
dmg_merged = pd.merge(inventory, dmg_result, on="guid")
mapped_df = pd.merge(dmg_merged, arch_mapping, on=arch_col)
unique_categories = arch_mapping.groupby(by=['cluster', 'category'], sort=False).count().reset_index()
unique_categories = arch_mapping.groupby(by=["cluster", "category"], sort=False).count().reset_index()

group_by = mapped_df.groupby(by=[groupby_col_name, 'cluster', 'category']).count().reset_index()
group_by = group_by.loc[:, ['guid', groupby_col_name, 'cluster', 'category']]
group_by.rename(columns={'guid': 'count'}, inplace=True)
group_by = mapped_df.groupby(by=[groupby_col_name, "cluster", "category"]).count().reset_index()
group_by = group_by.loc[:, ["guid", groupby_col_name, "cluster", "category"]]
group_by.rename(columns={"guid": "count"}, inplace=True)

pivot = group_by.pivot_table(values='count', index=['cluster', 'category'], columns=groupby_col_name,
pivot = group_by.pivot_table(values="count", index=["cluster", "category"], columns=groupby_col_name,
fill_value=0)

table = pd.DataFrame()
table[['category', 'cluster']] = unique_categories[['category', 'cluster']]
result_by_cluster = pd.merge(table, pivot, how='left', on=['cluster', 'category'])
table[["category", "cluster"]] = unique_categories[["category", "cluster"]]
result_by_cluster = pd.merge(table, pivot, how="left", on=["cluster", "category"])

# Add missing max damage states. Handles the case when no inventory falls under some damage states.
result_by_cluster = result_by_cluster.reindex(result_by_cluster.columns.union(
dmg_states, sort=False), axis=1, fill_value=0)

result_by_category = result_by_cluster.groupby(by=['category'], sort=False).sum(min_count=1).reset_index()
result_by_category = result_by_cluster.groupby(by=["category"], sort=False).sum(min_count=1).reset_index()

result_by_cluster[dmg_states] = result_by_cluster[dmg_states].fillna(-1).astype(int)
result_by_category[dmg_states] = result_by_category[dmg_states].fillna(-1).astype(int)
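For intuition, the cluster/category counting above boils down to a small pandas pattern; a self-contained sketch on made-up rows (all values invented):

import pandas as pd

# One row per building with its max damage state.
mapped_df = pd.DataFrame({
    "guid": ["b-1", "b-2", "b-3"],
    "cluster": ["Residential", "Residential", "Commercial"],
    "category": ["Building", "Building", "Building"],
    "max_state": ["DS_0", "DS_3", "DS_0"],
})
group_by = mapped_df.groupby(["max_state", "cluster", "category"]).count().reset_index()
group_by = group_by.rename(columns={"guid": "count"})
pivot = group_by.pivot_table(values="count", index=["cluster", "category"],
                             columns="max_state", fill_value=0)
print(pivot)  # one row per (cluster, category), one count column per damage state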
@@ -179,62 +181,68 @@ def create_mapped_dmg_result(inventory, dmg_result, arch_mapping, groupby_col_na
return {"by_cluster": json_by_cluster, "by_category": json_by_category}

@staticmethod
def create_mapped_func_result(inventory, bldg_func, arch_mapping, arch_col='archetype'):
def create_mapped_func_result(inventory, bldg_func, arch_mapping, arch_col="archetype"):
"""
Args:
inventory: dataframe representing the inventory
bldg_func: building func dataset
bldg_func: building func state dataset
arch_mapping: archetype mapping dataframe
arch_col: archetype column to use for the clustering
Returns:
ret_json: JSON of the results ordered by cluster and category.
"""
def _sum_average(series):
return reduce(lambda x, y: np.mean(x + y).round(0), series)

func_state = ["percent_functional", "percent_non_functional", "num_functional", "num_non_functional"]
func_merged = pd.merge(inventory, bldg_func, on='guid')

# unify mcs and bldg func naming
bldg_func.rename(columns={"building_guid": "guid", "samples": "failure"}, inplace=True)

func_merged = pd.merge(inventory, bldg_func, on="guid")
mapped_df = pd.merge(func_merged, arch_mapping, on=arch_col)
unique_categories = arch_mapping.groupby(by=['category'], sort=False, as_index=False).count()['category']
unique_cluster = arch_mapping.groupby(by=['cluster', 'category'], sort=False, as_index=False).count()[[
'cluster', 'category']]

# group by cluster
result_by_cluster = mapped_df.groupby(by=['cluster', 'category'], sort=False, as_index=False).agg(
{'guid': 'count',
'probability': 'mean'})
result_by_cluster.rename(columns={'guid': 'tot_count', 'probability': 'percent_functional'}, inplace=True)
result_by_cluster["percent_non_functional"] = 1 - result_by_cluster["percent_functional"]
result_by_cluster["num_functional"] = (result_by_cluster["tot_count"] * result_by_cluster[
"percent_functional"]).round(0)
result_by_cluster["num_non_functional"] = (result_by_cluster["tot_count"] * result_by_cluster[
"percent_non_functional"]).round(0)
result_by_cluster = result_by_cluster.drop('tot_count', 1)
result_by_cluster = pd.merge(unique_cluster, result_by_cluster, how='left', on=['cluster', 'category'])
# Add missing max damage states. Handles the case when no inventory falls under some damage states.
result_by_cluster = result_by_cluster.reindex(result_by_cluster.columns.union(
func_state, sort=False), axis=1, fill_value=0)
# replace NaN
result_by_cluster[func_state] = result_by_cluster[func_state].fillna(-1)
result_by_cluster[["num_functional", "num_non_functional"]] = result_by_cluster[["num_functional",
"num_non_functional"]].astype(
int)

# group by category
result_by_category = mapped_df.groupby(by=['category'], sort=False, as_index=False).agg({'guid': 'count',
'probability': 'mean'})
result_by_category.rename(columns={'guid': 'tot_count', 'probability': 'percent_functional'}, inplace=True)
result_by_category["percent_non_functional"] = 1 - result_by_category["percent_functional"]
result_by_category["num_functional"] = (
result_by_category["tot_count"] * result_by_category["percent_functional"]).round(0)
result_by_category["num_non_functional"] = (
result_by_category["tot_count"] * result_by_category["percent_non_functional"]).round(0)
result_by_category = result_by_category.drop('tot_count', 1)
result_by_category = pd.merge(unique_categories, result_by_category, how='left', on=['category'])
# replace NaN
result_by_category[func_state] = result_by_category[func_state].fillna(-1)
result_by_category[["num_functional", "num_non_functional"]] = result_by_category[
["num_functional", "num_non_functional"]].astype(int)
unique_categories = arch_mapping.groupby(by=["category"], sort=False, as_index=False).count()["category"]
unique_cluster = arch_mapping.groupby(by=["cluster", "category"], sort=False, as_index=False).count()[[
"cluster", "category"]]

mapped_df = mapped_df[["guid", "failure", "category", "cluster"]]
mapped_df["failure_array"] = mapped_df["failure"].apply(lambda x: np.array([int(x) for x in x.split(",")]))

def _group_by(by_column, unique):
# group by the requested columns (cluster and category, or category only)
result = mapped_df.groupby(by=by_column, sort=False, as_index=False).agg(
{"guid": "count", "failure_array": [_sum_average]})

# clean up
result.rename(columns={"guid": "tot_count", "failure_array": "num_functional"}, inplace=True)

# MCS failure samples: 0 = failed, 1 = not failed.
# Functionality samples: 0 = non-functional, 1 = functional.
result["num_non_functional"] = result["tot_count"].squeeze() - result["num_functional"].squeeze()
result["percent_functional"] = result["num_functional"].squeeze() / result["tot_count"].squeeze()
result["percent_non_functional"] = 1 - result["percent_functional"]

# flatten the tuple column names produced by the multi-function agg
result.columns = [x[0] if len(x) > 1 else x for x in result.columns]

# more clean up
result = pd.merge(unique, result, how="left", on=by_column)

# Add missing functional-state columns. Handles the case when no inventory falls into some states.
result = result.reindex(result.columns.union(func_state, sort=False), axis=1, fill_value=0)

# replace NaN
result[func_state] = result[func_state].fillna(-1)
result["tot_count"] = result["tot_count"].fillna(-1)
result[["num_functional", "num_non_functional"]] = result[["num_functional", "num_non_functional"]].astype(int)

return result

result_by_cluster = _group_by(by_column=["cluster", "category"], unique=unique_cluster)
result_by_category = _group_by(by_column=["category"], unique=unique_categories)

cluster_records = result_by_cluster.to_json(orient="records")
category_records = result_by_category.to_json(orient="records")
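The playbook change (#300) is easiest to see on made-up failure strings; a minimal sketch of what _sum_average computes for one group:

import numpy as np
from functools import reduce

# Two buildings, four Monte Carlo samples each: 1 = functional, 0 = not.
failure_strings = ["1,1,0,1", "0,1,1,1"]
arrays = [np.array([int(v) for v in s.split(",")]) for s in failure_strings]

# Element-wise sum across buildings, then mean over samples, rounded:
# the expected count of functional buildings in the group.
num_functional = reduce(lambda x, y: np.mean(x + y).round(0), arrays)
print(num_functional)  # 2.0 (mean functional count is 1.5, rounded half-to-even)
num_non_functional = len(arrays) - num_functional  # 0.0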
@@ -254,20 +262,20 @@ def get_max_damage_state(dmg_result):
pd.DataFrame: Pandas dataframe that has column GUID and column max_state.
"""
if all(column in dmg_result.columns for column in ['DS_0', 'DS_1', 'DS_2', 'DS_3']):
dmg_states = ['DS_0', 'DS_1', 'DS_2', 'DS_3']
elif all(column in dmg_result.columns for column in ['insignific', 'moderate', 'heavy', 'complete']):
dmg_states = ['insignific', 'moderate', 'heavy', 'complete']
if all(column in dmg_result.columns for column in ["DS_0", "DS_1", "DS_2", "DS_3"]):
dmg_states = ["DS_0", "DS_1", "DS_2", "DS_3"]
elif all(column in dmg_result.columns for column in ["insignific", "moderate", "heavy", "complete"]):
dmg_states = ["insignific", "moderate", "heavy", "complete"]
elif all(column in dmg_result.columns for column in ["ds-none", "ds-slight", "ds-moderat", "ds-extensi",
"ds-complet"]):
dmg_states = ["ds-none", "ds-slight", "ds-moderat", "ds-extensi", "ds-complet"]
else:
raise ValueError("Invalid damage state names. Cannot create mapped max damage state.")

guids = dmg_result[['guid']]
guids = dmg_result[["guid"]]
max_val = dmg_result[dmg_states].max(axis=1)
max_key = dmg_result[dmg_states].idxmax(axis=1)
dmg_concat = pd.concat([guids, max_val, max_key], axis=1)
dmg_concat.rename(columns={0: 'max_prob', 1: 'max_state'}, inplace=True)
dmg_concat.rename(columns={0: "max_prob", 1: "max_state"}, inplace=True)

return dmg_concat
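A quick sketch of get_max_damage_state on a hypothetical damage result:

import pandas as pd
from pyincore.utils.dataprocessutil import DataProcessUtil

dmg = pd.DataFrame({
    "guid": ["b-1", "b-2"],
    "DS_0": [0.6, 0.1],
    "DS_1": [0.2, 0.2],
    "DS_2": [0.1, 0.3],
    "DS_3": [0.1, 0.4],
})
print(DataProcessUtil.get_max_damage_state(dmg))
#   guid  max_prob max_state
# 0  b-1       0.6      DS_0
# 1  b-2       0.4      DS_3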