From e8c34c67c345264f8817a84aff3b47d40686220b Mon Sep 17 00:00:00 2001 From: Jonah Paten Date: Fri, 20 Dec 2024 21:38:35 -0800 Subject: [PATCH] feat: 4305 added gspread analytics package (#4314) * feat: added spreadsheet functions to ga package (#4305) * chore: refactored ga metric and dimension names to constants (#4305) * chore: finished refactor (#4305) * chore: bumped analytics package version (#4310) * chore: added new dependncies to setup.py (#4310) * fix: removed redundant api authentication (#4305) --- analytics/analytics_package/analytics/api.py | 6 + .../analytics_package/analytics/fields.py | 25 ++ .../analytics_package/analytics/sheets_api.py | 245 ++++++++++++++++++ .../analytics/sheets_elements.py | 98 +++++++ analytics/analytics_package/setup.py | 4 +- analytics/requirements.txt | 2 + 6 files changed, 378 insertions(+), 2 deletions(-) create mode 100644 analytics/analytics_package/analytics/fields.py create mode 100644 analytics/analytics_package/analytics/sheets_api.py create mode 100644 analytics/analytics_package/analytics/sheets_elements.py diff --git a/analytics/analytics_package/analytics/api.py b/analytics/analytics_package/analytics/api.py index 0b78935f7..dd3f459ec 100644 --- a/analytics/analytics_package/analytics/api.py +++ b/analytics/analytics_package/analytics/api.py @@ -24,6 +24,12 @@ lambda service, params: service.reports().query(**params).execute() ) +drive_service_params = ( + ["https://www.googleapis.com/auth/drive", "https://www.googleapis.com/auth/spreadsheets"], + "drive", "v3", + {}, +) + next_port = None default_service_system = None diff --git a/analytics/analytics_package/analytics/fields.py b/analytics/analytics_package/analytics/fields.py new file mode 100644 index 000000000..ec328bc80 --- /dev/null +++ b/analytics/analytics_package/analytics/fields.py @@ -0,0 +1,25 @@ +# Metric names +METRIC_EVENT_COUNT = 'eventCount' +METRIC_TOTAL_USERS = 'totalUsers' + +# DIMENSIONS +DIMENSION_PAGE_PATH = { + 'id': 'pagePath', + 
'alias': 'page_path', + 'remove_matches': None, +} +DIMENSION_BUILTIN_URL = { + 'id': 'linkUrl', + 'alias': 'builtin_url', + 'remove_matches': r"\s*", +} +DIMENSION_EVENT_NAME = { + 'id': 'eventName', + 'alias': 'event_name', + 'remove_matches': None, +} +DIMENSION_CUSTOM_URL = { + 'id': 'customEvent:click_url', + 'alias': 'outbound_url', + 'remove_matches': r"\(not set\)", +} diff --git a/analytics/analytics_package/analytics/sheets_api.py b/analytics/analytics_package/analytics/sheets_api.py new file mode 100644 index 000000000..9dfbeef64 --- /dev/null +++ b/analytics/analytics_package/analytics/sheets_api.py @@ -0,0 +1,245 @@ +import gspread +import gspread_formatting +from enum import Enum +from googleapiclient.discovery import build + +class FILE_OVERRIDE_BEHAVIORS(Enum): + OVERRIDE_IF_IN_SAME_PLACE = 1 + EXIT_IF_IN_SAME_PLACE = 2 + EXIT_ANYWHERE = 3 + +class WORKSHEET_OVERRIDE_BEHAVIORS(Enum): + OVERRIDE = 1 + EXIT = 2 + +FONT_SIZE_PTS = 10 +PTS_PIXELS_RATIO = 4/3 +DEFAULT_BUFFER_CHARS = 2 + +def extract_credentials(authentication_response): + """Extracts the credentials from the tuple from api.authenticate""" + return authentication_response[3] + +def authenticate_gspread(authentication_response): + """Authenticates the gspread client using the credentials in the tuple from api.authenticate""" + gc = gspread.authorize(extract_credentials(authentication_response)) + return gc + +def authenticate_drive_api(authentication_response): + """Authenticates the Drive API using the response from api.authenticate""" + return authentication_response[0] + +def check_sheet_exists(gc, sheet_name): + """ + Determine if a sheet named 'sheet_name' exists anywhere in the project. 
+ + :param gc: the gspread client + :param sheet_name: the name of the sheet to check for + :returns: True if the sheet exists, otherwise False + """ + try: + gc.open(sheet_name) + return True + except gspread.exceptions.SpreadsheetNotFound: + return False + +def execute_drive_list(drive_api, search_params): + """ + Execute a files().list() request on the Drive API with the given search parameters. + Returns the 'files' components of the response. + + Positional arguments: + :param drive_api: the Drive API object + :param search_params: the search parameters, see https://developers.google.com/drive/api/v3/search-files + :returns: the 'files' components of the response + """ + files_found = [] + page_token = None + while True: + request = drive_api.files().list(q=search_params, spaces="drive", pageToken=page_token) + response = request.execute() + page_token = response.get("nextPageToken", None) + files_found += response.get("files", []) + if page_token is None: + break + return files_found + +def search_for_folder_id(drive_api, folder_name, allow_trashed = False, allow_duplicates = False): + """ + Search for a folder by name in the Drive API. + Returns a list of folder ids that match the search criteria. 
+ + :param drive_api: the Drive API object + :param folder_name: the name of the folder to search for + :param allow_trashed: whether to include trashed folders in the search, defaults to False + :param allow_duplicates: whether to allow multiple folders with the same name, defaults to False + :returns: a list of folder ids that match the search criteria + """ + search_params = f"name = '{folder_name}' and mimeType = 'application/vnd.google-apps.folder'" + if not allow_trashed: + search_params += " and trashed = false" + + files = execute_drive_list(drive_api, search_params) + files_exact_match = tuple(filter(lambda file: file["name"] == folder_name, files)) + + if len(files_exact_match) > 1: + if not allow_duplicates: + raise RuntimeError("Too many files returned") + if len(files_exact_match) == 0: + raise RuntimeError("No such folder exists") + + return [file["id"] for file in files_exact_match] + + +def create_sheet_in_folder(authentication_response, sheet_name, parent_folder_name=None, override_behavior=FILE_OVERRIDE_BEHAVIORS.EXIT_ANYWHERE): + """ + Create a new sheet in the project with the given name and parent folder. + Returns the new sheet. 
+ + :param authentication_response: the service parameters tuple + :param sheet_name: the name of the new sheet + :param parent_folder_name: the name of the parent folder for the new sheet + :param override_behavior: the behavior to take if the sheet already exists + :returns: the gspread.Spreadsheet object of the new sheet + :rtype: gspread.Spreadsheet + """ + # Build Drive API + gc = authenticate_gspread(authentication_response) + drive_api = authenticate_drive_api(authentication_response) + parent_folder_id = None if parent_folder_name is None else search_for_folder_id(drive_api, parent_folder_name)[0] + + # Check if sheet already exists and handle based on input + if check_sheet_exists(gc, sheet_name): + if override_behavior == FILE_OVERRIDE_BEHAVIORS.EXIT_ANYWHERE: + raise RuntimeError("Sheet already exists") + matching_search = f"name = '{sheet_name}' and mimeType = 'application/vnd.google-apps.spreadsheet'" + if parent_folder_id is None: + matching_search += " and 'root' in parents" + else: + matching_search += f" and '{parent_folder_id}' in parents" + matching_files = execute_drive_list(drive_api, matching_search) + + if len(matching_files) > 0: + if override_behavior == FILE_OVERRIDE_BEHAVIORS.EXIT_IF_IN_SAME_PLACE: + raise RuntimeError("File already exists in the same folder") + elif override_behavior == FILE_OVERRIDE_BEHAVIORS.OVERRIDE_IF_IN_SAME_PLACE: + for file in matching_files: + drive_api.files().delete(fileId=file["id"]).execute() + # Create file body + body = { + 'name': sheet_name, + 'mimeType': 'application/vnd.google-apps.spreadsheet', + } + if parent_folder_id is not None: + body["parents"] = [parent_folder_id] + request = drive_api.files().create(body=body) + new_sheet = request.execute() + + # Get id of fresh sheet + spread_id = new_sheet["id"] + + # Open new file + return gc.open_by_key(spread_id) + +def fill_worksheet_with_df( + sheet, + df, + worksheet_name, + overlapBehavior, + options={ + "bold_header": True, + "center_header": True, + 
"freeze_header": True, + "column_widths": {"justify": True, "buffer_chars": DEFAULT_BUFFER_CHARS} + } + ): + """ + Fill a worksheet with the contents of a DataFrame. + If the worksheet already exists, the behavior is determined by overlapBehavior. + The options dictionary can be used to customize the formatting of the worksheet. + + :param sheet: the gspread.Spreadsheet object + :param df: the DataFrame to fill the worksheet with + :param worksheet_name: the name of the worksheet to fill. Cannot be "Sheet1" + :param overlapBehavior: the behavior to take if the worksheet already exists. + :param options: the formatting options for the worksheet. + Should be a dictionary with optional elements "bold_header", "center_header", "freeze_header", and "column_widths", optional + """ + # Sheet1 is special since it's created by default, so it's not allowed + assert worksheet_name != "Sheet1" + + # Check if worksheet already exists and handle based on overlapBehavior + try: + worksheet = sheet.worksheet(worksheet_name) + if overlapBehavior == WORKSHEET_OVERRIDE_BEHAVIORS.EXIT: + raise RuntimeError("Worksheet already exists") + except gspread.exceptions.WorksheetNotFound: + worksheet = sheet.add_worksheet( + title=worksheet_name, rows=df.shape[0], cols=df.shape[1] + ) + + # Add data to worksheet + worksheet.update([df.columns.values.tolist()] + df.values.tolist()) + + # Format worksheet + # Justify Column Widths + if "column_widths" not in options or options["column_widths"]["justify"]: + text_widths = df.astype(str).columns.map( + lambda column_name: df[column_name].astype(str).str.len().max() + ) + header_widths = df.columns.str.len() + buffer_chars = ( + DEFAULT_BUFFER_CHARS + if ("column_widths" not in options or "buffer_chars" not in options["column_widths"]) + else options["column_widths"]["buffer_chars"] + ) + column_widths = [ + round((max(len_tuple) + buffer_chars) * FONT_SIZE_PTS * 1/PTS_PIXELS_RATIO) + for len_tuple in zip(text_widths, header_widths) + ] + 
column_positions = [ + gspread.utils.rowcol_to_a1(1, i + 1)[0] for i, _ in enumerate(column_widths) + ] + gspread_formatting.set_column_widths(worksheet, zip(column_positions, column_widths)) + # Freeze Header + if "freeze_header" not in options or options["freeze_header"]: + gspread_formatting.set_frozen(worksheet, rows=1) + format_options = gspread_formatting.CellFormat() + # Bold Header + if "bold_header" not in options or options["bold_header"]: + format_options += gspread_formatting.CellFormat(textFormat=gspread_formatting.TextFormat(bold=True)) + # Center Header + if "center_header" not in options or options["center_header"]: + format_options += gspread_formatting.CellFormat(horizontalAlignment="CENTER") + gspread_formatting.format_cell_range( + worksheet, + f"A1:{gspread.utils.rowcol_to_a1(1, len(df.columns))}", + format_options + ) + + # Delete Sheet1 if it has been created by default + if "Sheet1" in [i.title for i in sheet.worksheets()]: + sheet.del_worksheet(sheet.worksheet("Sheet1")) + +def fill_spreadsheet_with_df_dict(sheet, df_dict, overlapBehavior, options={}): + """ + Fill a sheet with the contents of a dictionary of DataFrames. + The keys of the dictionary are the names of the worksheets, and the values contain the data to be placed in the sheet. + If any worksheets would be overridden, the behavior is determined by overlapBehavior. + + :param sheet: the gspread.Spreadsheet object + :param df_dict: the dictionary of DataFrames to fill the worksheets with + :param overlapBehavior: the behavior to take if any of the worksheets already exist + :param options: the formatting options for the worksheets. 
+ Should be a dictionary with optional elements "bold_header", "center_header", "freeze_header", and "column_widths", optional + """ + if overlapBehavior == WORKSHEET_OVERRIDE_BEHAVIORS.EXIT: + for worksheet_name in df_dict.keys(): + try: + sheet.worksheet(worksheet_name) + raise RuntimeError("Worksheet already exists") + except gspread.exceptions.WorksheetNotFound: + pass + for worksheet_name, df in df_dict.items(): + fill_worksheet_with_df(sheet, df, worksheet_name, overlapBehavior, options=options) + diff --git a/analytics/analytics_package/analytics/sheets_elements.py b/analytics/analytics_package/analytics/sheets_elements.py new file mode 100644 index 000000000..828d9303c --- /dev/null +++ b/analytics/analytics_package/analytics/sheets_elements.py @@ -0,0 +1,98 @@ +import pandas as pd +from .charts import get_data_df +from .fields import * +from urllib.parse import urlparse + +def get_flat_data_df(analytics_params, metrics, dimensions, remove_matches=None): + """ + Get a df from the Analytics API with a flat structure (no multiindex). + + :param analytics_params: the parameters for the Analytics API, including authentication and property ids + :param metrics: the metrics to get + :param dimensions: the dimensions to get + :param remove_matches: a list of regex patterns or None elements to remove from each dimension. + Each regex or None element should correspond with an element of dimensions and remove_matches must be the same length as dimensions. + If the value is None, no patterns are removed, defaults to None. 
+ + :return: a DataFrame with the data from the Analytics API + """ + if remove_matches is not None: + assert len(remove_matches) == len(dimensions) + + df = get_data_df( + metrics, + [dimension["id"] for dimension in dimensions], + **analytics_params, + ) + if remove_matches is not None: + for i, match in enumerate([dimension["remove_matches"] for dimension in dimensions]): + if match is not None: + df = df.loc[~df.index.get_level_values(i).str.fullmatch(match)] + return df.reset_index().rename(columns=get_rename_dict(dimensions)).copy() + +def get_rename_dict(dimensions): + """Get a dictionary to rename the columns of a DataFrame.""" + return dict( + zip([dimension["id"] for dimension in dimensions], [dimension["alias"] for dimension in dimensions]) + ) + +def get_outbound_sheets_df(analytics_params): + """ + Get a DF with outbound links from the Analytics API. Merges the builtin and custom events for outbound links. + + :param analytics_params: the parameters for the Analytics API, including authentication and property ids + :return: a DataFrame with the outbound links from the Analytics API + """ + pd.set_option('future.no_silent_downcasting', True) + # Get the builtin "Click" event + df_builtin_links = get_flat_data_df( + analytics_params, + [METRIC_EVENT_COUNT, METRIC_TOTAL_USERS], + [DIMENSION_PAGE_PATH, DIMENSION_BUILTIN_URL, DIMENSION_EVENT_NAME], + remove_matches=[None, r"\s*", None] + ).groupby( + [DIMENSION_PAGE_PATH["alias"], DIMENSION_BUILTIN_URL["alias"]] + ).sum().reset_index() + + # Get the custom "outbound_link_click" event + df_custom_links = get_flat_data_df( + analytics_params, + [METRIC_EVENT_COUNT, METRIC_TOTAL_USERS], + [DIMENSION_EVENT_NAME, DIMENSION_CUSTOM_URL, DIMENSION_PAGE_PATH], + remove_matches=[DIMENSION_EVENT_NAME["remove_matches"], r"\(not set\)", None], + ).groupby( + [DIMENSION_PAGE_PATH["alias"], DIMENSION_CUSTOM_URL["alias"]] + ).sum().reset_index() + # Concatenate the two dataframes, avoiding duplicates + # Keep the link from 
the builtin event, unless the link contains a #fragment, in which case keep the link from the custom event + df_builtin_links["builtin"] = True + df_builtin_links["truncated_url"] = df_builtin_links[DIMENSION_BUILTIN_URL["alias"]] + df_custom_links["truncated_url"] = df_custom_links[DIMENSION_CUSTOM_URL["alias"]].str.replace(r"#.*", "", regex=True) + df_outbound_links_fragments = df_custom_links.loc[df_custom_links[DIMENSION_CUSTOM_URL["alias"]].str.contains("#")].copy() + df_outbound_links_fragments["is_fragment"] = True + df_all_links = pd.concat( + [df_builtin_links, df_outbound_links_fragments], ignore_index=True + ) + # Use the builtin link, unless the link is not in the custom links, in which case use the custom link + df_all_links = df_all_links.loc[ + ~(df_all_links["truncated_url"].isin(df_outbound_links_fragments["truncated_url"]) & df_all_links["builtin"]) + ].sort_values(METRIC_EVENT_COUNT, ascending=False) + df_all_links["is_fragment"] = df_all_links["is_fragment"].fillna(False).astype(bool) + # Use the builtin link, unless the link is a fragment, in which case use the custom link + df_all_links["complete_url"] = df_all_links["builtin_url"].where( + ~df_all_links["is_fragment"], + df_all_links["outbound_url"] + ) + df_all_links["hostname"] = df_all_links["complete_url"].map(lambda x: urlparse(x).hostname) + df_all_links = df_all_links.drop( + columns=["builtin_url", "outbound_url", "builtin", "is_fragment"] + ).rename( + columns={ + DIMENSION_PAGE_PATH["alias"]: "Page Path", + "complete_url": "Outbound Link", + METRIC_EVENT_COUNT: "Total Clicks", + METRIC_TOTAL_USERS: "Total Users", + "hostname": "Hostname", + } + )[["Page Path", "Hostname", "Outbound Link", "Total Clicks", "Total Users"]] + return df_all_links.copy().reset_index(drop=True) \ No newline at end of file diff --git a/analytics/analytics_package/setup.py b/analytics/analytics_package/setup.py index 075d133f6..76ac7ad07 100644 --- a/analytics/analytics_package/setup.py +++ 
b/analytics/analytics_package/setup.py @@ -2,7 +2,7 @@ setup( name="analytics", - version="3.0.1", + version="3.1.0", packages=["analytics"], - install_requires=["matplotlib", "pandas", "numpy", "google-auth-oauthlib", "google-api-python-client"], + install_requires=["matplotlib", "pandas", "numpy", "google-auth-oauthlib", "google-api-python-client", "gspread", "gspread-formatting"], ) \ No newline at end of file diff --git a/analytics/requirements.txt b/analytics/requirements.txt index f6937ad6c..fa7f71974 100644 --- a/analytics/requirements.txt +++ b/analytics/requirements.txt @@ -26,6 +26,8 @@ google-auth==2.34.0 google-auth-httplib2==0.2.0 google-auth-oauthlib==1.2.1 googleapis-common-protos==1.65.0 +gspread==6.1.4 +gspread-formatting==1.2.0 httplib2==0.22.0 idna==3.10 imagesize==1.4.1