From 13725a25858634bf8a730210e631fc8b269b809c Mon Sep 17 00:00:00 2001 From: schwarzam Date: Tue, 11 Jun 2024 11:31:49 -0300 Subject: [PATCH 1/3] refactor dict_to_query_urlparams and add unit tests --- pyproject.toml | 1 + src/hipscat/io/paths.py | 40 ++++++++++++++++++++++++++++++++-- tests/hipscat/io/test_paths.py | 24 ++++++++++++++++++++ 3 files changed, 63 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1e783288..d47b324c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ dynamic = ["version"] requires-python = ">=3.9" dependencies = [ + "aiohttp", # http filesystem support "astropy", "fsspec>=2023.10.0", # Used for abstract filesystems "healpy", diff --git a/src/hipscat/io/paths.py b/src/hipscat/io/paths.py index d4bb5ec5..09fd6c24 100644 --- a/src/hipscat/io/paths.py +++ b/src/hipscat/io/paths.py @@ -4,6 +4,9 @@ import re from typing import Dict, List +from urllib.parse import parse_qs, urlencode, urlparse, urlunparse + +from fsspec.implementations.http import HTTPFileSystem from hipscat.io.file_io.file_pointer import FilePointer, append_paths_to_pointer, get_fs from hipscat.pixel_math.healpix_pixel import INVALID_PIXEL, HealpixPixel @@ -87,7 +90,10 @@ def get_healpix_from_path(path: str) -> HealpixPixel: def pixel_catalog_files( - catalog_base_dir: FilePointer, pixels: List[HealpixPixel], storage_options: Dict | None = None + catalog_base_dir: FilePointer, + pixels: List[HealpixPixel], + query_params: dict = None, + storage_options: Dict | None = None, ) -> List[FilePointer]: """Create a list of path *pointers* for pixel catalog files. This will not create the directory or files. @@ -103,25 +109,55 @@ def pixel_catalog_files( Args: catalog_base_dir (FilePointer): base directory of the catalog (includes catalog name) pixels (List[HealpixPixel]): the healpix pixels to create pointers to + query_params (dict): Params to append to URL. Ex: {'cols': ['ra', 'dec'], 'fltrs': ['r>=10', 'g<18']} storage_options (dict): the storage options for the file system to target when generating the paths + Returns (List[FilePointer]): A list of paths to the pixels, in the same order as the input pixel list. """ fs, _ = get_fs(catalog_base_dir, storage_options) base_path_stripped = catalog_base_dir.removesuffix(fs.sep) + + url_params = "" + if isinstance(fs, HTTPFileSystem) and query_params: + url_params = dict_to_query_urlparams(query_params) + return [ fs.sep.join( [ base_path_stripped, f"{ORDER_DIRECTORY_PREFIX}={pixel.order}", f"{DIR_DIRECTORY_PREFIX}={pixel.dir}", - f"{PIXEL_DIRECTORY_PREFIX}={pixel.pixel}.parquet", + f"{PIXEL_DIRECTORY_PREFIX}={pixel.pixel}.parquet" + url_params, ] ) for pixel in pixels ] +def dict_to_query_urlparams(query_params: dict) -> str: + """Converts a dictionary to a url query parameter string + + Args: + query_params (dict): dictionary of query parameters. + Returns: + query parameter string to append to a url + """ + + if not query_params: + return "" + + query = {} + for key, value in query_params.items(): + if isinstance(value, list): + value = ",".join(value).replace(" ", "") + query[key] = value + + # Build the query string and add the "?" prefix + url_params = "?" + urlencode(query, doseq=True) + return url_params + + def pixel_catalog_file(catalog_base_dir: FilePointer, pixel_order: int, pixel_number: int) -> FilePointer: """Create path *pointer* for a pixel catalog file. This will not create the directory or file. diff --git a/tests/hipscat/io/test_paths.py b/tests/hipscat/io/test_paths.py index 40c54e05..12790a9c 100644 --- a/tests/hipscat/io/test_paths.py +++ b/tests/hipscat/io/test_paths.py @@ -54,6 +54,30 @@ def test_pixel_catalog_files(): assert expected == result +def test_pixel_catalog_files_w_query_params(): + expected = [ + "https://foo/Norder=0/Dir=0/Npix=5.parquet?columns=ID%2CRA%2CDEC%2Cr_auto&filters=r_auto%3C13" + ] + query_params = {"columns": ["ID", "RA", "DEC", "r_auto"], "filters": ["r_auto<13"]} + result = paths.pixel_catalog_files("https://foo", [HealpixPixel(0, 5)], query_params=query_params) + assert expected == result + + +def test_dict_to_query_urlparams(): + expected = "?columns=ID%2CRA%2CDEC%2Cr_auto&filters=r_auto%3C13" + query_params = {"columns": ["ID", "RA", "DEC", "r_auto"], "filters": ["r_auto<13"]} + result = paths.dict_to_query_urlparams(query_params) + assert result == expected + + expected = "?columns=ID%2CRA%2CDEC%2Cr_auto&filters=r_auto%3C13" + query_params = {"columns": [" ID", "RA ", "DEC ", "r_auto"], "filters": ["r_auto < 13"]} + result = paths.dict_to_query_urlparams(query_params) + assert result == expected + + result = paths.dict_to_query_urlparams({}) + assert result == "" + + def test_get_healpix_from_path(): expected = HealpixPixel(5, 34) From 9d13f620151b2d0a287a4210cf4660f90ffc2bb2 Mon Sep 17 00:00:00 2001 From: schwarzam Date: Wed, 12 Jun 2024 11:59:26 -0300 Subject: [PATCH 2/3] cleaned not used urllib imports --- src/hipscat/io/paths.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hipscat/io/paths.py b/src/hipscat/io/paths.py index 09fd6c24..e99d6122 100644 --- a/src/hipscat/io/paths.py +++ b/src/hipscat/io/paths.py @@ -4,7 +4,7 @@ import re from typing import Dict, List -from urllib.parse import parse_qs, urlencode, urlparse, urlunparse +from urllib.parse import urlencode from fsspec.implementations.http import HTTPFileSystem From d261c8e94768b043b2f83b68c64b5f78d2e017a9 Mon Sep 17 00:00:00 2001 From: schwarzam Date: Wed, 12 Jun 2024 13:16:34 -0300 Subject: [PATCH 3/3] handled more edge cases on dict_to_query_urlparams --- src/hipscat/io/paths.py | 5 +++++ tests/hipscat/io/test_paths.py | 12 ++++++++++++ 2 files changed, 17 insertions(+) diff --git a/src/hipscat/io/paths.py b/src/hipscat/io/paths.py index e99d6122..17ade760 100644 --- a/src/hipscat/io/paths.py +++ b/src/hipscat/io/paths.py @@ -149,10 +149,15 @@ def dict_to_query_urlparams(query_params: dict) -> str: query = {} for key, value in query_params.items(): + if not all([key, value]): + continue if isinstance(value, list): value = ",".join(value).replace(" ", "") query[key] = value + if not query: + return "" + # Build the query string and add the "?" prefix url_params = "?" + urlencode(query, doseq=True) return url_params diff --git a/tests/hipscat/io/test_paths.py b/tests/hipscat/io/test_paths.py index 12790a9c..2c0bdca3 100644 --- a/tests/hipscat/io/test_paths.py +++ b/tests/hipscat/io/test_paths.py @@ -77,6 +77,18 @@ def test_dict_to_query_urlparams(): result = paths.dict_to_query_urlparams({}) assert result == "" + result = paths.dict_to_query_urlparams(None) + assert result == "" + + result = paths.dict_to_query_urlparams({"": ""}) + assert result == "" + + result = paths.dict_to_query_urlparams({None: ""}) + assert result == "" + + result = paths.dict_to_query_urlparams({"": "nonempty"}) + assert result == "" + def test_get_healpix_from_path(): expected = HealpixPixel(5, 34)