diff --git a/src/luna/pathology/cli/dsa_upload.py b/src/luna/pathology/cli/dsa_upload.py index 7a2d9e35..2ad29944 100644 --- a/src/luna/pathology/cli/dsa_upload.py +++ b/src/luna/pathology/cli/dsa_upload.py @@ -136,6 +136,7 @@ def upload_annotation_to_dsa( Returns: DataFrame[SlideSchema]: slide manifest """ + uuid_list = [] for slide in slide_manifest.itertuples(name="Slide"): uuids = _upload_annotation_to_dsa( dsa_endpoint_url, @@ -148,10 +149,8 @@ def upload_annotation_to_dsa( insecure, storage_options, ) - slide_manifest.at[ - slide.Index, annotation_column.replace("url", "uuid") - ] = uuids[0] - return slide_manifest + uuid_list.append(uuids[0]) + return slide_manifest.assign(**{annotation_column.replace("url", "uuid"): uuid_list}) def _upload_annotation_to_dsa( diff --git a/src/luna/pathology/cli/dsa_viz.py b/src/luna/pathology/cli/dsa_viz.py index 6f2612ab..da5cb292 100644 --- a/src/luna/pathology/cli/dsa_viz.py +++ b/src/luna/pathology/cli/dsa_viz.py @@ -125,7 +125,7 @@ def save_dsa_annotation( dsa_annotation: dict, output_urlpath: str, image_filename: str, - storage_options: dict, + storage_options: dict = {}, ): """Helper function to save annotation elements to a json file. 
@@ -152,16 +152,15 @@ def save_dsa_annotation( Path(output_urlpath_prefix) / f"{annotation_name_replaced}_{image_id}.json" ) - try: - with fs.open(output_path, "w").open() as outfile: - json.dump(dsa_annotation, outfile) - logger.info( - f"Saved {len(dsa_annotation['elements'])} to {fs.unstrip_protocol(str(output_path))}" - ) - return fs.unstrip_protocol(str(output_path)) - except Exception as exc: - logger.error(exc) - return None + if not fs.exists(output_urlpath_prefix): + fs.mkdir(output_urlpath_prefix) + + with fs.open(output_path, "w") as outfile: + json.dump(dsa_annotation, outfile) + logger.info( + f"Saved {len(dsa_annotation['elements'])} to {fs.unstrip_protocol(str(output_path))}" + ) + return fs.unstrip_protocol(str(output_path)) @timed @@ -171,9 +170,10 @@ def stardist_polygon_cli( image_filename: str = "???", annotation_name: str = "???", output_urlpath: str = "???", - line_colors: dict[str, str] = {}, - fill_colors: dict[str, str] = {}, - storage_options: dict = {}, + line_colors: Optional[Dict[str, str]] = None, + fill_colors: Optional[Dict[str, str]] = None, + storage_options: Dict = {}, + output_storage_options: Dict = {}, local_config: str = "", ): """Build DSA annotation json from stardist geojson classification results @@ -209,12 +209,12 @@ def stardist_polygon( slide_manifest: DataFrame[SlideSchema], output_urlpath: str, annotation_name: str, - line_colors: Dict[str, str], - fill_colors: Dict[str, str], - storage_options: Dict, - output_storage_options: Dict, - annotation_column: str = "stardist_polygon_geojson_url", - output_column: str = "regional_dsa_url", + line_colors: Optional[Dict[str, str]] = None, + fill_colors: Optional[Dict[str, str]] = None, + storage_options: Dict = {}, + output_storage_options: Dict = {}, + annotation_column: str = "", + output_column: str = "", ): """Build DSA annotation json from stardist geojson classification results @@ -232,12 +232,17 @@ def stardist_polygon( Returns: DataFrame[SlideSchema]: slide manifest 
""" + if not annotation_column: + annotation_column = f"{annotation_name}_geojson_url" + if not output_column: + output_column = f"{annotation_name}_dsa_url" + if annotation_column not in slide_manifest.columns: raise ValueError(f"{annotation_column} not found in slide manifest") client = get_or_create_dask_client() futures = [] - for row in slide_manifest.itertuples(name="Slide"): - image_filename = os.path.basename(row.url) + for _, row in slide_manifest.iterrows(): + image_filename = os.path.basename(row["url"]) future = client.submit( __stardist_polygon, row[annotation_column], @@ -253,8 +258,8 @@ def stardist_polygon( futures.append(future) progress(futures) dsa_annotation_urls = client.gather(futures) - for dsa_annotation_url in dsa_annotation_urls: - slide_manifest.at[row.Index, output_column] = dsa_annotation_url + for idx, dsa_annotation_url in enumerate(dsa_annotation_urls): + slide_manifest.at[idx, output_column] = dsa_annotation_url return slide_manifest @@ -264,10 +269,10 @@ def __stardist_polygon( output_urlpath: str, image_filename: str, annotation_name: str, - line_colors: Dict[str, str], - fill_colors: Dict[str, str], - storage_options: Dict, - output_storage_options: Dict, + line_colors: Optional[Dict[str, str]] = None, + fill_colors: Optional[Dict[str, str]] = None, + storage_options: Dict = {}, + output_storage_options: Dict = {}, ): """Build DSA annotation from stardist geojson classification results @@ -333,8 +338,8 @@ def stardist_polygon_tile_cli( image_filename: str = "???", annotation_name_prefix: str = "???", output_urlpath: str = "???", - line_colors: dict[str, str] = {}, - fill_colors: dict[str, str] = {}, + line_colors: Optional[Dict[str, str]] = None, + fill_colors: Optional[Dict[str, str]] = None, storage_options: dict = {}, output_storage_options: dict = {}, local_config: str = "", @@ -375,12 +380,12 @@ def stardist_polygon_tile( slide_manifest: DataFrame[SlideSchema], output_urlpath: str, annotation_name_prefix: str, - line_colors: 
Dict[str, str], - fill_colors: Dict[str, str], - storage_options: Dict, - output_storage_options: Dict, - annotation_column: str = "stardist_polygon_geojson_url", - output_column_suffix: str = "regional_dsa_url", + line_colors: Optional[Dict[str, str]] = None, + fill_colors: Optional[Dict[str, str]] = None, + storage_options: Dict = {}, + output_storage_options: Dict = {}, + annotation_column: str = "", + output_column_suffix: str = "", ): """Build DSA annotation json from stardist geojson classification and labeled tiles @@ -398,16 +403,20 @@ def stardist_polygon_tile( Returns: dict[str,str]: annotation file path """ + if not annotation_column: + annotation_column = f"{annotation_name_prefix}_geojson_url" + if not output_column_suffix: + output_column_suffix = f"{annotation_name_prefix}_dsa_url" if annotation_column not in slide_manifest.columns: raise ValueError(f"{annotation_column} not found in slide manifest") client = get_or_create_dask_client() futures = [] - for row in slide_manifest.itertuples(name="Slide"): - image_filename = os.path.basename(row.url) + for _, row in slide_manifest.iterrows(): + image_filename = os.path.basename(row["url"]) future = client.submit( __stardist_polygon_tile, row[annotation_column], - row.tiles_url, + row["tiles_url"], output_urlpath, image_filename, annotation_name_prefix, @@ -419,13 +428,16 @@ def stardist_polygon_tile( futures.append(future) progress(futures) - dsa_annotation_url_map = client.gather(futures) - for tile_label, dsa_annotation_url in dsa_annotation_url_map.iteritems(): - slide_manifest.at[ - row.Index, f"{tile_label}_{output_column_suffix}" - ] = dsa_annotation_url - - return slide_manifest + dsa_annotation_url_maps = client.gather(futures) + tile_labels = dsa_annotation_url_maps[0].keys() + return slide_manifest.assign( + **{ + f"{tile_label}_{output_column_suffix}": [ + x[tile_label] for x in dsa_annotation_url_maps + ] + for tile_label in tile_labels + } + ) def __stardist_polygon_tile( @@ -434,10 +446,10 
@@ def __stardist_polygon_tile( output_urlpath: str, image_filename: str, annotation_name_prefix: str, - line_colors: Dict[str, str], - fill_colors: Dict[str, str], - storage_options: Dict, - output_storage_options: Dict, + line_colors: Optional[Dict[str, str]] = None, + fill_colors: Optional[Dict[str, str]] = None, + storage_options: Dict = {}, + output_storage_options: Dict = {}, ): """Build DSA annotation json from stardist geojson classification and labeled tiles @@ -576,12 +588,12 @@ def stardist_cell( slide_manifest: DataFrame[SlideSchema], output_urlpath: str, annotation_name: str, - line_colors: Optional[Dict[str, str]], - fill_colors: Optional[Dict[str, str]], - storage_options: Dict, - output_storage_options: Dict, - annotation_column: str = "stardist_cell_tsv_url", - output_column: str = "stardist_cell_dsa_url", + line_colors: Optional[Dict[str, str]] = None, + fill_colors: Optional[Dict[str, str]] = None, + storage_options: Dict = {}, + output_storage_options: Dict = {}, + annotation_column: str = "", + output_column: str = "", ): """Build DSA annotation json from TSV classification data generated by stardist @@ -604,12 +616,16 @@ def stardist_cell( Returns: DataFrame[SlideSchema]: slide manifest """ + if not annotation_column: + annotation_column = f"{annotation_name}_tsv_url" + if not output_column: + output_column = f"{annotation_name}_dsa_url" if annotation_column not in slide_manifest.columns: raise ValueError(f"{annotation_column} not found in slide manifest") client = get_or_create_dask_client() futures = [] - for row in slide_manifest.itertuples(name="Slide"): - image_filename = os.path.basename(row.url) + for _, row in slide_manifest.iterrows(): + image_filename = os.path.basename(row["url"]) future = client.submit( __stardist_cell, row[annotation_column], @@ -625,10 +641,7 @@ def stardist_cell( futures.append(future) progress(futures) dsa_annotation_urls = client.gather(futures) - for dsa_annotation_url in dsa_annotation_urls: - 
slide_manifest.at[row.Index, output_column] = dsa_annotation_url - - return slide_manifest + return slide_manifest.assign(**{output_column: dsa_annotation_urls}) def __stardist_cell( @@ -636,10 +649,10 @@ def __stardist_cell( output_urlpath: str, image_filename: str, annotation_name: str, - line_colors: Optional[dict[str, str]], - fill_colors: Optional[dict[str, str]], - storage_options: dict, - output_storage_options: dict, + line_colors: Optional[Dict[str, str]] = None, + fill_colors: Optional[Dict[str, str]] = None, + storage_options: dict = {}, + output_storage_options: dict = {}, ): """Build DSA annotation json from TSV classification data generated by stardist @@ -763,12 +776,12 @@ def regional_polygon( slide_manifest: DataFrame[SlideSchema], output_urlpath: str, annotation_name: str, - line_colors: Optional[Dict[str, str]], - fill_colors: Optional[Dict[str, str]], - storage_options: Dict, - output_storage_options: Dict, - annotation_column: str = "regional_geojson_url", - output_column: str = "regional_dsa_url", + line_colors: Optional[Dict[str, str]] = None, + fill_colors: Optional[Dict[str, str]] = None, + storage_options: Dict = {}, + output_storage_options: Dict = {}, + annotation_column: str = "", + output_column: str = "", ): """Build DSA annotation json from regional annotation geojson @@ -787,12 +800,16 @@ def regional_polygon( DataFrame[SlideSchema]: slide schema """ + if not annotation_column: + annotation_column = f"{annotation_name}_geojson_url" + if not output_column: + output_column = f"{annotation_name}_dsa_url" if annotation_column not in slide_manifest.columns: raise ValueError(f"{annotation_column} not found in slide manifest") client = get_or_create_dask_client() futures = [] - for row in slide_manifest.itertuples(name="Slide"): - image_filename = os.path.basename(row.url) + for _, row in slide_manifest.iterrows(): + image_filename = os.path.basename(row["url"]) future = client.submit( __regional_polygon, row[annotation_column], @@ -808,10 
+825,7 @@ def regional_polygon( futures.append(future) progress(futures) dsa_annotation_urls = client.gather(futures) - for dsa_annotation_url in dsa_annotation_urls: - slide_manifest.at[row.Index, output_column] = dsa_annotation_url - - return slide_manifest + return slide_manifest.assign(**{output_column: dsa_annotation_urls}) def __regional_polygon( @@ -819,10 +833,10 @@ def __regional_polygon( output_urlpath: str, image_filename: str, annotation_name: str, - line_colors: Optional[dict[str, str]], - fill_colors: Optional[dict[str, str]], - storage_options: dict, - output_storage_options: dict, + line_colors: Optional[Dict[str, str]] = None, + fill_colors: Optional[Dict[str, str]] = None, + storage_options: Dict = {}, + output_storage_options: Dict = {}, ): """Build DSA annotation json from regional annotation geojson @@ -926,12 +940,12 @@ def qupath_polygon( image_filename: str, annotation_name: str, classes_to_include: List, - line_colors: Optional[Dict[str, str]], - fill_colors: Optional[Dict[str, str]], - storage_options: Dict, - output_storage_options: Dict, - annotation_column: str = "qupath_geojson_url", - output_column: str = "qupath_dsa_url", + line_colors: Optional[Dict[str, str]] = None, + fill_colors: Optional[Dict[str, str]] = None, + storage_options: Dict = {}, + output_storage_options: Dict = {}, + annotation_column: str = "", + output_column: str = "", ): """Build DSA annotation json from Qupath polygon geojson @@ -953,12 +967,16 @@ def qupath_polygon( Returns: DataFrame[SlideSchema]: slide manifest """ + if not annotation_column: + annotation_column = f"{annotation_name}_geojson_url" + if not output_column: + output_column = f"{annotation_name}_dsa_url" if annotation_column not in slide_manifest.columns: raise ValueError(f"{annotation_column} not found in slide manifest") client = get_or_create_dask_client() futures = [] - for row in slide_manifest.itertuples(name="Slide"): - image_filename = os.path.basename(row.url) + for _, row in 
slide_manifest.iterrows(): + image_filename = os.path.basename(row["url"]) future = client.submit( __qupath_polygon, row[annotation_column], @@ -975,10 +993,7 @@ def qupath_polygon( futures.append(future) progress(futures) dsa_annotation_urls = client.gather(futures) - for dsa_annotation_url in dsa_annotation_urls: - slide_manifest.at[row.Index, output_column] = dsa_annotation_url - - return slide_manifest + return slide_manifest.assign(**{output_column: dsa_annotation_urls}) def __qupath_polygon( @@ -987,10 +1002,10 @@ def __qupath_polygon( image_filename: str, annotation_name: str, classes_to_include: List, - line_colors: Optional[Dict[str, str]], - fill_colors: Optional[Dict[str, str]], - storage_options: Dict, - output_storage_options: Dict, + line_colors: Optional[Dict[str, str]] = None, + fill_colors: Optional[Dict[str, str]] = None, + storage_options: Dict = {}, + output_storage_options: Dict = {}, ): """Build DSA annotation json from Qupath polygon geojson @@ -1001,7 +1016,8 @@ def __qupath_polygon( e.g. ["Tumor", "Stroma", ...] 
line_colors (map, optional): line color map with {feature name:rgb values} fill_colors (map, optional): fill color map with {feature name:rgba values} - storage_options (dict): storage options to pass to read/write functions + storage_options (dict): storage options to pass to read functions + output_storage_options (dict): storage options to pass to write functions Returns: dict: dsa annotation @@ -1112,8 +1128,8 @@ def bitmask_polygon( output_urlpath: str, image_filename: str, annotation_name: str, - line_colors: Optional[Dict[str, str]], - fill_colors: Optional[Dict[str, str]], + line_colors: Optional[Dict[str, str]] = None, + fill_colors: Optional[Dict[str, str]] = None, scale_factor: Optional[int] = 1, storage_options: Dict = {}, output_storage_options: Dict = {}, @@ -1180,9 +1196,10 @@ def heatmap_cli( column: str = "???", tile_size: int = "???", # type: ignore scale_factor: Optional[int] = 1, - fill_colors: dict[str, str] = {}, - line_colors: dict[str, str] = {}, + fill_colors: Optional[dict[str, str]] = None, + line_colors: Optional[dict[str, str]] = None, storage_options: dict = {}, + output_storage_options: dict = {}, local_config: str = "", ): """Generate heatmap based on the tile scores @@ -1202,7 +1219,8 @@ def heatmap_cli( scale_factor (int, optional): scale to match the image on DSA. 
line_colors (dict, optional): line color map with {feature name:rgb values} fill_colors (dict, optional): fill color map with {feature name:rgba values} - storage_options (dict): storage options to pass to read/write functions + storage_options (dict): storage options to pass to read functions + output_storage_options (dict): storage options to pass to write functions local_config (string): local config yaml file Returns: @@ -1231,11 +1249,12 @@ def heatmap( annotation_name: str, column: List[str], tile_size: int, - scale_factor: Optional[int], - fill_colors: Optional[Dict[str, str]], - line_colors: Optional[Dict[str, str]], - storage_options: Dict, - output_storage_options: Dict, + scale_factor: Optional[int] = None, + fill_colors: Optional[Dict[str, str]] = None, + line_colors: Optional[Dict[str, str]] = None, + output_column: str = "", + storage_options: Dict = {}, + output_storage_options: Dict = {}, ): """Generate heatmap based on the tile scores @@ -1259,15 +1278,17 @@ def heatmap( Returns: dict: annotation file path. None if error in writing the file. 
""" + if not output_column: + output_column = f"{annotation_name}_dsa_url" if "tiles_url" not in slide_manifest.columns: raise ValueError("tiles_url not found in slide manifest") client = get_or_create_dask_client() futures = [] - for row in slide_manifest.itertuples(name="Slide"): - image_filename = os.path.basename(row.url) + for _, row in slide_manifest.iterrows(): + image_filename = os.path.basename(row["url"]) future = client.submit( __heatmap, - row.tiles_url, + row["tiles_url"], output_urlpath, image_filename, annotation_name, @@ -1283,10 +1304,7 @@ def heatmap( futures.append(future) progress(futures) dsa_annotation_urls = client.gather(futures) - for dsa_annotation_url in dsa_annotation_urls: - slide_manifest.at[row.Index, "heatmap_url"] = dsa_annotation_url - - return slide_manifest + return slide_manifest.assign(**{output_column: dsa_annotation_urls}) def __heatmap( @@ -1296,11 +1314,11 @@ def __heatmap( annotation_name: str, column: List[str], tile_size: int, - scale_factor: Optional[int], - fill_colors: Optional[Dict[str, str]], - line_colors: Optional[Dict[str, str]], - storage_options: Dict, - output_storage_options: Dict, + scale_factor: Optional[int] = None, + fill_colors: Optional[Dict[str, str]] = None, + line_colors: Optional[Dict[str, str]] = None, + storage_options: Dict = {}, + output_storage_options: Dict = {}, ): """Generate heatmap based on the tile scores @@ -1433,8 +1451,8 @@ def bmp_polygon( output_urlpath: str, label_map: Dict[int, str], annotation_name: str, - line_colors: Optional[Dict[str, str]], - fill_colors: Optional[Dict[str, str]], + line_colors: Optional[Dict[str, str]] = None, + fill_colors: Optional[Dict[str, str]] = None, scale_factor: Optional[int] = 1, storage_options: Dict = {}, output_storage_options: Dict = {}, @@ -1466,8 +1484,8 @@ def bmp_polygon( raise ValueError(f"{annotation_column} not found in slide manifest") client = get_or_create_dask_client() futures = [] - for row in slide_manifest.itertuples(name="Slide"): 
- image_filename = os.path.basename(row.url) + for _, row in slide_manifest.iterrows(): + image_filename = os.path.basename(row["url"]) future = client.submit( __bmp_polygon, row[annotation_column], @@ -1484,10 +1502,7 @@ def bmp_polygon( futures.append(future) progress(futures) dsa_annotation_urls = client.gather(futures) - for dsa_annotation_url in dsa_annotation_urls: - slide_manifest.at[row.Index, output_column] = dsa_annotation_url - - return slide_manifest + return slide_manifest.assign(**{output_column: dsa_annotation_urls}) def __bmp_polygon( @@ -1496,8 +1511,8 @@ def __bmp_polygon( image_filename: str, label_map: Dict[int, str], annotation_name: str, - line_colors: Optional[Dict[str, str]], - fill_colors: Optional[Dict[str, str]], + line_colors: Optional[Dict[str, str]] = None, + fill_colors: Optional[Dict[str, str]] = None, scale_factor: Optional[int] = 1, storage_options: Dict = {}, output_storage_options: Dict = {}, diff --git a/src/luna/pathology/cli/extract_tile_shape_features.py b/src/luna/pathology/cli/extract_tile_shape_features.py index 91a62d55..e3f76542 100644 --- a/src/luna/pathology/cli/extract_tile_shape_features.py +++ b/src/luna/pathology/cli/extract_tile_shape_features.py @@ -146,6 +146,7 @@ def extract_tile_shape_features( storage_options: dict = {}, output_storage_options: dict = {}, objects_column="stardist_geojson_url", + annotation_column="tile_shape_features_url", properties: List[str] = [ "area", "convex_area", @@ -176,6 +177,7 @@ def extract_tile_shape_features( output_storage_options (dict): storage options to pass to writing functions local_config (str): local config yaml file objects_column (str): slide manifest column name with stardist geoJSON URLs + annotation_column (str): column to add to slide manifest with url to extracted features properties (List[str]): properties to extract Returns: @@ -184,16 +186,16 @@ def extract_tile_shape_features( client = get_or_create_dask_client() futures = [] - for row in 
slide_manifest.itertuples(name="Slide"): + for _, row in slide_manifest.iterrows(): future = client.submit( __extract_tile_shape_features, row[objects_column], - row.tiles_url, - row.url, + row["tiles_url"], + row["url"], output_urlpath, resize_factor, detection_probability_threshold, - row.id, + row["id"], statistical_descriptors, cellular_features, property_type, @@ -207,10 +209,10 @@ def extract_tile_shape_features( progress(futures) results = client.gather(futures) - for idx, result in results.enumerate(): - slide_manifest.at[idx, "tile_shape_features_url"] = result["shape_features_url"] - return slide_manifest + return slide_manifest.assign( + **{annotation_column: [x["shape_features_url"] for x in results]} + ) def __extract_tile_shape_features( diff --git a/src/luna/pathology/cli/generate_tiles.py b/src/luna/pathology/cli/generate_tiles.py index eb4207b7..07d49586 100644 --- a/src/luna/pathology/cli/generate_tiles.py +++ b/src/luna/pathology/cli/generate_tiles.py @@ -31,6 +31,7 @@ def cli( dask_options: dict = {}, local_config: str = "", output_urlpath: str = ".", + force: bool = False, ) -> dict: """Rasterize a slide into smaller tiles, saving tile metadata as rows in a csv file @@ -56,6 +57,7 @@ def cli( config["slide_urlpath"], config["tile_size"], config["output_urlpath"], + config["force"], config["requested_magnification"], config["storage_options"], config["output_storage_options"], @@ -68,6 +70,7 @@ def generate_tiles( slide_manifest: DataFrame[SlideSchema], tile_size: int, output_urlpath: str, + force: bool = True, requested_magnification: Optional[int] = None, storage_options: dict = {}, output_storage_options: dict = {}, @@ -81,22 +84,21 @@ def generate_tiles( slide.url, tile_size, output_urlpath, + force, requested_magnification, storage_options, output_storage_options, ) futures.append(future) results = client.gather(futures) - for idx, result in enumerate(results): - slide_manifest.at[idx, "tiles_url"] = result["tiles_url"] - - return 
slide_manifest + return slide_manifest.assign(tiles_url=[x["tiles_url"] for x in results]) def __generate_tiles( slide_urlpath: str, tile_size: int, output_urlpath: str, + force: bool, requested_magnification: Optional[int] = None, storage_options: dict = {}, output_storage_options: dict = {}, @@ -120,7 +122,7 @@ def __generate_tiles( slide_id = Path(slide_urlpath).stem ofs, output_path = fsspec.core.url_to_fs(output_urlpath, **output_storage_options) output_file = str(Path(output_path) / f"{slide_id}.tiles.parquet") - if ofs.exists(output_file): + if not force and ofs.exists(output_file): logger.info("Output file exists: {ofs.unstrip_protocol(output_file)}") return diff --git a/src/luna/pathology/cli/infer_tile_labels.py b/src/luna/pathology/cli/infer_tile_labels.py index 1b73c74f..1fd1220e 100644 --- a/src/luna/pathology/cli/infer_tile_labels.py +++ b/src/luna/pathology/cli/infer_tile_labels.py @@ -41,6 +41,7 @@ def cli( num_cores: int = 4, batch_size: int = 8, output_urlpath: str = ".", + force: bool = False, kwargs: dict = {}, use_gpu: bool = False, dask_options: dict = {}, @@ -63,6 +64,7 @@ def cli( num_cores (int): Number of cores to use for CPU parallelization batch_size (int): size in batch dimension to chuck inference (8-256 recommended, depending on memory usage) output_urlpath (str): output/working directory + force (bool): overwrite outputs if they exist kwargs (dict): additional keywords to pass to model initialization use_gpu (bool): use GPU if available dask_options (dict): options to pass to dask client @@ -94,6 +96,7 @@ def cli( config["slide_urlpath"], config["tile_size"], (Path(temp_dir) / "generate_tiles").as_uri(), + config["force"], config["tile_magnification"], config["storage_options"], ) @@ -105,12 +108,14 @@ def cli( config["filter_query"], config["batch_size"], (Path(temp_dir) / "detect_tissue").as_uri(), + config["force"], config["storage_options"], ) save_tiles_result = _save_tiles( detect_tissue_result["tiles_urlpath"], 
config["slide_urlpath"], (Path(temp_dir) / "save_tiles").as_uri(), + config["force"], config["batch_size"], config["storage_options"], ) @@ -120,6 +125,7 @@ def cli( tiles_urlpath, slide_id, config["output_urlpath"], + config["force"], config["torch_model_repo_or_dir"], config["model_name"], config["num_cores"], @@ -143,6 +149,7 @@ def infer_tile_labels( num_cores: int = 1, batch_size: int = 2000, output_urlpath: str = ".", + force: bool = True, kwargs: dict = {}, use_gpu: bool = False, insecure: bool = False, @@ -164,6 +171,7 @@ def infer_tile_labels( num_cores (int): Number of cores to use for CPU parallelization batch_size (int): size in batch dimension to chuck inference (8-256 recommended, depending on memory usage) output_urlpath (str): output/working directory + force (bool): overwrite outputs if they exist kwargs (dict): additional keywords to pass to model initialization use_gpu (bool): use GPU if available insecure (bool): insecure SSL @@ -189,12 +197,14 @@ def infer_tile_labels( batch_size=batch_size, storage_options=storage_options, output_urlpath=output_urlpath, + force=force, output_storage_options=output_storage_options, ) slide_manifest = save_tiles( slide_manifest, output_urlpath, + force, batch_size, storage_options, output_storage_options, @@ -207,6 +217,7 @@ def infer_tile_labels( row.tiles_url, row.id, output_urlpath, + force, torch_model_repo_or_dir, model_name, num_cores, @@ -221,15 +232,14 @@ def infer_tile_labels( progress(futures) results = client.gather(futures) - for idx, result in results.enumerate(): - slide_manifest.at[idx, "tiles_url"] = result - return slide_manifest + return slide_manifest.assign(tiles_url=[x["tiles_url"] for x in results]) def __infer_tile_labels( tiles_urlpath: str, slide_id: str, output_urlpath: str, + force: bool, torch_model_repo_or_dir: str, model_name: str, num_cores: int, @@ -271,7 +281,7 @@ def __infer_tile_labels( output_file = str(Path(output_path_prefix) / f"{slide_id}.tiles.parquet") - if 
ofs.exists(output_file): + if not force and ofs.exists(output_file): logger.info(f"outputs already exist: {output_file}") return diff --git a/src/luna/pathology/cli/run_stardist_cell_detection.py b/src/luna/pathology/cli/run_stardist_cell_detection.py index a944d5ca..233354af 100644 --- a/src/luna/pathology/cli/run_stardist_cell_detection.py +++ b/src/luna/pathology/cli/run_stardist_cell_detection.py @@ -122,10 +122,9 @@ def stardist_simple( ) futures.append(future) results = client.gather(futures) - for idx, result in enumerate(results): - slide_manifest.at[idx, annotation_column] = results["geojson_url"] - - return slide_manifest + return slide_manifest.assign( + **{annotation_column: [x["geojson_url"] for x in results]} + ) @@ -182,7 +181,7 @@ def __stardist_simple( runner_type = "SINGULARITY" slide_filename = Path(slide_path).name - command = f"QuPath script --image /inputs/{slide_filename} --args [cellSize={cell_expansion_size},imageType={image_type},{debug_opts}] /scripts/stardist_simple.groovy" + command = f"QuPath script --image /inputs/{slide_filename} --args [cellSize={cell_expansion_size},imageType={image_type},{debug_opts}] /scripts/stardist_simple.groovy" logger.info(f"Launching QuPath via {runner_type}:{image} ...") logger.info( f"\tvolumes={slide_urlpath}:'/inputs/{slide_filename}', {slide_path}:'/output_dir'" ) @@ -319,10 +318,13 @@ def stardist_cell_lymphocyte( futures = [] for row in slide_manifest.itertuples(name="Slide"): + fs, output_path = fsspec.core.url_to_fs( + output_urlpath, **output_storage_options + ) future = client.submit( __stardist_cell_lymphocyte, row.url, - output_urlpath, + fs.unstrip_protocol(str(Path(output_path) / row.id)), row.id, num_cores, use_gpu, @@ -334,10 +336,9 @@ def stardist_cell_lymphocyte( ) futures.append(future) results = client.gather(futures) - for idx, result in enumerate(results): - slide_manifest.at[idx, annotation_column] = result["geojson_url"] - - return slide_manifest + return 
slide_manifest.assign( + **{annotation_column: [x["geojson_url"] for x in results]} + ) @@ local_cache_urlpath( diff --git a/src/luna/pathology/cli/run_tissue_detection.py b/src/luna/pathology/cli/run_tissue_detection.py index 0f888418..5ce861dd 100644 --- a/src/luna/pathology/cli/run_tissue_detection.py +++ b/src/luna/pathology/cli/run_tissue_detection.py @@ -104,6 +104,7 @@ def cli( tile_magnification: Optional[int] = None, batch_size: int = 2000, output_urlpath: str = ".", + force: bool = False, dask_options: dict = {}, storage_options: dict = {}, output_storage_options: dict = {}, @@ -119,6 +120,7 @@ def cli( tile_magnification (Optional[int]): Magnification scale at which to generate tiles batch_size (int): batch size for processing output_urlpath (str): Output url/path + force (bool): overwrite outputs if they exist dask_options (dict): dask options storage_options (dict): storage options to pass to reading functions output_storage_options (dict): storage options to pass to writing functions @@ -157,6 +159,7 @@ def cli( config["filter_query"], config["batch_size"], config["output_urlpath"], + config["force"], config["storage_options"], config["output_storage_options"], ) @@ -171,6 +174,7 @@ def detect_tissue( tile_magnification: Optional[int] = None, filter_query: str = "", batch_size: int = 2000, + force: bool = True, storage_options: dict = {}, output_urlpath: str = ".", output_storage_options: dict = {}, @@ -183,6 +187,7 @@ def detect_tissue( slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl tile_magnification (Optional[int]): Magnification scale at which to generate tiles filter_query (str): pandas query by which to filter tiles based on their various tissue detection scores batch_size (int): batch size for processing + force (bool): overwrite outputs if they exist storage_options (dict): storage options to pass to reading functions output_urlpath (str): Output url/path output_storage_options (dict): storage options to pass to writing functions @@ -213,6 +218,7 @@ def detect_tissue( filter_query, 
batch_size, output_urlpath, + force, storage_options, output_storage_options, ) @@ -221,8 +227,9 @@ def detect_tissue( results = client.gather(futures) - for idx, result in enumerate(results): - slide_manifest.at[idx, "tiles_url"] = result["tiles_url"] + slide_manifest = slide_manifest.assign( + tiles_url=[x["tiles_url"] for x in results] + ) return slide_manifest @@ -240,6 +247,7 @@ def __detect_tissue( filter_query: str = "", batch_size: int = 2000, output_urlpath: str = ".", + force: bool = False, storage_options: dict = {}, output_storage_options: dict = {}, ) -> Dict: @@ -247,8 +255,11 @@ def __detect_tissue( output_urlpath, **output_storage_options ) + if not output_filesystem.exists(output_path): + output_filesystem.mkdir(output_path) + tiles_output_path = str(Path(output_path) / f"{slide_id}.tiles.parquet") - if output_filesystem.exists(tiles_output_path): + if not force and output_filesystem.exists(tiles_output_path): logger.info( "Outputs already exist: {output_filesystem.unstrip_protocol(tiles_output_path)}" ) diff --git a/src/luna/pathology/cli/save_tiles.py b/src/luna/pathology/cli/save_tiles.py index dfdbc211..07b00716 100644 --- a/src/luna/pathology/cli/save_tiles.py +++ b/src/luna/pathology/cli/save_tiles.py @@ -24,6 +24,7 @@ def cli( tiles_urlpath: str = "???", batch_size: int = 2000, output_urlpath: str = ".", + force: bool = False, storage_options: dict = {}, output_storage_options: dict = {}, dask_options: dict = {}, @@ -39,6 +40,7 @@ def cli( tiles_urlpath (str): url/path to tile manifest (.parquet) batch_size (int): size in batch dimension to chuck jobs output_urlpath (str): output url/path prefix + force (bool): overwrite outputs if they exist storage_options (dict): storage options to reading functions output_storage_options (dict): storage options to writing functions dask_options (dict): dask options @@ -55,6 +57,7 @@ def cli( config["tiles_urlpath"], config["slide_urlpath"], config["output_urlpath"], + config["force"], 
config["batch_size"], config["storage_options"], config["output_storage_options"], @@ -66,6 +69,7 @@ def cli( def save_tiles( slide_manifest: DataFrame[SlideSchema], output_urlpath: str, + force: bool = True, batch_size: int = 2000, storage_options: dict = {}, output_storage_options: dict = {}, @@ -78,6 +82,7 @@ def save_tiles( Args: slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl output_urlpath (str): output url/path prefix + force (bool): overwrite outputs if they exist batch_size (int): size in batch dimension to chuck jobs storage_options (dict): storage options to reading functions output_storage_options (dict): storage options to writing functions @@ -104,6 +109,7 @@ def save_tiles( slide.tiles_url, slide.url, output_urlpath, + force, batch_size, storage_options, output_storage_options, @@ -111,16 +117,14 @@ def save_tiles( futures.append(future) results = client.gather(futures) - for idx, result in enumerate(results): - slide_manifest.at[idx, "tiles_url"] = result["tiles_url"] - - return slide_manifest + return slide_manifest.assign(tiles_url=[x["tiles_url"] for x in results]) def _save_tiles( tiles_urlpath: str, slide_urlpath: str, output_urlpath: str, + force: bool, batch_size: int = 2000, storage_options: dict = {}, output_storage_options: dict = {},