From 1c2fb0b25d5dd946a88eb87100b18c1b3d4eecfc Mon Sep 17 00:00:00 2001
From: Alex Kong <alkong@ucdavis.edu>
Date: Wed, 11 Dec 2024 14:18:43 -0800
Subject: [PATCH 01/12] Format the maldi-pipeline notebook more cleanly

---
 templates/maldi-pipeline.ipynb | 310 +++++++++++++++------------------
 1 file changed, 137 insertions(+), 173 deletions(-)

diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb
index de1c35b..5a57e7b 100644
--- a/templates/maldi-pipeline.ipynb
+++ b/templates/maldi-pipeline.ipynb
@@ -4,14 +4,22 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# MALDI Extraction"
+    "# MALDI Extraction\n",
+    "\n",
+    "This notebook is the analysis pipeline for the MALDI data. For each MALDI run, this notebook runs the following workflow:\n",
+    "\n",
+    "- **Spectra extraction**: read m/z spectra and corresponding intensities from the binary files\n",
+    "- **Peak filtering**: identify most prominent m/z peaks, along with their widths, heights, areas, etc.\n",
+    "- **Coordinate integration**: map peak information onto the MALDI slide\n",
+    "- **Glycan matching**: map filtered peaks to master glycan list\n",
+    "- **Core-level analysis (TMAs only)**: crop out specific cores, extract core-level stats"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Libraries"
+    "## 0. Import Modules"
    ]
   },
   {
@@ -34,21 +42,23 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## File Paths"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
    "metadata": {
     "tags": []
    },
-   "outputs": [],
    "source": [
-    "data_name = \"panc2055_imzML\"\n",
-    "data_file = pathlib.Path(data_name) / \"panc2055.imzML\""
+    "## 1. Define Global Constants\n",
+    "\n",
+    "### 1.1. File paths\n",
+    "\n",
+    "The following variables are defined pointing to paths in your MALDI run.\n",
+    "\n",
+    "* `base_dir`: the path to your MALDI data. **Change this to match your run location**.\n",
+    "* `library_dir`: the path inside `base_dir` to your `\"libraries\"` folder. This will contain the master list to use for glycan matching.\n",
+    "* `extraction_dir`: the path inside `base_dir` to your `\"extracted\"` folder. Contains the integrated images for each filtered peak and glycan-matched peak across the slide.\n",
+    "* `debug_dir`: the path inside `base_dir` to your `\"debug\"` folder. Individual peak height and width info is saved here for debugging purposes.\n",
+    "* `imzml_dir`: the path inside `base_dir` to your `\"imzml\"` folder. This will contain the `\".imzml\"` and `\".ibd\"` files extracted from SCiLS.\n",
+    "* `imzml_file`: the name of the `imzml` file saved inside the `imzml_dir`. **Change this to match your imzml file name**.\n",
+    "* `imzml_path`: the full path to the `imzml_file`"
    ]
   },
   {
@@ -59,22 +69,21 @@
    },
    "outputs": [],
    "source": [
-    "base_dir = pathlib.Path(\"../data\")\n",
-    "imzml_dir = base_dir / \"imzml\"\n",
+    "base_dir = pathlib.Path(\"../data/panc2055\")\n",
     "library_dir = base_dir / \"libraries\"\n",
     "extraction_dir = base_dir / data_name / \"extracted\"\n",
-    "debug_dir = base_dir / data_name / \"debug\""
+    "debug_dir = base_dir / data_name / \"debug\"\n",
+    "\n",
+    "imzml_dir = base_dir / \"imzml\"\n",
+    "imzml_file = \"panc2055.imzML\"\n",
+    "imzml_path = imzml_dir / data_file"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
+   "cell_type": "markdown",
+   "metadata": {},
    "source": [
-    "data_path = imzml_dir / data_file"
+    "Create the directory structure (if it already exists, nothing is overwritten)."
    ]
   },
   {
@@ -95,7 +104,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Plotting Parameters"
+    "### 1.2. Plotting parameters\n",
+    "\n",
+    "Define the plotting parameters."
    ]
   },
   {
@@ -122,32 +133,32 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Load necessary files"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### ImzML Data file"
+    "### 1.3. Intensity percentile\n",
+    "\n",
+    "Define the percentile to use for thresholding intensity values."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "tags": []
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "imz_data = ImzMLParser(data_path, include_spectra_metadata=\"full\")"
+    "intensity_percentile = 99"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Library Peak List"
+    "## 2. Data Loading\n",
+    "\n",
+    "### 2.1. Spectra\n",
+    "\n",
+    "Load in the spectra information defined by the `.imzml`/`.ibd` file. The following variables are extracted:\n",
+    "\n",
+    "* `total_mass_df`: tabulates every m/z value found along with their corresponding intensity (summed across all pixels).\n",
+    "* `thresholds`: defines the nth intensity value (defined by `intensity_percentile`, computed across all m/z values) found across each pixel."
    ]
   },
   {
@@ -158,97 +169,62 @@
    },
    "outputs": [],
    "source": [
-    "library_peak_list = library_dir / \"glycan_peaklist_KL.csv\"\n",
-    "library_peak_df = pd.read_csv(library_peak_list)\n",
+    "# define the .imzml/.ibd loader object\n",
+    "imz_data = ImzMLParser(data_path, include_spectra_metadata=\"full\")\n",
     "\n",
-    "library_peak_df.head()"
+    "# extract the spectra and threshold array\n",
+    "total_mass_df, thresholds = extraction.extract_spectra(\n",
+    "    imz_data=imz_data, intensity_percentile=intensity_percentile\n",
+    ")\n",
+    "\n",
+    "# display the format and the top few peaks extracted\n",
+    "display(total_mass_df.head())"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Constants"
+    "* `global_intensity_threshold`: the intensity threshold defined by `intensity_percentile`, computed across all intensities extracted. This will be used by the plotting helper functions. Alternatively, you may override with your own parameter"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "intensity_percentile = 99"
-   ]
-  },
-  {
-   "cell_type": "markdown",
    "metadata": {},
+   "outputs": [],
    "source": [
-    "## Spectrum Extraction"
+    "# define the global intensity threshold\n",
+    "global_intensity_threshold = np.percentile(total_mass_df[\"intensity\"].values, intensity_percentile)\n",
+    "print(f\"Global Intensity Threshold: {global_intensity_threshold}\")"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Extract the *m/z* and *intensity* values."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "total_mass_df, thresholds = extraction.extract_spectra(\n",
-    "    imz_data=imz_data, intensity_percentile=intensity_percentile\n",
-    ")"
+    "For additional verification, set `largest_intensity_count` to see the `n` largest intensities."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "tags": []
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "display(total_mass_df)"
+    "largest_intensity_count = 10\n",
+    "total_mass_df.nlargest(largest_intensity_count, [\"intensity\"])"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Global Intensity Threshold\n",
+    "### 2.2. Master library peaks\n",
     "\n",
-    "Display the $n$ largest intensities, as well as the $m$-th intensity percentile, and set that as the *global intensity threshold*."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "largest_intensity_count = 10"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "total_mass_df.nlargest(largest_intensity_count, [\"intensity\"])"
+    "Load the master glycan peak list, this will be used by the library matching process.\n",
+    "\n",
+    "**NOTE: Defining a singular master glycan peaklist is a WIP. For now, ask a lab member for the peak list to use. This file must be manually copied into the `\"libraries\"` subfolder.**"
    ]
   },
   {
@@ -259,22 +235,26 @@
    },
    "outputs": [],
    "source": [
-    "global_intensity_threshold = np.percentile(total_mass_df[\"intensity\"].values, intensity_percentile)\n",
-    "print(f\"Global Intensity Threshold: {global_intensity_threshold}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Peak Detection"
+    "library_peak_list = library_dir / \"glycan_peaklist_KL.csv\"\n",
+    "library_peak_df = pd.read_csv(library_peak_list)\n",
+    "\n",
+    "# visualize the top few master peaks defined\n",
+    "library_peak_df.head()"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Rolling Window Method"
+    "## 3. Peak Analysis\n",
+    "\n",
+    "### 3.1. Define prominence thresholds\n",
+    "\n",
+    "Although various peaks can be identified across a run's spectra, the vast majority of them will be too small to indicate any meaningful signal. To address this, a prominence-based peak filtering method is used (see https://www.mathworks.com/help/signal/ug/prominence.html for a good definition of prominence).\n",
+    "\n",
+    "The first step is to use a rolling window-based approach to extract the prominence thresholds to use for peak filtering.\n",
+    "\n",
+    "* `window_size`: this parameter can be increased or decreased for more or less aggressive peak filtering respectively"
    ]
   },
   {
@@ -294,7 +274,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Plot Intensities"
+    "Visualize how the thresholds you chose affect the peak candidates identified."
    ]
   },
   {
@@ -317,7 +297,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Signal Extraction"
+    "### 3.2. Extract and filter the m/z peaks\n",
+    "\n",
+    "Once you're happy with the prominence thresholds defined in `log_int_percentile`, run the following cell to identify the m/z peaks, which does the following:\n",
+    "\n",
+    "1. A traditional local maxima-based approach applies the first filter\n",
+    "2. For the remaining candidates, the `log_int_percentile` prominence thresholds apply a second filter to remove insignificant peaks"
    ]
   },
   {
@@ -330,16 +315,16 @@
    "source": [
     "peak_candidate_idxs, peak_candidates = extraction.signal_extraction(\n",
     "    total_mass_df=total_mass_df, log_int_percentile=log_int_percentile\n",
-    ")"
+    ")\n",
+    "\n",
+    "print(f\"Candidate Peak Count: {len(peak_candidates)}\")"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "print(f\"Candiate Peak Count: {len(peak_candidates)}\")"
+    "Visualize the discovered peaks."
    ]
   },
   {
@@ -362,7 +347,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Get Peak Widths"
+    "### 3.3. Compute peak widths\n",
+    "\n",
+    "For each peak, compute the corresponding width at 10% of the height defined from the peak's base. This will be necessary for coordinate integration (WIP)."
    ]
   },
   {
@@ -385,16 +372,11 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Save Peak Spectra"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "save_peak_spectra_debug = True"
+    "### 3.4. Save Peak Spectra\n",
+    "\n",
+    "Define the m/z value of each peak, along with their corresponding lower and upper m/z bounds.\n",
+    "\n",
+    "* `save_peak_spectra_debug`: whether to save the corresponding peak spectra graphs to the `\"debug\"` folder. We highly recommend leaving this as `True`."
    ]
   },
   {
@@ -405,6 +387,8 @@
    },
    "outputs": [],
    "source": [
+    "save_peak_spectra_debug = True\n",
+    "\n",
     "panel_df = extraction.peak_spectra(\n",
     "    total_mass_df=total_mass_df,\n",
     "    peak_df=peak_df,\n",
@@ -415,25 +399,20 @@
     "    r_ips_r=r_ips_r,\n",
     "    save_peak_spectra_debug=save_peak_spectra_debug,\n",
     "    debug_dir=debug_dir,\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "panel_df"
+    ")\n",
+    "\n",
+    "display(panel_df.head())"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Integrate Coordinates\n",
+    "## 4. Coordinate Integration\n",
     "\n",
-    "Generate the images and save them in an *xarray*, where the dimensions are: Image (indexed by peak value), $x$, and $y$."
+    "Once peaks have been identified, we need a way of mapping this information back to the slide. Across a coordinate's m/z spectrum, if it is also an identified peak from step 3, we store the corresponding intensity at that coordinate.\n",
+    "\n",
+    "**NOTE: recording the raw intensity is not a traditional coordinate integration technique, which uses AOC. The AOC technique is currently in development.**"
    ]
   },
   {
@@ -447,22 +426,14 @@
     "image_data = extraction.coordinate_integration(peak_df=peak_df, imz_data=imz_data)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "image_data"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Histogram preview of the Intensities of a given Peak\n",
+    "For QC purposes, visualize the intensity distribution around a desired peak\n",
     "\n",
-    "Set a value for `desired_peak_hist` (ideally something from your library) and it'll find the nearest peak, and display a histogram of the intensities of the image with `bin_count` bins."
+    "* `desired_peak_hist`: the peak around where you want to visualize corresponding intensities (ideally something from your library)\n",
+    "* `bin_count`: number of bins to use for the histogram"
    ]
   },
   {
@@ -474,18 +445,16 @@
    "outputs": [],
    "source": [
     "desired_peak_hist = 1809.639659\n",
-    "bin_count = 40"
+    "bin_count = 40\n",
+    "\n",
+    "_ = image_data.sel(peak=[desired_peak_hist], method=\"nearest\").plot.hist(bins=bin_count)"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
+   "cell_type": "markdown",
+   "metadata": {},
    "source": [
-    "image_data.sel(peak=[desired_peak_hist], method=\"nearest\").plot.hist(bins=bin_count)"
+    "Save the integrated intensity images per peak to the `\"extracted\"` folder. For every peak, a float32 and int32 image will be saved to the `\"float\"` and `\"int\"` subdirectories respectively."
    ]
   },
   {
@@ -503,14 +472,11 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Match Glycan Library with Extracted Peaks"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Constants"
+    "## 5. Glycan Library Matching\n",
+    "\n",
+    "While the peaks are useful, they're not particularly useful without knowing what glycan encompasses them. The master glycan library list (`library_peak_df`) defines the list of glycans of interest as well as the m/z value they're centered at. In this way, peak values can be mapped to their corresponding glycan within a tolerance range.\n",
+    "\n",
+    "* `ppm`: the tolerance range. Smaller values will lead to stricter matching and vice versa for larger values."
    ]
   },
   {
@@ -521,20 +487,18 @@
    },
    "outputs": [],
    "source": [
-    "ppm = 100"
+    "ppm = 100\n",
+    "\n",
+    "matched_peaks_df = extraction.library_matching(\n",
+    "    image_xr=image_data, library_peak_df=library_peak_df, ppm=ppm, extraction_dir=extraction_dir\n",
+    ")"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
+   "cell_type": "markdown",
+   "metadata": {},
    "source": [
-    "matched_peaks_df = extraction.library_matching(\n",
-    "    image_xr=image_data, library_peak_df=library_peak_df, ppm=ppm, extraction_dir=extraction_dir\n",
-    ")"
+    "As with the original peak images, the library matched intensity images are also saved, this time to the `\"extracted/library_matched\"` folder. There will likewise be `\"float\"` and `\"int\"` subdirectories containing float32 and int32 representations."
    ]
   },
   {
@@ -554,7 +518,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Core Naming and Cropping"
+    "## 6. Core Naming and Cropping"
    ]
   },
   {
@@ -697,7 +661,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.6"
+   "version": "3.11.10"
   },
   "vscode": {
    "interpreter": {

From 1e3be3398fdf972f2e6cf6371731fc3d7d40e982 Mon Sep 17 00:00:00 2001
From: Alex Kong <alkong@ucdavis.edu>
Date: Thu, 12 Dec 2024 12:10:00 -0800
Subject: [PATCH 02/12] Clean up some documentation

---
 templates/maldi-pipeline.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb
index 5a57e7b..b2188e4 100644
--- a/templates/maldi-pipeline.ipynb
+++ b/templates/maldi-pipeline.ipynb
@@ -474,7 +474,7 @@
    "source": [
     "## 5. Glycan Library Matching\n",
     "\n",
-    "While the peaks are useful, they're not particularly useful without knowing what glycan encompasses them. The master glycan library list (`library_peak_df`) defines the list of glycans of interest as well as the m/z value they're centered at. In this way, peak values can be mapped to their corresponding glycan within a tolerance range.\n",
+    "While the filtered peaks provide meaningful information, they're not particularly useful without knowing what glycan encompasses them. The master glycan library list (`library_peak_df`) defines the list of glycans of interest as well as the m/z value they're centered at. In this way, peak values can be mapped to their corresponding glycan within a tolerance range.\n",
     "\n",
     "* `ppm`: the tolerance range. Smaller values will lead to stricter matching and vice versa for larger values."
    ]

From 8089ae5842e21a033b809eedbb73817b55524f84 Mon Sep 17 00:00:00 2001
From: Alex Kong <alkong@ucdavis.edu>
Date: Thu, 12 Dec 2024 12:22:11 -0800
Subject: [PATCH 03/12] Remove extraneous sentence from
 global_intensity_threshold documentation

---
 templates/maldi-pipeline.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb
index b2188e4..c1576a4 100644
--- a/templates/maldi-pipeline.ipynb
+++ b/templates/maldi-pipeline.ipynb
@@ -185,7 +185,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "* `global_intensity_threshold`: the intensity threshold defined by `intensity_percentile`, computed across all intensities extracted. This will be used by the plotting helper functions. Alternatively, you may override with your own parameter"
+    "* `global_intensity_threshold`: the intensity threshold defined by `intensity_percentile`, computed across all intensities extracted. This is used by the plotting functions."
    ]
   },
   {

From 0baa3e33dd8af2968929bd25d7eb4f009702ff5a Mon Sep 17 00:00:00 2001
From: Alex Kong <alkong@ucdavis.edu>
Date: Thu, 12 Dec 2024 12:25:12 -0800
Subject: [PATCH 04/12] Clarify AOC in-progress

---
 templates/maldi-pipeline.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb
index c1576a4..ae15b4e 100644
--- a/templates/maldi-pipeline.ipynb
+++ b/templates/maldi-pipeline.ipynb
@@ -412,7 +412,7 @@
     "\n",
     "Once peaks have been identified, we need a way of mapping this information back to the slide. Across a coordinate's m/z spectrum, if it is also an identified peak from step 3, we store the corresponding intensity at that coordinate.\n",
     "\n",
-    "**NOTE: recording the raw intensity is not a traditional coordinate integration technique, which uses AOC. The AOC technique is currently in development.**"
+    "**NOTE: recording the raw intensity is not a traditional coordinate integration technique, which normally uses AOC. The AOC technique is currently in development.**"
    ]
   },
   {

From 97e107095414867e4d37e4c2c4826e21dbf46bf0 Mon Sep 17 00:00:00 2001
From: Alex Kong <alkong@ucdavis.edu>
Date: Thu, 12 Dec 2024 12:26:45 -0800
Subject: [PATCH 05/12] Remove redundancy

---
 templates/maldi-pipeline.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb
index ae15b4e..1520264 100644
--- a/templates/maldi-pipeline.ipynb
+++ b/templates/maldi-pipeline.ipynb
@@ -474,7 +474,7 @@
    "source": [
     "## 5. Glycan Library Matching\n",
     "\n",
-    "While the filtered peaks provide meaningful information, they're not particularly useful without knowing what glycan encompasses them. The master glycan library list (`library_peak_df`) defines the list of glycans of interest as well as the m/z value they're centered at. In this way, peak values can be mapped to their corresponding glycan within a tolerance range.\n",
+    "While the filtered peaks provide meaningful information, they're not particularly useful without knowing what glycan encompasses them. The master glycan library list (`library_peak_df`) defines the glycans of interest as well as the m/z value they're centered at. In this way, peak values can be mapped to their corresponding glycan within a tolerance range.\n",
     "\n",
     "* `ppm`: the tolerance range. Smaller values will lead to stricter matching and vice versa for larger values."
    ]

From 4fd18fb66315d358cdb74a56d8e6b28ab5dec5c3 Mon Sep 17 00:00:00 2001
From: Alex Kong <alkong@ucdavis.edu>
Date: Mon, 16 Dec 2024 15:00:58 -0800
Subject: [PATCH 06/12] Update documentation to reflect manual copying of
 directory structure

---
 templates/maldi-pipeline.ipynb | 57 ++++++++++++++++++++++------------
 1 file changed, 37 insertions(+), 20 deletions(-)

diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb
index 1520264..5036aad 100644
--- a/templates/maldi-pipeline.ipynb
+++ b/templates/maldi-pipeline.ipynb
@@ -50,15 +50,13 @@
     "\n",
     "### 1.1. File paths\n",
     "\n",
-    "The following variables are defined pointing to paths in your MALDI run.\n",
+    "The following variables point to paths in your MALDI run. **Only `base_dir` should be changed; the other folders should maintain the existing names and subdirectory structure.**\n",
     "\n",
-    "* `base_dir`: the path to your MALDI data. **Change this to match your run location**.\n",
-    "* `library_dir`: the path inside `base_dir` to your `\"libraries\"` folder. This will contain the master list to use for glycan matching.\n",
-    "* `extraction_dir`: the path inside `base_dir` to your `\"extracted\"` folder. Contains the integrated images for each filtered peak and glycan-matched peak across the slide.\n",
-    "* `debug_dir`: the path inside `base_dir` to your `\"debug\"` folder. Individual peak height and width info is saved here for debugging purposes.\n",
-    "* `imzml_dir`: the path inside `base_dir` to your `\"imzml\"` folder. This will contain the `\".imzml\"` and `\".ibd\"` files extracted from SCiLS.\n",
-    "* `imzml_file`: the name of the `imzml` file saved inside the `imzml_dir`. **Change this to match your imzml file name**.\n",
-    "* `imzml_path`: the full path to the `imzml_file`"
+    "* `base_dir`: the path to your MALDI data.\n",
+    "* `imzml_dir`: the path inside `base_dir` to your `imzml` folder. This will contain the `.imzml` and `.ibd` files extracted from SCiLS.\n",
+    "* `library_dir`: the path inside `base_dir` to your `libraries` folder. This will contain the master list to use for glycan matching.\n",
+    "* `extraction_dir`: the path inside `base_dir` to your `extracted` folder. Contains the integrated images for each filtered peak and glycan-matched peak across the slide.\n",
+    "* `debug_dir`: the path inside `base_dir` to your `debug` folder. Individual peak height and width info is saved here for debugging purposes."
    ]
   },
   {
@@ -70,13 +68,10 @@
    "outputs": [],
    "source": [
     "base_dir = pathlib.Path(\"../data/panc2055\")\n",
-    "library_dir = base_dir / \"libraries\"\n",
-    "extraction_dir = base_dir / data_name / \"extracted\"\n",
-    "debug_dir = base_dir / data_name / \"debug\"\n",
-    "\n",
     "imzml_dir = base_dir / \"imzml\"\n",
-    "imzml_file = \"panc2055.imzML\"\n",
-    "imzml_path = imzml_dir / data_file"
+    "library_dir = base_dir / \"libraries\"\n",
+    "extraction_dir = base_dir / \"output\" / \"extracted\"\n",
+    "debug_dir = base_dir / \"output\" / \"debug\""
    ]
   },
   {
@@ -95,11 +90,35 @@
    "outputs": [],
    "source": [
     "# Create directories\n",
-    "for directory in [base_dir, library_dir, extraction_dir, debug_dir]:\n",
+    "for directory in [base_dir, imzml_dir, library_dir, extraction_dir, debug_dir]:\n",
     "    if not os.path.exists(directory):\n",
     "        directory.mkdir(parents=True, exist_ok=True)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**NOTE: At this point, ensure you have the following files in your folders in the following locations:**\n",
+    "\n",
+    "* .imzml/.ibd files: extracted using SCiLS. **Either explicitly point to the `imzml` subfolder when extracting, or manually copy these files to the `imzml` subfolder afterwards.**\n",
+    "* Master glycan list: defining a singular master glycan list is a WIP. For now, ask a lab member for the peak list to use. **This file must be manually copied into the `\"libraries\"` subfolder.**\n",
+    "\n",
+    "And define the following variable:\n",
+    "\n",
+    "* `imzml_file`: the name of the `.imzml` file in the `imzml` subfolder."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "imzml_file = \"panc2055.imzml\"\n",
+    "imzml_path = imzml_dir / imzml_file"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -170,7 +189,7 @@
    "outputs": [],
    "source": [
     "# define the .imzml/.ibd loader object\n",
-    "imz_data = ImzMLParser(data_path, include_spectra_metadata=\"full\")\n",
+    "imz_data = ImzMLParser(imzml_path, include_spectra_metadata=\"full\")\n",
     "\n",
     "# extract the spectra and threshold array\n",
     "total_mass_df, thresholds = extraction.extract_spectra(\n",
@@ -222,9 +241,7 @@
    "source": [
     "### 2.2. Master library peaks\n",
     "\n",
-    "Load the master glycan peak list, this will be used by the library matching process.\n",
-    "\n",
-    "**NOTE: Defining a singular master glycan peaklist is a WIP. For now, ask a lab member for the peak list to use. This file must be manually copied into the `\"libraries\"` subfolder.**"
+    "Load the master glycan peak list; this will be used by the library matching process."
    ]
   },
   {
@@ -661,7 +678,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.10"
+   "version": "3.11.9"
   },
   "vscode": {
    "interpreter": {

From 2c710bc8d534ab18dfe7701b81515bda54444a9e Mon Sep 17 00:00:00 2001
From: Alex Kong <alkong@ucdavis.edu>
Date: Mon, 16 Dec 2024 15:02:08 -0800
Subject: [PATCH 07/12] Additional touchups

---
 templates/maldi-pipeline.ipynb | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb
index 5036aad..01a326e 100644
--- a/templates/maldi-pipeline.ipynb
+++ b/templates/maldi-pipeline.ipynb
@@ -99,10 +99,10 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "**NOTE: At this point, ensure you have the following files in your folders in the following locations:**\n",
+    "**NOTE: At this point, ensure you have the following files in the following locations:**\n",
     "\n",
-    "* .imzml/.ibd files: extracted using SCiLS. **Either explicitly point to the `imzml` subfolder when extracting, or manually copy these files to the `imzml` subfolder afterwards.**\n",
-    "* Master glycan list: defining a singular master glycan list is a WIP. For now, ask a lab member for the peak list to use. **This file must be manually copied into the `\"libraries\"` subfolder.**\n",
+    "* **`.imzml`/`.ibd` files**: extracted using SCiLS. **Either explicitly point to the `imzml` subfolder when extracting, or manually copy these files to the `imzml` subfolder afterwards.**\n",
+    "* **Master glycan list**: defining a singular master glycan list is a WIP. For now, ask a lab member for the peak list to use. **This file must be manually copied into the `libraries` subfolder.**\n",
     "\n",
     "And define the following variable:\n",
     "\n",

From b934482da04771c6ee96625ab578e64ff1b73373 Mon Sep 17 00:00:00 2001
From: Alex Kong <alkong@ucdavis.edu>
Date: Mon, 16 Dec 2024 15:05:55 -0800
Subject: [PATCH 08/12] OCD-level fixes

---
 templates/maldi-pipeline.ipynb | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb
index 01a326e..af34d2f 100644
--- a/templates/maldi-pipeline.ipynb
+++ b/templates/maldi-pipeline.ipynb
@@ -393,7 +393,7 @@
     "\n",
     "Define the m/z value of each peak, along with their corresponding lower and upper m/z bounds.\n",
     "\n",
-    "* `save_peak_spectra_debug`: whether to save the corresponding peak spectra graphs to the `\"debug\"` folder. We highly recommend leaving this as `True`."
+    "* `save_peak_spectra_debug`: whether to save the corresponding peak spectra graphs to the `debug` folder. We highly recommend leaving this as `True`."
    ]
   },
   {
@@ -471,7 +471,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Save the integrated intensity images per peak to the `\"extracted\"` folder. For every peak, a float32 and int32 image will be saved to the `\"float\"` and `\"int\"` subdirectories respectively."
+    "Save the integrated intensity images per peak to the `output/extracted` folder. For every peak, a float32 and int32 image will be saved to the `float` and `int` subdirectories respectively."
    ]
   },
   {
@@ -515,7 +515,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "As with the original peak images, the library matched intensity images are also saved, this time to the `\"extracted/library_matched\"` folder. There will likewise be `\"float\"` and `\"int\"` subdirectories containing float32 and int32 representations."
+    "As with the original peak images, the library matched intensity images are also saved, this time to the `output/library_matched` folder. There will likewise be `float` and `int` subdirectories containing float32 and int32 representations."
    ]
   },
   {

From 41ef0867399ac3f175558635805add4b9d43d0ff Mon Sep 17 00:00:00 2001
From: Alex Kong <alkong@ucdavis.edu>
Date: Mon, 16 Dec 2024 15:07:52 -0800
Subject: [PATCH 09/12] Ensure the library matched folder is pointed to
 correctly

---
 templates/maldi-pipeline.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb
index af34d2f..80eef3e 100644
--- a/templates/maldi-pipeline.ipynb
+++ b/templates/maldi-pipeline.ipynb
@@ -515,7 +515,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "As with the original peak images, the library matched intensity images are also saved, this time to the `output/library_matched` folder. There will likewise be `float` and `int` subdirectories containing float32 and int32 representations."
+    "As with the original peak images, the library matched intensity images are also saved, this time to the `output/extracted/library_matched` folder. There will likewise be `float` and `int` subdirectories containing float32 and int32 representations."
    ]
   },
   {

From d6f36b6b176a7d36a91a49723ecf31d020e3b91b Mon Sep 17 00:00:00 2001
From: Alex Kong <alkong@ucdavis.edu>
Date: Tue, 17 Dec 2024 13:31:01 -0800
Subject: [PATCH 10/12] Fixes implemented after first presentation run through

---
 templates/maldi-pipeline.ipynb | 59 +++++++++-------------------------
 1 file changed, 15 insertions(+), 44 deletions(-)

diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb
index 80eef3e..fab3580 100644
--- a/templates/maldi-pipeline.ipynb
+++ b/templates/maldi-pipeline.ipynb
@@ -148,23 +148,12 @@
     "plt.rcParams[\"figure.constrained_layout.use\"] = False"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 1.3. Intensity percentile\n",
-    "\n",
-    "Define the percentile to use for thresholding intensity values."
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "intensity_percentile = 99"
-   ]
+   "source": []
   },
   {
    "cell_type": "markdown",
@@ -196,26 +185,13 @@
     "    imz_data=imz_data, intensity_percentile=intensity_percentile\n",
     ")\n",
     "\n",
-    "# display the format and the top few peaks extracted\n",
-    "display(total_mass_df.head())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "* `global_intensity_threshold`: the intensity threshold defined by `intensity_percentile`, computed across all intensities extracted. This is used by the plotting functions."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
     "# define the global intensity threshold\n",
+    "intensity_percentile = 99\n",
     "global_intensity_threshold = np.percentile(total_mass_df[\"intensity\"].values, intensity_percentile)\n",
-    "print(f\"Global Intensity Threshold: {global_intensity_threshold}\")"
+    "print(f\"Global Intensity Threshold: {global_intensity_threshold}\")\n",
+    "\n",
+    "# display the format and the top few peaks extracted\n",
+    "display(total_mass_df.head())"
    ]
   },
   {
@@ -269,9 +245,7 @@
     "\n",
     "Although various peaks can be identified across a run's spectra, the vast majority of them will be too small to indicate any meaningful signal. To address this, a prominence-based peak filtering method is used (see https://www.mathworks.com/help/signal/ug/prominence.html for a good definition of prominence).\n",
     "\n",
-    "The first step is to use a rolling window-based approach to extract the prominence thresholds to use for peak filtering.\n",
-    "\n",
-    "* `window_size`: this parameter can be increased or decreased for more or less aggressive peak filtering respectively"
+    "The first step is to use a rolling window-based approach to extract the prominence thresholds to use for peak filtering."
    ]
   },
   {
@@ -427,9 +401,7 @@
    "source": [
     "## 4. Coordinate Integration\n",
     "\n",
-    "Once peaks have been identified, we need a way of mapping this information back to the slide. Across a coordinate's m/z spectrum, if it is also an identified peak from step 3, we store the corresponding intensity at that coordinate.\n",
-    "\n",
-    "**NOTE: recording the raw intensity is not a traditional coordinate integration technique, which normally uses AOC. The AOC technique is currently in development.**"
+    "Once peaks have been identified, we need a way of mapping this information back to the slide. Across a coordinate's m/z spectrum, if it is also an identified peak from step 3, we store the corresponding intensity at that coordinate."
    ]
   },
   {
@@ -471,7 +443,10 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Save the integrated intensity images per peak to the `\"extracted\"` folder. For every peak, a float32 and int32 image will be saved to the `float` and `int` subdirectories respectively."
+    "Save the integrated intensity images per peak to the `extracted` folder. For every peak, a float32 and int32 image will be saved to the `float` and `int` subdirectories respectively.\n",
+    "\n",
+    "* The `float` images should be used for quantitative downstream analysis\n",
+    "* The `int` images are saved for visualization, as they're more compatible with most image viewers"
    ]
   },
   {
@@ -491,9 +466,7 @@
    "source": [
     "## 5. Glycan Library Matching\n",
     "\n",
-    "While the filtered peaks provide meaningful information, they're not particularly useful without knowing what glycan encompasses them. The master glycan library list (`library_peak_df`) defines the glycans of interest as well as the m/z value they're centered at. In this way, peak values can be mapped to their corresponding glycan within a tolerance range.\n",
-    "\n",
-    "* `ppm`: the tolerance range. Smaller values will lead to stricter matching and vice versa for larger values."
+    "While the filtered peaks provide meaningful information, they're not particularly useful without knowing what glycan encompasses them. The master glycan library list (`library_peak_df`) defines the glycans of interest as well as the m/z value they're centered at. In this way, peak values can be mapped to their corresponding glycan within a tolerance range."
    ]
   },
   {
@@ -504,10 +477,8 @@
    },
    "outputs": [],
    "source": [
-    "ppm = 100\n",
-    "\n",
     "matched_peaks_df = extraction.library_matching(\n",
-    "    image_xr=image_data, library_peak_df=library_peak_df, ppm=ppm, extraction_dir=extraction_dir\n",
+    "    image_xr=image_data, library_peak_df=library_peak_df, ppm=100, extraction_dir=extraction_dir\n",
     ")"
    ]
   },
@@ -678,7 +649,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.11.10"
   },
   "vscode": {
    "interpreter": {

From e319b8e683eb92086191611807bce277d1086bdb Mon Sep 17 00:00:00 2001
From: Alex Kong <alkong@ucdavis.edu>
Date: Tue, 17 Dec 2024 13:34:01 -0800
Subject: [PATCH 11/12] Change default ppm parameter to 50

---
 templates/maldi-pipeline.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb
index fab3580..832a68d 100644
--- a/templates/maldi-pipeline.ipynb
+++ b/templates/maldi-pipeline.ipynb
@@ -478,7 +478,7 @@
    "outputs": [],
    "source": [
     "matched_peaks_df = extraction.library_matching(\n",
-    "    image_xr=image_data, library_peak_df=library_peak_df, ppm=100, extraction_dir=extraction_dir\n",
+    "    image_xr=image_data, library_peak_df=library_peak_df, ppm=50, extraction_dir=extraction_dir\n",
     ")"
    ]
   },

From 036887a25b74199213a84afe4b609de83d8e5ea1 Mon Sep 17 00:00:00 2001
From: Alex Kong <alkong@ucdavis.edu>
Date: Tue, 17 Dec 2024 13:35:23 -0800
Subject: [PATCH 12/12] Fix file path of the imzml_file

---
 templates/maldi-pipeline.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb
index 832a68d..af3318f 100644
--- a/templates/maldi-pipeline.ipynb
+++ b/templates/maldi-pipeline.ipynb
@@ -116,7 +116,7 @@
    "outputs": [],
    "source": [
     "imzml_file = \"panc2055.imzml\"\n",
-    "imzml_path = imzml_file / imzml_path"
+    "imzml_path = imzml_dir / imzml_file"
    ]
   },
   {