diff --git a/data_curation/README.md b/data_curation/README.md new file mode 100644 index 000000000..b9160e282 --- /dev/null +++ b/data_curation/README.md @@ -0,0 +1,76 @@ +The scripts in this directory are used to organize datasets before importing into a PostgreSQL database for the SPT application or analysis. + +1. [Doing import / upload](#doing-import--upload) +2. [Doing import / upload for just one dataset](#doing-import--upload-for-just-one-dataset) +3. [Import without using the wrapper](#import-without-using-the-wrapper) +4. [Show all progress](#show-all-progress) + +Datasets are stored in subdirectories of `datasets/`. The procedure for adding a new dataset is documented in [`datasets/template/README.md`](datasets/template/README.md). + +## Doing import / upload +A script which does a mass import of all available datasets is provided here as `import_datasets.sh`. It assumes that the [`spatialprofilingtoolbox` Python package](https://pypi.org/project/spatialprofilingtoolbox/) has been installed. + +The usage is, for example: +```bash +./import_datasets.sh ~/.spt_db.config.local --drop-first +``` +- `~/.spt_db.config.local` is an example name of a [database configuration file](https://github.com/nadeemlab/SPT/blob/main/spatialprofilingtoolbox/workflow/assets/.spt_db.config.template). +- The `--drop-first` option causes dropping/deleting a dataset with the same study name as one which is about to be uploaded. Without this option, upload will only take place if the dataset is not already in the database. + +## Doing import / upload for just one dataset +For example: + +```bash +./import_datasets.sh ~/.spt_db.config.local --drop-first moldoveanu +``` +or +```bash +./import_datasets.sh ~/.spt_db.config.local --no-drop-first moldoveanu +``` + +## Import without using the wrapper +The import-all-datasets-here script is provided for convenience only, as a wrapper around `spt` CLI commands. 
+ +For one dataset you may prefer to use your own custom script templated on the following: + +```bash +mkdir rundir; cd rundir +spt workflow configure --workflow="tabular import" --config-file=workflow.config +./run.sh +``` + +For the above, the `workflow.config` file should look something like this: +```ini +[general] +db_config_file = /Users/username/.spt_db.config.local + +[database visitor] +study_name = Melanoma CyTOF ICI + +[tabular import] +input_path = datasets/moldoveanu/generated_artifacts +``` + +If you wish for Nextflow to pull directly from S3, rather than a local directory like `.../generated_artifacts`, `workflow.config` may look like this instead: + +```ini +[general] +db_config_file = /Users/username/.spt_db.config.local + +[database visitor] +study_name = Melanoma CyTOF ICI + +[tabular import] +input_path = s3://bucketname/moldoveanu +``` + +In the S3 case, you would have to make sure that credentials are available. Currently Nextflow requires, in the case of session-specific credentials, a "profile" in `~/.aws/credentials`, usually the profile named `default`. + +You can monitor progress by watching the Nextflow logs: + +```bash +tail -f -n1000 work/*/*/.command.log +``` + +## Show all progress +By default `import_datasets.sh` is parallelized at the per-dataset level. To see basic progress across all datasets, use `./show_progress.sh`. 
diff --git a/data_curation/convenience_scripts/__init__.py b/data_curation/convenience_scripts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/data_curation/convenience_scripts/bimodality_assessor.py b/data_curation/convenience_scripts/bimodality_assessor.py new file mode 100644 index 000000000..abf9aa356 --- /dev/null +++ b/data_curation/convenience_scripts/bimodality_assessor.py @@ -0,0 +1,171 @@ +""" +See gist: + +https://gist.github.com/jimmymathews/ca0a03d04dcc7265eac55a66ec20d67a +""" + +import warnings + +import numpy as np +import pandas as pd +from numpy.typing import ArrayLike, NDArray + +from sklearn.mixture import GaussianMixture # type: ignore +from sklearn.exceptions import ConvergenceWarning # type: ignore + +warnings.simplefilter('error', ConvergenceWarning) + + +class BimodalityAssessor: + """Assess bimodality for a univariate feature.""" + + def __init__(self, feature_values: ArrayLike, quiet: bool = False) -> None: + self.quiet = quiet + self._initialize_estimator() + self._record_feature_values(feature_values) + self._attempt_fitting() + + def _initialize_estimator(self) -> None: + self.estimator = GaussianMixture( + n_components=BimodalityAssessor._number_populations(), + max_iter=20, + random_state=0, + ) + self.estimator.means_init = np.array([[-1], [1]]) + + def _record_feature_values(self, feature_values: ArrayLike) -> None: + self.feature_array = BimodalityAssessor._convert_to_tall_numpy(feature_values) + self.feature_values = feature_values + + def _attempt_fitting(self) -> None: + try: + self.get_estimator().fit(self.get_feature_array()) + self.convergence_failure = False + except ConvergenceWarning: + if not self.quiet: + print('Gaussian mixture model estimation failed to converge.') + self.convergence_failure = True + + def failed_to_converge(self) -> bool: + return self.convergence_failure + + def get_feature_assignment_table_markdown(self) -> str: + return self.get_feature_assignment_table().to_markdown() + + 
def get_feature_assignment_table(self) -> pd.DataFrame: + dataframe = pd.DataFrame({ + 'Feature': self.get_feature_values_list(), + 'GMM likelihood-based population label': self.get_dichotomized_feature(), + }) + return dataframe + + def get_dichotomized_feature(self, use_threshold: bool = False, original = None) -> list[int]: + if use_threshold: + threshold = self.infer_effective_threshold(weighted_mean=True) + feature_values = self.feature_values if original is None else original + return [ + 1 if value >= threshold else 0 + for value in feature_values + ] + return self.get_estimator().predict(self.get_feature_array()) + + def infer_effective_threshold(self, weighted_mean: bool = False) -> float | str: + if weighted_mean: + means = self.get_means() + deviations = self.get_standard_deviations() + weights = [1/d for d in deviations] + return (means[0]*weights[0] + means[1]*weights[1]) / (weights[0] + weights[1]) + dichotomized_feature = self.get_dichotomized_feature() + if len(set(dichotomized_feature)) == 1: + return 'Only 1 label' + pairs = sorted([ + (self.get_feature_values_list()[i], dichotomized_feature[i]) + for i in range(len(dichotomized_feature)) + ], key=lambda pair: (pair[1], pair[0])) + lower_limit = None + upper_limit = None + inconsistent = False + for i in range(len(pairs) - 1): + feature_value, discrete_value = pairs[i] + next_feature_value, next_discrete_value = pairs[i + 1] + if (discrete_value == 0 and next_discrete_value == 1) or (discrete_value == 1 and next_discrete_value == 0): + if (lower_limit is None) and (upper_limit is None): + lower_limit = feature_value + upper_limit = next_feature_value + else: + inconsistent = True + if inconsistent: + print('\n'.join([str(p) for p in pairs])) + return 'Assignments inconsistent with thresholding' + if (lower_limit is None) and (upper_limit is None): + return 'No threshold behavior detected somehow' + return (lower_limit + upper_limit) / 2 + + def get_number_of_errors(self, answers: list[int]) -> 
int: + dichotomized_feature = self.get_dichotomized_feature() + number_errors = sum( + 1 + for i in range(len(dichotomized_feature)) + if dichotomized_feature[i] == answers[i] + ) + return min(number_errors, len(answers) - number_errors) + + def get_weights(self) -> list[float]: + return list(self.get_estimator().weights_) + + @classmethod + def _number_populations(cls) -> int: + return 2 + + @classmethod + def _convert_to_tall_numpy(cls, feature_values: ArrayLike) -> NDArray[np.float64]: + return np.array(feature_values).reshape(-1, 1) + + def get_estimator(self) -> GaussianMixture: + return self.estimator + + def get_feature_array(self) -> NDArray[np.float64]: + return self.feature_array + + def get_feature_values_list(self) -> list[float]: + return self.get_feature_array().transpose()[0] + + def get_average_mahalanobis_distance(self) -> float: + means = self.get_means() + standard_deviations = self.get_standard_deviations() + distance1 = abs(means[0] - means[1]) / standard_deviations[0] + distance2 = abs(means[1] - means[0]) / standard_deviations[1] + weights = self.get_weights() + return weights[0] * weights[1] * (distance1 + distance2) / 2 + + def get_means(self) -> list[float]: + return list(self.get_estimator().means_) + + def get_standard_deviations(self) -> list[float]: + return [np.sqrt(s) for s in self.get_estimator().covariances_] + + +def create_bimodal_vector(s: pd.Series, downsample: int | None = None, quiet: bool = False) -> pd.Series: + """Create a bimodal vector from a univariate feature. + + Parameters + ---------- + s : pd.Series + A univariate feature. 
+ """ + if downsample is not None: + subsample = s.sample(min(downsample, len(s))) + else: + subsample = s + assessor = BimodalityAssessor(subsample, quiet=quiet) + quality = assessor.get_average_mahalanobis_distance() + if quality >= 0.5: + return assessor.get_dichotomized_feature(use_threshold=True, original=s) + threshold = np.nanmean(s) + + def thresholding(value): + if np.isnan(value): + return 0 + return 1 if value >= threshold else 0 + + return s.apply(thresholding) diff --git a/data_curation/convenience_scripts/configure_all_imports.sh b/data_curation/convenience_scripts/configure_all_imports.sh new file mode 100755 index 000000000..76c84a57f --- /dev/null +++ b/data_curation/convenience_scripts/configure_all_imports.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +source convenience_scripts/verifications_and_configurations.sh + +dbconfigargument="$1" +dbconfig=$(handle_dbconfig_argument $dbconfigargument) + +check_exists_else_fail "$dbconfig" +check_for_spt_availability +create_run_directory + +one_inclusion="$2" +if [[ "$one_inclusion" == "" ]]; +then + one_inclusion="ALL_DATASET_DIRECTORY_HANDLES" +fi + +SECONDS=0 +if [[ "$one_inclusion" == "ALL_DATASET_DIRECTORY_HANDLES" ]]; +then + datasets=$(get_available_dataset_handles) +else + datasets=$(get_available_dataset_handles | grep -o "$one_inclusion") +fi +create_run_directories_for_datasets "$datasets" $PWD +configure_run_directories_for_datasets "$datasets" $PWD + +echo "Run directory structure:" +tree -L 3 runs/ +echo "" + +echo "Configuration took $SECONDS seconds ("$(( SECONDS / 60 ))" minutes)." 
source convenience_scripts/verifications_and_configurations.sh

# Print the database name for a dataset: its study name, lowercased, with
# spaces replaced by underscores.
#   $1: dataset subdirectory handle   $2: base directory
function get_dbname {
    dataset_subdirectory_handle="$1"
    base_directory="$2"
    handle=$(get_study_handle "$dataset_subdirectory_handle" "$base_directory")
    echo "$handle" | tr '[:upper:]' '[:lower:]' | tr ' ' '_'
}

# Print the database configuration file name derived from argument $1.
function get_dbconfig_argument {
    dbconfigargument="$1"
    dbconfig=$(handle_dbconfig_argument "$dbconfigargument")
    echo "$dbconfig"
}

# Print character $1 exactly $2 times; prints nothing when $2 is zero or negative.
function _repeat_character {
    local character="$1"
    local count="$2"
    local i
    for (( i = 0; i < count; i++ )); do
        printf '%s' "$character"
    done
}

# Print a one-line progress bar for the import running in run directory $1.
# '=' marks completed items and '_' marks remaining ones, both read from the
# Nextflow command logs. $2 is the number of columns already used on the line.
function compute_progress {
    run_directory="$1"
    prefix_size=$2
    # Logs may not exist yet; suppress the error so it cannot corrupt the display.
    manifests=$(cat "$run_directory"/work/*/*/.command.log 2>/dev/null | grep -o 'Number of cell manifest files: [0-9]\+' | grep -o '[0-9]\+' | tail -n1)
    completed=$(cat "$run_directory"/work/*/*/.command.log 2>/dev/null | grep -o 'Performance report [0-9]\+' | grep -o '[0-9]\+' | sort -n | tail -n1)
    if [[ "$manifests" == "" ]]; then manifests=0; fi
    if [[ "$completed" == "" ]]; then completed=0; fi
    remaining=$(( manifests - completed ))
    width=$(( $(tput cols) - prefix_size ))
    # Scale the bar down when it would not fit in the remaining terminal width.
    if (( manifests > 0 && manifests > width )); then
        completed=$(( completed * width / manifests ))
        remaining=$(( remaining * width / manifests ))
    fi
    # A loop is used instead of `printf ... $(seq 1 N)`, which emits a
    # character even when N is 0.
    echo "$(_repeat_character '=' "$completed")$(_repeat_character '_' "$remaining")"
}

# Continuously redraw one progress line per dataset named in the arguments.
# Runs until interrupted.
function show_progress_of {
    lines=$#
    go_up="\033[F"
    # Reserve one terminal line per dataset.
    for (( line = 1; line <= lines; line++ )); do
        echo ''
    done
    while true; do
        # Move the cursor back to the first reserved line, then redraw.
        for (( line = 1; line <= lines; line++ )); do
            printf "$go_up"
        done
        for (( line = 1; line <= lines; line++ )); do
            dataset=${!line}
            run_directory=runs/$dataset/import
            echo -n "$dataset: "
            compute_progress "$run_directory" $(( ${#dataset} + 3 ))
        done
        sleep 4
    done
}

# Show progress for every dataset that has a directory under runs/.
function show_progress {
    datasets=$(ls runs/)
    show_progress_of $datasets
}

# Import each dataset in the whitespace-separated list $1, in parallel.
#   $2: base directory   $3: "yes" to drop an existing study first
#   $4: database configuration file
function import_datasets {
    datasets="$1"
    basedirectory="$2"
    drop_first="$3"
    dbconfig="$4"
    for dataset in $datasets; do
        handle=$(get_study_handle "$dataset" "$basedirectory")
        if [[ "$drop_first" == "yes" ]]; then
            echo "Dropping $handle."
            spt db drop --study-name="$handle" --database-config-file="$dbconfig"
        fi
        echo "Importing: $dataset ($handle)"
        import_dataset "$dataset" "$basedirectory" &
        sleep 2
    done
    wait
}

# Run the configured import workflow for dataset $1 under base directory $2.
function import_dataset {
    dataset="$1"
    basedirectory="$2"
    rundirectory=$(get_run_directory "$dataset" "$basedirectory")
    # Do not run ./run.sh from the wrong place if the run directory is missing.
    cd "$rundirectory" || return 1
    echo "Doing configured SPT run (tabular import) in $rundirectory ."
    ./run.sh
    cd "$basedirectory"
}

# Dump the default_study_lookup database to a dated .sqldump file.
function dump_metaschema_db {
    dbname=default_study_lookup
    # `date` is used rather than printf's %(...)T, which needs bash >= 4.2.
    formatted_date=$(date +'%Y_%m_%d')
    pg_dump -h localhost -U postgres -Fc -O -x "$dbname" > "$dbname.$formatted_date.sqldump"
}

# Dump the database of dataset $1 (base directory $2) to a dated .sqldump file.
function dump_dataset {
    dataset_subdirectory_handle="$1"
    base_directory="$2"
    dbname=$(get_dbname "$dataset_subdirectory_handle" "$base_directory")
    formatted_date=$(date +'%Y-%m-%d')
    pg_dump -h localhost -U postgres -Fc -O -x "$dbname" > "$dbname.$formatted_date.sqldump"
}

# Dump the database of every dataset that has a directory under runs/.
function dump_all_datasets {
    base_directory="$PWD"
    datasets=$(ls runs/)
    for dataset in $datasets; do
        dump_dataset "$dataset" "$base_directory"
    done
}

# Restore dump file $2 using the endpoint, user and password found in database
# configuration file $3. $1 (database name) is accepted but not used.
function restore_db {
    dbname="$1"
    filename="$2"
    database_config_file="$3"
    host=$(grep 'endpoint' "$database_config_file" | sed 's/endpoint = //g')
    user=$(grep 'user ' "$database_config_file" | sed 's/user = //g')
    password=$(grep 'password ' "$database_config_file" | sed 's/password = //g')
    # Report exactly the command that is run, without the password.
    echo "Command is: PGPASSWORD=<redacted> pg_restore -v -x -O -C -j 4 -h $host -U $user -d postgres $filename"
    PGPASSWORD="$password" pg_restore -v -x -O -C -j 4 -h "$host" -U "$user" -d postgres "$filename"
}

# Print the leading run of lowercase letters and underscores of $1, which is
# the database name part of a dump filename.
function extract_dbname {
    echo "$1" | grep -o '^[a-z_]\+'
}

# Restore every *.sqldump file in the current directory, using database
# configuration file $1.
function restore_all {
    database_config_file="$1"
    for filename in *.sqldump; do
        # With no matches the pattern is left unexpanded; skip it.
        [[ -e "$filename" ]] || continue
        echo "Restoring $filename"
        dbname=$(extract_dbname "$filename")
        restore_db "$dbname" "$filename" "$database_config_file"
    done
}
# Print the 'Study name' field of a dataset's generated study.json.
#   $1: dataset subdirectory handle   $2: base directory
function get_study_handle {
    dataset_subdirectory_handle="$1"
    base_directory="$2"
    study_file="$base_directory/datasets/$dataset_subdirectory_handle/generated_artifacts/study.json"
    # The path is passed as an argument rather than pasted into the Python
    # source, so quotes or backslashes in it cannot break the snippet.
    python -c "
import json
import sys
with open(sys.argv[1], 'rt', encoding='utf-8') as file:
    print(json.loads(file.read())['Study name'])
" "$study_file"
}

# Exit the shell with status 1 unless $1 is an existing regular file.
function check_exists_else_fail {
    echo -n "Checking for file $1 ... "
    if [[ ! -f "$1" ]]; then
        echo "does not exist."
        exit 1
    fi
    echo "exists."
}

# Print the database configuration file name $1 on stdout; the notice goes to
# stderr so that callers can capture stdout.
function handle_dbconfig_argument {
    dbconfig="$1"
    echo "Found database configuration file: $dbconfig" >&2
    echo "$dbconfig"
}

# Exit the shell with status 1 if the spt command is not on the PATH.
function check_for_spt_availability {
    location=$(command -v spt)
    if [[ "$location" == "" ]]; then
        echo "spt command is not available."
        exit 1
    fi
}

# Create runs/ in the current directory if it does not exist.
function create_run_directory {
    if [[ ! -d runs/ ]]; then
        mkdir runs
        echo "Created runs/ ."
    fi
}

# Print the handle of each directory under datasets/ that contains
# generated_artifacts/file_manifest.tsv, skipping names containing 'template'.
function get_available_dataset_handles_unsorted {
    handles=$(ls -1 datasets | grep -v 'template')
    for handle in $handles; do
        if [[ -f "datasets/$handle/generated_artifacts/file_manifest.tsv" ]]; then
            echo "$handle "
        fi
    done
}

function get_available_dataset_handles {
    get_available_dataset_handles_unsorted | sort
}

# Print the handle of each directory under runs/ that contains import/run.sh.
function get_configured_run_handles_unsorted {
    if [[ ! -d runs ]]; then
        return
    fi
    handles=$(ls -1 runs/)
    for handle in $handles; do
        if [[ -f "runs/$handle/import/run.sh" ]]; then
            echo "$handle "
        fi
    done
}

# Print the configured run handles, sorted, on a single line.
function get_configured_run_handles {
    get_configured_run_handles_unsorted | sort | tr '\n' ' '
}

# Print the import run directory for dataset $1 under base directory $2.
function get_run_directory {
    dataset="$1"
    basedirectory="$2"
    echo "$basedirectory/runs/$dataset/import"
}

# Print the parent of the import run directory for dataset $1.
function get_run_directory_parent {
    dataset="$1"
    basedirectory="$2"
    echo "$basedirectory/runs/$dataset"
}

# Create run directories for each dataset in the whitespace-separated list $1.
function create_run_directories_for_datasets {
    datasets="$1"
    basedirectory="$2"
    for dataset in $datasets; do
        create_run_directory_for_dataset "$dataset" "$basedirectory"
    done
}

# Create runs/<dataset> and runs/<dataset>/import under base directory $2.
function create_run_directory_for_dataset {
    dataset="$1"
    basedirectory="$2"
    parentdirectory=$(get_run_directory_parent "$dataset" "$basedirectory")
    if [[ ! -d "$parentdirectory" ]]; then
        echo "Creating run directory for '$dataset'."
        mkdir "$parentdirectory"
    fi
    rundirectory=$(get_run_directory "$dataset" "$basedirectory")
    if [[ ! -d "$rundirectory" ]]; then
        echo "Creating import run directory for '$dataset'."
        mkdir "$rundirectory"
    fi
}

# Configure the run directory of each dataset in the whitespace-separated list $1.
function configure_run_directories_for_datasets {
    datasets="$1"
    basedirectory="$2"
    for dataset in $datasets; do
        configure_run_directory_for_dataset "$dataset" "$basedirectory"
    done
}

# Write workflow.config into the import run directory of dataset $1 and run
# `spt workflow configure` there. Exits the shell with status 1 if the
# dataset has no generated_artifacts directory.
# NOTE: the variable dbconfig is not a parameter; it must already be set by
# the calling script.
function configure_run_directory_for_dataset {
    dataset="$1"
    basedirectory="$2"
    inputpath="$basedirectory/datasets/$dataset/generated_artifacts"
    if [[ ! -d "$inputpath" ]]; then
        echo "Path $inputpath does not exist."
        exit 1
    fi
    rundirectory=$(get_run_directory "$dataset" "$basedirectory")
    studyname=$(get_study_handle "$dataset" "$basedirectory")

    # Stop here if the run directory is missing, so the rm below cannot
    # delete files from some other directory.
    cd "$rundirectory" || exit 1
    echo "Configuring run in $rundirectory ."
    rm -f configure.sh run.sh nextflow.config main.nf workflow.config

    {
        echo "[general]"
        echo "db_config_file = $dbconfig"
        echo ""
        echo "[database visitor]"
        echo "study_name = $studyname"
        echo ""
        echo "[tabular import]"
        echo "input_path = $inputpath"
    } >> workflow.config

    spt workflow configure --workflow="tabular import" --config-file=workflow.config
    cd "$basedirectory"
}
def aggregate_cell(group: DataFrame, channel_name: str) -> float:
    """Return the mean of one channel over a cell's pixels, ignoring NaN.

    Returns 0.0 when the channel has no non-NaN value for this cell.
    """
    with warnings.catch_warnings():
        # nanmean warns when every value is NaN; that case is handled below.
        warnings.simplefilter("ignore", category=RuntimeWarning)
        value = float(nanmean(group[channel_name]))
    if isnan(value):
        # Return a float, as annotated, rather than the integer 0.
        return 0.0
    return value


def aggregate_cell_position(group: DataFrame, which: int) -> float:
    """Return the mean 'Column' (which=0) or 'Row' (which=1) of a cell's pixels."""
    columns = ['Column', 'Row']
    column = columns[which]
    return float(nanmean(group[column]))


class Aggregator:
    """Aggregate channel and position information over cell segments."""

    def __init__(self, channel_names: list[str]):
        self.channel_names = channel_names

    def aggregate(self, group: DataFrame) -> Series:
        """Summarize the pixels of one cell segment.

        The mean pixel position is reported as both the minimum and the
        maximum of each axis, so each cell is represented by a single point.

        Raises
        ------
        ValueError
            If the group has no rows.
        """
        if group.shape[0] == 0:
            raise ValueError('Some cell has no pixels.')
        x = aggregate_cell_position(group, 0)
        y = aggregate_cell_position(group, 1)
        values = {'XMax': x, 'XMin': x, 'YMax': y, 'YMin': y}
        for channel_name in self.channel_names:
            values[f'{channel_name} Intensity'] = aggregate_cell(group, channel_name)
        return Series(values)


def add_binary_vector(channel_name, df) -> bool:
    """Add a 0/1 '<channel> Positive' column to ``df``, in place.

    The column is derived from '<channel> Intensity'. If the assessor's
    quality score is at least 0.5, its threshold-based labels are used;
    otherwise values are compared with the mean intensity.

    Returns
    -------
    bool
        True if the assessor's labels were used, False for mean thresholding.
    """
    feature_values = df[f'{channel_name} Intensity']
    assessor = BimodalityAssessor(feature_values)
    quality = assessor.get_average_mahalanobis_distance()
    if quality >= 0.5:
        binary = assessor.get_dichotomized_feature(use_threshold=True)
        df[f'{channel_name} Positive'] = binary
        return True
    threshold = nanmean(feature_values)

    def thresholding(value):
        if isnan(value):
            return 0
        return 1 if value >= threshold else 0

    df[f'{channel_name} Positive'] = df[f'{channel_name} Intensity'].apply(thresholding)
    return False


def create_cell_measurement_table(channel_files: dict[str, str], mask_file: str) -> DataFrame:
    """Build a per-cell table of positions, mean intensities and 0/1 calls.

    Parameters
    ----------
    channel_files : dict[str, str]
        Maps each channel name to the file holding that channel's values.
    mask_file : str
        File whose values are the cell segment labels.

    Returns
    -------
    DataFrame
        One row per cell segment.
    """
    df = create_sparse_dataframe(
        mask_file,
        value_column='Cell segment',
        index_by_position=True,
        keep_position_columns=True,
    )
    # Use the same string names for the merged columns and for aggregation.
    channel_names = [str(channel_name) for channel_name in channel_files]
    for channel_name, channel_file in zip(channel_names, channel_files.values()):
        channel_df = create_sparse_dataframe(
            channel_file,
            value_column=channel_name,
            index_by_position=True,
        )
        df = merge(df, channel_df, how='left', left_index=True, right_index=True)
    aggregator = Aggregator(channel_names)
    measurements = df.groupby('Cell segment').apply(aggregator.aggregate)
    gmm_thresholding = []
    mean_thresholding = []
    for channel_name in channel_names:
        if add_binary_vector(channel_name, measurements):
            gmm_thresholding.append(channel_name)
        else:
            mean_thresholding.append(channel_name)
    print(f'Used GMM thresholding for: {sorted(gmm_thresholding)}')
    print(f'Used mean thresholding for: {sorted(mean_thresholding)}')
    return measurements
def attempt_to_match_coordinates(supplement_cells, mask_cells):
    """Check that supplement cell coordinates match mask cell coordinates.

    For each supplement cell the nearest mask cell within a radius of 10 is
    looked up. Prints a single '.' on success.

    Parameters
    ----------
    supplement_cells : DataFrame
        Has columns 'coord.x' and 'coord.y'.
    mask_cells : DataFrame
        Has columns 'Column' and 'Row'.

    Raises
    ------
    ValueError
        If the row counts differ, if two supplement cells share a nearest
        mask cell, or if some supplement cell has no mask cell in range.
    """
    if supplement_cells.shape[0] != mask_cells.shape[0]:
        raise ValueError('Could not match cell masks to entries in supplement.')
    # Read the coordinate columns directly rather than row by row.
    supplement_points = list(zip(supplement_cells['coord.x'], supplement_cells['coord.y']))
    mask_points = sorted(zip(mask_cells['Column'], mask_cells['Row']))
    tree = BallTree(mask_points)
    search_threshold = 10
    indices, _ = tree.query_radius(
        supplement_points,
        r=search_threshold,
        sort_results=True,
        return_distance=True,
    )
    # Results are sorted by distance, so the first index is the nearest cell.
    best_match = [index_values[0] for index_values in indices if len(index_values) > 0]
    defect = len(best_match) - len(set(best_match))
    if defect > 0:
        raise ValueError('Some cells duplicated/mismatched.')
    unmatched = len(mask_points) - len(best_match)
    if unmatched > 0:
        raise ValueError('Some unmatchable cells from supplement.')
    print('.', end='', flush=True)


def check_cells_against_supplement_cells(cells):
    """Check each sample's mask cells against the supplement spreadsheet.

    Parameters
    ----------
    cells : dict
        Maps sample identifiers, as produced by ``form_sample_id``, to
        DataFrames of mask cell positions.

    Samples found in the spreadsheet but absent from ``cells`` are skipped
    and reported at the end.
    """
    supplement = read_excel(get_supplement_filename(), sheet_name=4, header=1)
    message = 'Checking that cells in supplement spreadsheet match cells in mask TIFFs. '
    print(message, end='', flush=True)
    omitted = []
    for raw_sample_id, df in supplement.groupby('sample.id'):
        sample_id = form_sample_id(raw_sample_id)
        if sample_id not in cells:
            omitted.append(sample_id)
            continue
        attempt_to_match_coordinates(df, cells[sample_id])
    print(' Done.')
    if len(omitted) > 0:
        print(f'Note that some samples were omitted: {omitted}')
def _split_markers(cell: str) -> list[str]:
    """Split a ';'-separated marker cell; an empty cell yields an empty list."""
    markers = cell.split(';')
    return [] if markers == [''] else markers


def check_channel_references(directory: str = 'generated_artifacts') -> None:
    """Check that every marker named in a phenotype is a known channel.

    Reads ``elementary_phenotypes.csv`` (column 'Name') and
    ``composite_phenotypes.csv`` (columns 'Name', 'Positive markers' and
    'Negative markers') from ``directory``. Prints a confirmation on success.

    Parameters
    ----------
    directory : str
        Directory holding the two CSV files.

    Raises
    ------
    ValueError
        On the first phenotype that names a marker not in the channel list.
    """
    channels = read_csv(join(directory, 'elementary_phenotypes.csv'), keep_default_na=False)
    phenotypes = read_csv(join(directory, 'composite_phenotypes.csv'), keep_default_na=False)
    channel_names = set(channels['Name'])
    for _, row in phenotypes.iterrows():
        positives = _split_markers(row['Positive markers'])
        negatives = _split_markers(row['Negative markers'])
        absent = [marker for marker in positives + negatives if marker not in channel_names]
        if len(absent) > 0:
            message = f'Markers {absent} in phenotype "{row["Name"]}" not in channel list.'
            raise ValueError(message)
    message = 'All phenotypes refer only to known channels.'
    print(message)
def get_supplement_filename() -> str:
    """Return the file name of the supplementary tables spreadsheet."""
    return 'sciimmunol.abi5072_tables_s1 to_s5.xlsx'


def get_extraction_method() -> str:
    return 'Core biopsy'


def get_preservation_method() -> str:
    return 'Formalin-fixed and paraffin-embedded'


def get_storage_location() -> str:
    return 'McGill University Health Centre'


def form_intervention_description(target: str) -> str:
    """Describe the therapy for a target; 'both' names CTLA4 and PDL1."""
    expected = ['CTLA4', 'PDL1']
    if target == 'both':
        return f'Anti-{expected[0]} and anti-{expected[1]} therapy'
    return f'Anti-{target} therapy'


def form_assay_description(target: str) -> str:
    """Describe the response assay for a target; 'both' names CTLA4 and PDL1."""
    expected = ['CTLA4', 'PDL1']
    if target == 'both':
        return f'Response to anti-{expected[0]}/anti-{expected[1]} therapy'
    return f'Response to anti-{target} therapy'


def form_sample_id(base):
    """Prefix ``base`` with 'Mold_sample_'."""
    return 'Mold_sample_' + base


def form_subject_id(base):
    """Prefix ``base`` with 'Mold_subject_'."""
    return 'Mold_subject_' + base


def top_directory() -> str:
    return 'CP_output_tiff'


def set_index_by_position(df: DataFrame):
    """Index ``df``, in place, by (Column, Row) tuples.

    A 'position' column is built from the 'Column' and 'Row' columns and then
    made the index.
    """
    # Pair the two columns directly; iterating row by row is very slow on
    # frames with one row per pixel.
    df['position'] = list(zip(df['Column'], df['Row']))
    df.set_index('position', inplace=True)


def create_sparse_dataframe(
    filename: str,
    value_column: str = 'Value',
    index_by_position: bool = False,
    keep_position_columns: bool = False,
) -> DataFrame:
    """Load an image file as a table of its nonzero entries.

    Parameters
    ----------
    filename : str
        Image file to open.
    value_column : str
        Name for the column holding the entry values.
    index_by_position : bool
        If true, index rows by (Column, Row) tuples.
    keep_position_columns : bool
        Only consulted when ``index_by_position`` is true: if false, the
        'Row' and 'Column' columns are dropped after indexing.

    Returns
    -------
    DataFrame
        One row per entry stored in the sparse representation of the image.
    """
    # Close the file handle once the pixel data has been copied out.
    with Image.open(filename) as image:
        array = np_array(image)
    sparse = coo_matrix(array)
    df = DataFrame({value_column: sparse.data, 'Row': sparse.row, 'Column': sparse.col})
    if index_by_position:
        set_index_by_position(df)
        if not keep_position_columns:
            df.drop(labels=['Row', 'Column'], inplace=True, axis=1)
    return df
-f "$supplement_file" ]]; +then + wget "$supplement" +fi + +do_checksum $filebase.tar.gz +do_checksum "$supplement_file" diff --git a/data_curation/datasets/moldoveanu/extract.py b/data_curation/datasets/moldoveanu/extract.py new file mode 100644 index 000000000..f78fd6c41 --- /dev/null +++ b/data_curation/datasets/moldoveanu/extract.py @@ -0,0 +1,353 @@ +"""Extract data from downloads.""" +import re +from argparse import ArgumentParser +from os.path import join +from os import listdir +from os.path import isfile +from typing import cast +from multiprocessing import Pool + +from pandas import read_excel +from pandas import read_csv +from pandas import DataFrame +from pandas import Series +from pandas import merge +from pandas import concat +from numpy import mean + +from _extraction_formats import get_supplement_filename # pylint: disable=E0611 +from _extraction_formats import get_extraction_method # pylint: disable=E0611 +from _extraction_formats import get_preservation_method # pylint: disable=E0611 +from _extraction_formats import get_storage_location # pylint: disable=E0611 +from _extraction_formats import form_assay_description # pylint: disable=E0611 +from _extraction_formats import form_intervention_description # pylint: disable=E0611 +from _extraction_formats import form_sample_id # pylint: disable=E0611 +from _extraction_formats import form_subject_id # pylint: disable=E0611 +from _extraction_formats import top_directory # pylint: disable=E0611 +from _extraction_formats import create_sparse_dataframe # pylint: disable=E0611 +from _cell_position_checking import check_cells_against_supplement_cells # pylint: disable=E0611 +from _cell_measurement_aggregation import create_cell_measurement_table # pylint: disable=E0611 +from _check_channel_references_in_phenotypes import check_channel_references # pylint: disable=E0611 + +def parse_date(date_string: str, relative: bool=False, timepoint: int | None = None) -> str | None: + match = 
re.search(r'^([\d]{4})([\d]{2})([\d]{2})$', str(date_string)) + if match: + value = '' + if relative: + value = '-'.join(match.groups()) + ' relative to birth date' + else: + value = '-'.join(match.groups()) + if timepoint is not None: + value = value + f', timepoint {timepoint}' + return value + raise ValueError('Cannot parse date from: {date_string}') + +def retrieve_channels(): + channels = read_excel(get_supplement_filename(), sheet_name=2, header=2) + channels = channels[channels['Metal-Tag'] != ''] + channels = channels[['Metal-Tag', 'Target Name', 'Clone']] + def strip(string: str) -> str: + return str(string).strip() + channels['Clone'] = channels['Clone'].apply(strip) + print(f'Channels: {" ".join(list(channels[0:3]["Target Name"]))} ... ({len(channels)} total).') + return channels + +def retrieve_antibody_info(supplement_channels, manual_channels): + merged = get_merged_channels(supplement_channels, manual_channels) + return merged['Clone'] + +def get_merged_channels(supplement_channels: DataFrame, manual_channels: DataFrame) -> DataFrame: + df1 = supplement_channels.set_index('Target Name') + df2 = manual_channels.set_index('Supplement fragment') + df = merge(df2, df1, how='left', left_index=True, right_index=True) + reindexed = df.set_index('Name') + return reindexed + +def write_elementary_phenotypes(supplement_channels: DataFrame, manual_channels: DataFrame): + copyable = ['Name', 'Target structure class', 'Marking mechanism', 'Target full name'] + df = DataFrame({c: manual_channels[c] for c in copyable}) + df['Column header fragment prefix'] = df['Name'] + df['idx'] = df['Name'] + df.set_index('idx', inplace=True) + df['Antibody'] = retrieve_antibody_info(supplement_channels, manual_channels) + filename = join('generated_artifacts', 'elementary_phenotypes.csv') + order = [ + 'Name', 'Column header fragment prefix', 'Target structure class', 'Antibody', + 'Marking mechanism', 'Target full name', + ] + df = df[order] + df.to_csv(filename, sep=',', 
def get_metal_tag_suffix(channel_name: str, big_channels_df: DataFrame) -> str:
    """Return the metal tag of a channel with its two parts swapped.

    The tag is looked up in the 'Metal-Tag' column by `channel_name` and must
    consist of three digits followed by two letters (e.g. '168Er'); the
    result puts the letters first ('Er168'). Raises ValueError otherwise.
    """
    metal_tag = big_channels_df['Metal-Tag'][channel_name]
    match = re.search(r'^([\d]{3})([a-zA-Z]{2})$', metal_tag)
    if match is None:
        raise ValueError(f'Could not parse metal tag: {metal_tag}')
    parts = match.groups()
    return f'{parts[1]}{parts[0]}'


def get_tiff_filename(
    sample_id: str,
    channel_name: str,
    big_channels_df: DataFrame,
    samples: DataFrame,
) -> str:
    """Locate the single TIFF file for one channel of one sample.

    The sample's directory is `top_directory()` joined with its
    'Filename base' entry; the file searched for is the one whose name ends
    in '_<suffix>.tiff', where the suffix comes from `get_metal_tag_suffix`.
    Raises ValueError when no file, or more than one file, matches.
    """
    suffix = get_metal_tag_suffix(channel_name, big_channels_df)
    filepath_fragment = samples['Filename base'][sample_id]
    directory = join(top_directory(), filepath_fragment)
    # A literal suffix comparison: the earlier regular expression left the
    # '.' unescaped, so it accepted any character before 'tiff'.
    ending = f'_{suffix}.tiff'
    matches = [file for file in listdir(directory) if file.endswith(ending)]
    if len(matches) > 1:
        raise ValueError(f'More than one file matches for {channel_name}: {matches}')
    if len(matches) == 0:
        raise ValueError(f'No file matches for {channel_name}.')
    return join(directory, matches[0])


def form_cells_mask_filename(base: str) -> str:
    """Return the cell-mask TIFF file name derived from a sample's file name base."""
    return f'{base}_ilastik_s2_Probabilities_mask.tiff'
def get_cells_mask_filename(sample_row: Series, print_sample: bool=False) -> str:
    """Build the path of a sample's cell-mask file.

    The path is `top_directory()` joined with the row's 'Filename base' and
    'Mask filename' values. With `print_sample` set, the sample ID (minus a
    leading 'Mold_sample_') is echoed on the current output line first.
    """
    short_name = re.sub('^Mold_sample_', '', sample_row["Sample ID"])
    if print_sample:
        print(f'{short_name} ', end='', flush=True)
    path_parts = (
        top_directory(),
        sample_row['Filename base'],
        sample_row['Mask filename'],
    )
    return join(*path_parts)


def centroid_aggregation(sparse_values: DataFrame) -> Series:
    """Return the mean 'Row' and mean 'Column' of the given rows as a Series."""
    centroid = {
        axis: cast(float, mean(sparse_values[axis]))
        for axis in ('Row', 'Column')
    }
    return Series(centroid)


def retrieve_cell_positions_one_sample(sample_info: Series) -> DataFrame:
    """Compute one centroid per distinct 'Value' in a sample's sparse mask table."""
    mask_path = get_cells_mask_filename(sample_info, print_sample=True)
    grouped_by_cell = create_sparse_dataframe(mask_path).groupby('Value')
    return grouped_by_cell.apply(centroid_aggregation) # type: ignore
def check_all_tiff_channel_files_available(channels: DataFrame, samples: DataFrame):
    """Confirm that a TIFF file exists for every (sample, channel) pair.

    Any failure prints a short notice and then propagates the exception;
    on success a confirmation line is printed.
    """
    try:
        for sample_id in samples.index:
            for channel in channels.index:
                candidate = get_tiff_filename(sample_id, channel, channels, samples)
                if isfile(candidate):
                    continue
                raise ValueError(f'{sample_id} at {channel} not found. Tried {candidate}.')
    except Exception as error:
        print('Some error searching for channel TIFF files.')
        raise error
    print('TIFF files found for all channel/sample combinations.')


class CellManifestWriter:
    """Writes the cell measurement table of a single sample to a CSV file."""
    def write_cell_file(self,
        index: int,
        sample_id: str,
        sample_info: Series,
        channels: DataFrame,
        samples: DataFrame,
    ):
        """Write `generated_artifacts/<index>.csv` and return (sample_id, '<index>.csv')."""
        tiff_by_channel = {}
        for channel in channels.index:
            tiff_by_channel[channel] = get_tiff_filename(sample_id, channel, channels, samples)
        measurements = create_cell_measurement_table(
            tiff_by_channel,
            get_cells_mask_filename(sample_info),
        )
        csv_name = f'{index}.csv'
        measurements.to_csv(join('generated_artifacts', csv_name), sep=',', index=False)
        print(f'Done with {sample_id} (file {index}).')
        return (sample_id, csv_name)
def write_samples(samples: DataFrame):
    """Write the per-sample columns, sorted by 'Sample ID', to generated_artifacts/samples.tsv."""
    columns = [
        'Sample ID',
        'Source subject',
        'Source site',
        'Source subject age at specimen collection',
        'Extraction method',
        'Extraction date',
        'Preservation method',
        'Storage location',
        'Assay',
        'Assessment',
    ]
    df = samples[columns].sort_values(by='Sample ID')
    filename = join('generated_artifacts', 'samples.tsv')
    df.to_csv(filename, sep='\t', index=False)


def write_diagnosis(samples: DataFrame):
    """Write one diagnosis row per sample row to generated_artifacts/diagnosis.tsv.

    'Source subject' and 'Assessment' become 'Subject of diagnosis' and
    'Diagnosis'; the condition and both date columns are fixed strings.
    """
    columns = [
        'Source subject',
        'Assessment',
    ]
    df = samples[columns].rename(
        {
            'Source subject': 'Subject of diagnosis',
            'Assessment': 'Diagnosis',
        },
        axis=1,
    )
    df['Diagnosed condition'] = 'Response to immune checkpoint inhibitor therapy'
    # sort_values returns a new frame, so the assignments below do not write
    # into a column slice of another frame.
    df = df[['Subject of diagnosis', 'Diagnosed condition', 'Diagnosis']].sort_values(
        by='Subject of diagnosis',
    )
    df['Date of diagnosis'] = 'timepoint 3'
    df['Last date of considered evidence'] = 'timepoint 3'
    filename = join('generated_artifacts', 'diagnosis.tsv')
    df.to_csv(filename, sep='\t', index=False)


def write_subjects(samples: DataFrame):
    """Write 'Subject ID' and 'Sex', sorted by subject, to generated_artifacts/subjects.tsv."""
    columns = [
        'Source subject',
        'Sex',
    ]
    df = samples[columns].rename({'Source subject': 'Subject ID'}, axis=1)
    df = df.sort_values(by='Subject ID')
    filename = join('generated_artifacts', 'subjects.tsv')
    df.to_csv(filename, sep='\t', index=False)


def write_interventions(samples: DataFrame):
    """Write one intervention row per sample row to generated_artifacts/interventions.tsv."""
    columns = [
        'Source subject',
        'Intervention',
    ]
    df = samples[columns].rename({'Source subject': 'Subject of intervention'}, axis=1)
    df['Date of intervention'] = 'timepoint 2'
    df = df.sort_values(by='Subject of intervention')
    filename = join('generated_artifacts', 'interventions.tsv')
    df.to_csv(filename, sep='\t', index=False)


def write_file_manifest(cell_files_written: list[tuple[str, str]]):
    """Write generated_artifacts/file_manifest.tsv.

    One entry is made per (sample, filename) pair, with a file ID of 'file_'
    plus the file name minus its '.csv' extension. The rows read from
    manually_created/file_manifest_specials.tsv are appended with an empty
    'Sample ID', and every row gets the same 'Project ID'.
    """
    entries = [
        [
            'file_' + re.sub(r'\.csv$', '', filename),
            filename,
            sample,
            'Tabular cell manifest',
        ]
        for sample, filename in cell_files_written
    ]
    columns = ['File ID', 'File name', 'Sample ID', 'Data type']
    cells = DataFrame(entries, columns=columns)
    specials = read_csv(join('manually_created', 'file_manifest_specials.tsv'), sep='\t')
    specials['Sample ID'] = ''
    df = concat([cells, specials])
    df['Project ID'] = 'Melanoma CyTOF ICI'
    filename = join('generated_artifacts', 'file_manifest.tsv')
    df.to_csv(filename, sep='\t', index=False)


def extract(number_cores: int):
    """Run the whole extraction, writing every artifact into generated_artifacts/.

    `number_cores` is forwarded to the per-sample cell measurement step.
    """
    supplement_channels = retrieve_channels()
    manual_channels = read_csv(join('manually_created', 'channels.tsv'), sep='\t')
    write_elementary_phenotypes(supplement_channels, manual_channels)
    check_channel_references()
    channels = get_merged_channels(supplement_channels, manual_channels)
    samples = retrieve_sample_subjects_stuff()
    write_samples(samples)
    write_subjects(samples)
    write_diagnosis(samples)
    write_interventions(samples)
    # Verify the channel TIFF files before the expensive aggregation that
    # reads them, so a missing file is reported up front rather than after
    # the work has already been attempted.
    check_all_tiff_channel_files_available(channels, samples)
    cell_files_written = handle_cell_measurements(samples, channels, number_cores)
    write_file_manifest(cell_files_written)
    cell_positions = retrieve_cell_positions(samples)
    check_cells_against_supplement_cells(cell_positions)


if __name__=='__main__':
    parser = ArgumentParser(
        prog='extract',
    )
    parser.add_argument(
        '--cores',
        dest='cores',
        type=int,
        required=False,
        default=1,
        help='Number of cores for parallelization.'
    )
    args = parser.parse_args()
    extract(args.cores)
-d CP_output_tiff ]]; +then + filebase="Moldoveanu2022-cytof-RAW" + gunzip -c "$filebase.tar.gz" >$filebase.tar + tar -xvf "$filebase.tar" + for f in $(find CP_output_tiff/ | grep '/\._'); do rm -rf "$f"; done +fi + +if [[ ! -f "sciimmunol.abi5072_tables_s1 to_s5.xlsx" ]]; +then + supplement_file="sciimmunol.abi5072_tables_s1 to_s5.zip" + unzip "$supplement_file" +fi + +cp manually_created/composite_phenotypes.csv generated_artifacts/ +cp manually_created/study.json generated_artifacts/study.json +python extract.py $@ diff --git a/data_curation/datasets/moldoveanu/manually_created/channels.tsv b/data_curation/datasets/moldoveanu/manually_created/channels.tsv new file mode 100644 index 000000000..199aaedf4 --- /dev/null +++ b/data_curation/datasets/moldoveanu/manually_created/channels.tsv @@ -0,0 +1,36 @@ +Name Supplement fragment Target structure class Marking mechanism Target full name +SMA SMA protein Imaging mass cytometry Survival Of Motor Neuron 1 +MEK1/2 MEK1/2 protein Imaging mass cytometry Mitogen-Activated Protein Kinase Kinase 2 +MKI67 Ki67 protein Imaging mass cytometry Marker Of Proliferation Ki-67 +CD14 CD14 protein Imaging mass cytometry Monocyte Differentiation Antigen CD14 +ERK1/2 ERK1/2 protein Imaging mass cytometry Mitogen-Activated Protein Kinase 3 +CD16 CD16 protein Imaging mass cytometry Fc Gamma Receptor IIIa +CD31 CD31 protein Imaging mass cytometry Platelet And Endothelial Cell Adhesion Molecule 1 +ICOS ICOS protein Imaging mass cytometry Inducible T Cell Costimulator +CD29 CD29 protein Imaging mass cytometry Integrin Subunit Beta 1 +PDL1 PD-L1(SP142) protein Imaging mass cytometry Programmed Cell Death 1 Ligand 1 +OX40 OX40 protein Imaging mass cytometry Tumor Necrosis Factor Receptor Superfamily Member 4 +CD45 CD45 protein Imaging mass cytometry Protein Tyrosine Phosphatase Receptor Type C +LAG3 LAG3 protein Imaging mass cytometry Lymphocyte Activating 3 +TIM3 TIM3 protein Imaging mass cytometry Hepatitis A Virus Cellular Receptor 2 
+FOXP3 FOXP3 protein Imaging mass cytometry Forkhead Box P3 +CD4 CD4 protein Imaging mass cytometry T-Cell Surface Glycoprotein CD4 +CCR7 CCR7 protein Imaging mass cytometry C-C Motif Chemokine Receptor 7 +CD68 CD68 protein Imaging mass cytometry Macrophage Antigen CD68 +VISTA VISTA protein Imaging mass cytometry V-Set Immunoregulatory Receptor +CD20 CD20 protein Imaging mass cytometry B-Lymphocyte Cell-Surface Antigen B1 +CD8A CD8a protein Imaging mass cytometry T-Cell Surface Glycoprotein CD8 +pMEK1/2 pMEK1/2 phosphorylated protein Imaging mass cytometry Phosphorylated Mitogen-Activated Protein Kinase Kinase 2 +SOX10 SOX10 protein Imaging mass cytometry SRY-Box Transcription Factor 10 +CTNNB1 b-Catenin protein Imaging mass cytometry Catenin Beta 1 +CD45RA CD45RA protein Imaging mass cytometry Protein Tyrosine Phosphatase Receptor Type C isoform A +IGLL5 Granz B protein Imaging mass cytometry Immunoglobulin Lambda Like Polypeptide 5 +CD40 CD40 protein Imaging mass cytometry B Cell Surface Antigen CD40 +COL1 Collagen I protein Imaging mass cytometry Collagen I +CD3 CD3 protein complex Imaging mass cytometry T-Cell Receptor Complex +pERK1/2 p-ERK1/2 phosphorylated protein Imaging mass cytometry Phosphorylated Mitogen-Activated Protein Kinase 3 +CASP3 cleav-Casp3 protein Imaging mass cytometry Caspase 3 Apoptosis-Related Cysteine Peptidase +CD45RO CD45RO protein Imaging mass cytometry Protein Tyrosine Phosphatase Receptor Type C isoform O +HLA-DR HLA-DR protein complex Imaging mass cytometry Major Histocompatibility Complex Class II DR +S100 S100 protein family Imaging mass cytometry S100 Calcium Binding Protein +H3 HH3 protein family Imaging mass cytometry Histone H3 \ No newline at end of file diff --git a/data_curation/datasets/moldoveanu/manually_created/composite_phenotypes.csv b/data_curation/datasets/moldoveanu/manually_created/composite_phenotypes.csv new file mode 100644 index 000000000..b4610f462 --- /dev/null +++ 
b/data_curation/datasets/moldoveanu/manually_created/composite_phenotypes.csv @@ -0,0 +1,14 @@ +Name,Positive markers,Negative markers +Melanocyte,S100,CD20;CD3;CD45;CD31 +Melanoma,SOX10,CD20;CD3;CD45;CD31 +Endothelial stroma,CD31,CD20;CD45;CD3;SOX10 +B cells,CD20,CD31;SOX10 +T cells,CD3,CD20 +Cytotoxic T cell antigen-experienced,CD3;CD8A;CD45RO,CD20 +T regulatory cells,FOXP3;CD4;CD3,SOX10;CD31;CD20 +Lineage including macrophage,CD68,CD3;SOX10 +Lineage including monocyte,CD14,CD3;SOX10 +Naive cytotoxic T cell,CD8A;CD3;CD45RA,CD20;CD31;SOX10;CD4 +T helper cell antigen-experienced,CD4;CD3;CD45RO,SOX10 +T helper cell,CD3;CD4,CD20 +Naive T helper cell,CD3;CD4;CD45RA,SOX10 diff --git a/data_curation/datasets/moldoveanu/manually_created/file_manifest_specials.tsv b/data_curation/datasets/moldoveanu/manually_created/file_manifest_specials.tsv new file mode 100644 index 000000000..755f687c5 --- /dev/null +++ b/data_curation/datasets/moldoveanu/manually_created/file_manifest_specials.tsv @@ -0,0 +1,8 @@ +File ID File name Data type +Channels file elementary_phenotypes.csv Manifest of basic channels/phenotypes +Phenotypes file composite_phenotypes.csv Manifest of composite phenotypes +Samples file samples.tsv Samples +Subjects file subjects.tsv Manifest of subjects +Study file study.json Project-level data +Diagnosis file diagnosis.tsv List of subject diagnoses +Interventions file interventions.tsv List of subject intervention events \ No newline at end of file diff --git a/data_curation/datasets/moldoveanu/manually_created/study.json b/data_curation/datasets/moldoveanu/manually_created/study.json new file mode 100644 index 000000000..135b14cd5 --- /dev/null +++ b/data_curation/datasets/moldoveanu/manually_created/study.json @@ -0,0 +1,358 @@ +{ + "Study name": "Melanoma CyTOF ICI", + "Institution": "Rosalind and Morris Goodman Cancer Institute, McGill University", + "Study contact person": { + "Name": "Dan Moldoveanu", + "Contact reference": "dan.moldeveanu@mcgill.org" + }, 
+ "Publications": [ + { + "Title": "Spatially mapping the immune landscape of melanoma using imaging mass cytometry", + "Document type": "Article", + "Publisher": "Science Immunology", + "Date": "2022-04-01", + "URL": "https://doi.org/10.1126/sciimmunol.abi5072", + "Authors": [ + "Dan Moldoveanu", + "LeeAnn Ramsay", + "Mathieu Lajoie", + "Luke Anderson-Trocme", + "Marine Lingrand", + "Diana Berry", + "Lucas J M Perus", + "Yuhong Wei", + "Cleber Moraes", + "Rached Alkallas", + "Shivshankari Rajkumar", + "Dongmei Zuo", + "Matthew Dankner", + "Eric Hongbo Xu", + "Nicholas R Bertos", + "Hamed S Najafabadi", + "Simon Gravel", + "Santiago Costantino", + "Martin J Richer", + "Amanda W Lund", + "Sonia V Del Rincon", + "Alan Spatz", + "Wilson H Miller Jr", + "Rahima Jamal", + "R\u00e9jean Lapointe", + "Anne-Marie Mes-Masson", + "Simon Turcotte", + "Kevin Petrecca", + "Sinziana Dumitra", + "Ari-Nareg Meguerditchian", + "Keith Richardson", + "Francine Tremblay", + "Beatrice Wang", + "May Chergui", + "Marie-Christine Guiot", + "Kevin Watters", + "John Stagg", + "Daniela F Quail", + "Catalin Mihalcioiu", + "Sarkis Meterissian", + "Ian R Watson" + ] + }, + { + "Title": "Raw CyTOF images associated with Moldoveanu et al.
2022, Science Immunology", + "Document type": "Dataset", + "Publisher": "Zenodo (CERN)", + "Date": "2022-01-25", + "URL": "https://zenodo.org/record/5903190#.Y_tTntLMJH4", + "Authors": [ + "Dan Moldoveanu", + "LeeAnn Ramsay", + "Mathieu Lajoie", + "Luke Anderson-Trocme", + "Marine Lingrand", + "Diana Berry", + "Lucas J M Perus", + "Yuhong Wei", + "Cleber Moraes", + "Rached Alkallas", + "Shivshankari Rajkumar", + "Dongmei Zuo", + "Matthew Dankner", + "Eric Hongbo Xu", + "Nicholas R Bertos", + "Hamed S Najafabadi", + "Simon Gravel", + "Santiago Costantino", + "Martin J Richer", + "Amanda W Lund", + "Sonia V Del Rincon", + "Alan Spatz", + "Wilson H Miller Jr", + "Rahima Jamal", + "R\u00e9jean Lapointe", + "Anne-Marie Mes-Masson", + "Simon Turcotte", + "Kevin Petrecca", + "Sinziana Dumitra", + "Ari-Nareg Meguerditchian", + "Keith Richardson", + "Francine Tremblay", + "Beatrice Wang", + "May Chergui", + "Marie-Christine Guiot", + "Kevin Watters", + "John Stagg", + "Daniela F Quail", + "Catalin Mihalcioiu", + "Sarkis Meterissian", + "Ian R Watson" + ] + } + ], + "People": [ + { + "Full name": "Dan Moldoveanu", + "Surname": "Moldoveanu", + "Given name": "Dan", + "ORCID": "" + }, + { + "Full name": "LeeAnn Ramsay", + "Surname": "Ramsay", + "Given name": "LeeAnn", + "ORCID": "" + }, + { + "Full name": "Mathieu Lajoie", + "Surname": "Lajoie", + "Given name": "Mathieu", + "ORCID": "" + }, + { + "Full name": "Luke Anderson-Trocme", + "Surname": "Anderson-Trocme", + "Given name": "Luke", + "ORCID": "" + }, + { + "Full name": "Marine Lingrand", + "Surname": "Lingrand", + "Given name": "Marine", + "ORCID": "" + }, + { + "Full name": "Diana Berry", + "Surname": "Berry", + "Given name": "Diana", + "ORCID": "" + }, + { + "Full name": "Lucas J M Perus", + "Surname": "Perus", + "Given name": "Lucas", + "ORCID": "" + }, + { + "Full name": "Yuhong Wei", + "Surname": "Wei", + "Given name": "Yuhong", + "ORCID": "" + }, + { + "Full name": "Cleber Moraes", + "Surname": "Moraes", + "Given 
name": "Cleber", + "ORCID": "" + }, + { + "Full name": "Rached Alkallas", + "Surname": "Alkallas", + "Given name": "Rached", + "ORCID": "" + }, + { + "Full name": "Shivshankari Rajkumar", + "Surname": "Rajkumar", + "Given name": "Shivshankari", + "ORCID": "" + }, + { + "Full name": "Dongmei Zuo", + "Surname": "Zuo", + "Given name": "Dongmei", + "ORCID": "" + }, + { + "Full name": "Matthew Dankner", + "Surname": "Dankner", + "Given name": "Matthew", + "ORCID": "" + }, + { + "Full name": "Eric Hongbo Xu", + "Surname": "Xu", + "Given name": "Eric", + "ORCID": "" + }, + { + "Full name": "Nicholas R Bertos", + "Surname": "Bertos", + "Given name": "Nicholas", + "ORCID": "" + }, + { + "Full name": "Hamed S Najafabadi", + "Surname": "Najafabadi", + "Given name": "Hamed", + "ORCID": "" + }, + { + "Full name": "Simon Gravel", + "Surname": "Gravel", + "Given name": "Simon", + "ORCID": "" + }, + { + "Full name": "Santiago Costantino", + "Surname": "Costantino", + "Given name": "Santiago", + "ORCID": "" + }, + { + "Full name": "Martin J Richer", + "Surname": "Richer", + "Given name": "Martin", + "ORCID": "" + }, + { + "Full name": "Amanda W Lund", + "Surname": "Lund", + "Given name": "Amanda", + "ORCID": "" + }, + { + "Full name": "Sonia V Del Rincon", + "Surname": "Del Rincon", + "Given name": "Sonia", + "ORCID": "" + }, + { + "Full name": "Alan Spatz", + "Surname": "Spatz", + "Given name": "Alan", + "ORCID": "" + }, + { + "Full name": "Wilson H Miller Jr", + "Surname": "Miller", + "Given name": "Wilson", + "ORCID": "" + }, + { + "Full name": "Rahima Jamal", + "Surname": "Jamal", + "Given name": "Rahima", + "ORCID": "" + }, + { + "Full name": "R\u00e9jean Lapointe", + "Surname": "Lapointe", + "Given name": "R\u00e9jean", + "ORCID": "" + }, + { + "Full name": "Anne-Marie Mes-Masson", + "Surname": "Mes-Masson", + "Given name": "Anne-Marie", + "ORCID": "" + }, + { + "Full name": "Simon Turcotte", + "Surname": "Turcotte", + "Given name": "Simon", + "ORCID": "" + }, + { + "Full name":
"Kevin Petrecca", + "Surname": "Petrecca", + "Given name": "Kevin", + "ORCID": "" + }, + { + "Full name": "Sinziana Dumitra", + "Surname": "Dumitra", + "Given name": "Sinziana", + "ORCID": "" + }, + { + "Full name": "Ari-Nareg Meguerditchian", + "Surname": "Meguerditchian", + "Given name": "Ari-Nareg", + "ORCID": "" + }, + { + "Full name": "Keith Richardson", + "Surname": "Richardson", + "Given name": "Keith", + "ORCID": "" + }, + { + "Full name": "Francine Tremblay", + "Surname": "Tremblay", + "Given name": "Francine", + "ORCID": "" + }, + { + "Full name": "Beatrice Wang", + "Surname": "Wang", + "Given name": "Beatrice", + "ORCID": "" + }, + { + "Full name": "May Chergui", + "Surname": "Chergui", + "Given name": "May", + "ORCID": "" + }, + { + "Full name": "Marie-Christine Guiot", + "Surname": "Guiot", + "Given name": "Marie-Christine", + "ORCID": "" + }, + { + "Full name": "Kevin Watters", + "Surname": "Watters", + "Given name": "Kevin", + "ORCID": "" + }, + { + "Full name": "John Stagg", + "Surname": "Stagg", + "Given name": "John", + "ORCID": "" + }, + { + "Full name": "Daniela F Quail", + "Surname": "Quail", + "Given name": "Daniela", + "ORCID": "" + }, + { + "Full name": "Catalin Mihalcioiu", + "Surname": "Mihalcioiu", + "Given name": "Catalin", + "ORCID": "" + }, + { + "Full name": "Sarkis Meterissian", + "Surname": "Meterissian", + "Given name": "Sarkis", + "ORCID": "" + }, + { + "Full name": "Ian R Watson", + "Surname": "Watson", + "Given name": "Ian", + "ORCID": "" + } + ] +} diff --git a/data_curation/datasets/moldoveanu/sciimmunol.abi5072_tables_s1 to_s5.zip.sha256 b/data_curation/datasets/moldoveanu/sciimmunol.abi5072_tables_s1 to_s5.zip.sha256 new file mode 100644 index 000000000..c1738ed3b --- /dev/null +++ b/data_curation/datasets/moldoveanu/sciimmunol.abi5072_tables_s1 to_s5.zip.sha256 @@ -0,0 +1 @@ +2f403010b06f8d2d7cb2de571efe0553b92cc0c58f0eb86ca532b75c0c6ba03e sciimmunol.abi5072_tables_s1 to_s5.zip diff --git 
a/data_curation/datasets/template/README.md b/data_curation/datasets/template/README.md new file mode 100644 index 000000000..657f17e65 --- /dev/null +++ b/data_curation/datasets/template/README.md @@ -0,0 +1,206 @@ +# Usage pattern + +By convention of this repository, the "curation" steps are: +1. [`./download.sh`](#1-download-source-data) +2. [`./extract.sh`](#2-extract-what-is-needed-and-save-it-to-the-generated-artifacts-subdirectory) +3. [`./verify.sh`](#3-verify-that-the-generated-artifacts-are-as-expected-by-doing-some-checks) +4. [`./clean.sh`](#4-cleandelete-all-intermediate-and-final-generated-artifacts-as-needed) (optional) + +Prepared, curated artifacts should be saved to `generated_artifacts/`. + +For a real dataset you would of course write your own version of `download.sh`, `extract.sh`, `verify.sh`, `clean.sh`, etc., and commit these to version control. + +In this dummy example, the output is as follows: + +```txt +$ ./download.sh +SourceData_March_10_2023.zip is present. + +$ ./extract.sh +Example during-processing output, e.g. for debugging: + +Measurement event ID Subject of measurement Type code + ABC_30 0000012 0 + ABC_31 0000047 0 + ABC_32 0000012 3 + ABC_33 0000299 5 + +Measurement event ID Value + ABC_30 1.001 + ABC_30 1.002 + ABC_30 1.003 + ABC_30 1.004 + ABC_30 1.005 + ABC_30 1.006 + ABC_31 1.001 + ABC_31 1.002 + ABC_31 1.003 + ABC_31 1.004 + ABC_31 1.005 + ABC_31 1.006 + ABC_32 2.001 + ABC_32 2.002 + ABC_32 2.003 + ABC_32 2.004 + ABC_32 2.005 + ABC_32 2.006 + ABC_33 3.001 + ABC_33 3.002 + ABC_33 3.003 + ABC_33 3.004 + ABC_33 3.005 + ABC_33 3.006 + +Identifier Feature 1 Feature 2 + 0000012 0.12345 0.78910 + 0000047 1.00123 2.17891 + 0000299 13.270 0.00002 + +$ ./verify.sh +Checking record counts for 3 tables found in sqlite database file. + +Got expected record count 4 in measurement_events. +Got expected record count 24 in measurement_values. +Got expected record count 3 in feature_matrix.
+ +$ ./clean.sh +Deleted SourceData/ and generated_artifacts/ . +``` + +## 1. Download source data +```sh +./download.sh +``` +`download.sh`: +```sh +#!/bin/bash + +# The "download.sh" script should typically fetch a big source data file. +# Something like: +# wget https://data-repository-hub.com/123456789/SourceData_March_10_2023.zip +# +# In this dummy example case we'll pretend this (committed) zip file was downloaded by this script: +# SourceData_March_10_2023.zip + +main_source_file=SourceData_March_10_2023.zip + +if [[ -f $main_source_file ]]; +then + echo "$main_source_file is present." +else + echo "Error: $main_source_file is not present. Not downloaded?" + exit 1 +fi +``` + +## 2. Extract what is needed and save it to the generated artifacts subdirectory +```sh +./extract.sh +``` +`extract.sh`: +```sh +#!/bin/bash +unzip SourceData_March_10_2023.zip +echo '' +python extract.py +``` +`extract.py`: +```py +from os.path import join +from os.path import exists +import json +import sqlite3 + +import pandas as pd + +from sqlite_stuff import initialize_sqlite_db +from sqlite_stuff import get_sqlite_connection + +def extract_dataset_from_source_files(): + source_dir = 'SourceData' + measurement_files = pd.read_csv(join(source_dir, 'spreadsheet2.tsv'), keep_default_na=False, sep='\t', dtype=str) + measurement_events = [] + measurements = [] + for i, row in measurement_files.iterrows(): + measurement_id = row['ID'] + measurement_file = join(source_dir, row['Associated file']) + if not exists(measurement_file): + raise FileNotFoundError(measurement_file) + with open(measurement_file, 'rt', encoding='utf-8') as file: + measurement_info = json.loads(file.read()) + measurement_events.append((measurement_id, measurement_info['subject'], measurement_info['measurement code'])) + for value in measurement_info['measurements']: + measurements.append((measurement_id, value)) + + measurement_events_df = pd.DataFrame(measurement_events, columns=['Measurement event ID', 'Subject 
of measurement', 'Type code']) + measurements_df = pd.DataFrame(measurements, columns=['Measurement event ID', 'Value']) + feature_matrix = pd.read_csv(join(source_dir, 'spreadsheet1.tsv'), keep_default_na=False, sep='\t', dtype=str) + dfs = [measurement_events_df, measurements_df, feature_matrix] + + print_data_frames(dfs) + initialize_sqlite_db() + send_to_sqlite(dfs, ['measurement_events', 'measurement_values', 'feature_matrix']) + +def send_to_sqlite(dfs, table_names): + connection = get_sqlite_connection() + for table_name, df in zip(table_names, dfs): + df.to_sql(table_name, connection, if_exists='replace', index=False) + connection.commit() + +def print_data_frames(dfs): + print('Example during-processing output, e.g. for debugging:') + print('') + for df in dfs: + print(df.to_string(index=False)) + print('') + +if __name__=='__main__': + extract_dataset_from_source_files() +``` + +This step creates the sqlite database file `generated_artifacts/example_curated_dataset.db` . + +## 3. Verify that the generated artifacts are as expected, by doing some checks +```sh +./verify.sh +``` +`verify.sh`: +```sh +#!/bin/bash +python verify.py +``` +`verify.py`: +```py +import pandas as pd + +from sqlite_stuff import get_sqlite_connection + +def check_record_counts(): + connection = get_sqlite_connection() + table_names = ['measurement_events', 'measurement_values', 'feature_matrix'] + dfs = [pd.read_sql_query(f"SELECT * FROM {table_name}", connection) for table_name in table_names] + + print('Checking record counts for 3 tables found in sqlite database file.') + print('') + expected_counts = [4, 24, 3] + for expected_count, table_name, df in zip(expected_counts, table_names, dfs): + if expected_count != df.shape[0]: + raise ValueError(f'Expected {expected_count} in {table_name} but got {df.shape[0]}.') + else: + print(f'Got expected record count {expected_count} in {table_name}.') + +if __name__=='__main__': + check_record_counts() +``` + +## 4. 
Clean/delete all intermediate and final generated artifacts, as needed +```sh +./clean.sh +``` +`clean.sh`: +```sh +#!/bin/bash +rm -rf SourceData/ +rm -rf generated_artifacts/ +echo "Deleted SourceData/ and generated_artifacts/ ." +``` diff --git a/data_curation/datasets/template/SourceData_March_10_2023.zip b/data_curation/datasets/template/SourceData_March_10_2023.zip new file mode 100644 index 000000000..61205bc6a Binary files /dev/null and b/data_curation/datasets/template/SourceData_March_10_2023.zip differ diff --git a/data_curation/datasets/template/clean.sh b/data_curation/datasets/template/clean.sh new file mode 100755 index 000000000..992351025 --- /dev/null +++ b/data_curation/datasets/template/clean.sh @@ -0,0 +1,4 @@ +#!/bin/bash +rm -rf SourceData/ +rm -rf generated_artifacts/ +echo "Deleted SourceData/ and generated_artifacts/ ." diff --git a/data_curation/datasets/template/common.py b/data_curation/datasets/template/common.py new file mode 100644 index 000000000..7928a0004 --- /dev/null +++ b/data_curation/datasets/template/common.py @@ -0,0 +1 @@ +generated_artifacts_directory = 'generated_artifacts' diff --git a/data_curation/datasets/template/download.sh b/data_curation/datasets/template/download.sh new file mode 100755 index 000000000..21355847e --- /dev/null +++ b/data_curation/datasets/template/download.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# The "download.sh" script should typically fetch a big source data file. +# Something like: +# wget https://data-repository-hub.com/123456789/SourceData_March_10_2023.zip +# +# In this dummy example case we'll pretend this (committed) zip file was downloaded by this script: +# SourceData_March_10_2023.zip +# +# Prefer NOT to commit such downloaded files, even to git LFS. +main_source_file=SourceData_March_10_2023.zip +if [[ -f $main_source_file ]]; +then + echo "$main_source_file is present." +else + echo "Error: $main_source_file is not present. Not downloaded?" 
def extract_dataset_from_source_files():
    """Build the example tables from SourceData/ and store them in sqlite.

    spreadsheet2.tsv lists measurement events: each row has an 'ID' and an
    'Associated file' naming a JSON file under SourceData/. Each JSON file
    supplies 'subject', 'measurement code' and a list of 'measurements'.
    spreadsheet1.tsv is loaded unchanged as the feature matrix. The three
    resulting frames are printed and then written to the sqlite database.

    Raises FileNotFoundError when an associated JSON file is missing.
    """
    source_dir = 'SourceData'
    measurement_files = pd.read_csv(join(source_dir, 'spreadsheet2.tsv'), keep_default_na=False, sep='\t', dtype=str)
    measurement_events = []
    measurements = []
    for _, row in measurement_files.iterrows():
        measurement_id = row['ID']
        measurement_file = join(source_dir, row['Associated file'])
        if not exists(measurement_file):
            raise FileNotFoundError(measurement_file)
        with open(measurement_file, 'rt', encoding='utf-8') as file:
            measurement_info = json.load(file)
        measurement_events.append((measurement_id, measurement_info['subject'], measurement_info['measurement code']))
        measurements.extend((measurement_id, value) for value in measurement_info['measurements'])

    measurement_events_df = pd.DataFrame(measurement_events, columns=['Measurement event ID', 'Subject of measurement', 'Type code'])
    measurements_df = pd.DataFrame(measurements, columns=['Measurement event ID', 'Value'])
    feature_matrix = pd.read_csv(join(source_dir, 'spreadsheet1.tsv'), keep_default_na=False, sep='\t', dtype=str)
    dfs = [measurement_events_df, measurements_df, feature_matrix]

    print_data_frames(dfs)
    initialize_sqlite_db()
    send_to_sqlite(dfs, ['measurement_events', 'measurement_values', 'feature_matrix'])

def send_to_sqlite(dfs, table_names):
    """Write each frame in `dfs` to the table named at the same position in `table_names`.

    Existing tables of the same name are replaced. The connection is
    committed after all tables are written and is always closed, even when
    a write fails.
    """
    connection = get_sqlite_connection()
    try:
        for table_name, df in zip(table_names, dfs):
            df.to_sql(table_name, connection, if_exists='replace', index=False)
        connection.commit()
    finally:
        connection.close()
def check_record_counts():
    """Check that each table in the curated sqlite file has the expected row count.

    All three tables are read first; then a header and one confirmation line
    per table are printed.

    Raises:
        ValueError: On the first table whose number of rows differs from the
            expected count.
    """
    # Local import so this function does not rely on the module's import block.
    from contextlib import closing

    table_names = ['measurement_events', 'measurement_values', 'feature_matrix']
    expected_counts = [4, 24, 3]

    # A sqlite3 connection used as a context manager only commits or rolls
    # back; closing() is what actually releases the database file handle.
    with closing(get_sqlite_connection()) as connection:
        # The table names are the fixed literals above, never external input,
        # so interpolating them into the query is safe here.
        dfs = [
            pd.read_sql_query(f"SELECT * FROM {table_name}", connection)
            for table_name in table_names
        ]

    print('Checking record counts for 3 tables found in sqlite database file.')
    print('')
    for expected_count, table_name, df in zip(expected_counts, table_names, dfs):
        actual_count = df.shape[0]
        if expected_count != actual_count:
            raise ValueError(f'Expected {expected_count} in {table_name} but got {actual_count}.')
        print(f'Got expected record count {expected_count} in {table_name}.')