Skip to content

Commit

Permalink
Dropped --sodar-directory, library name is better suited for filterin…
Browse files Browse the repository at this point in the history
…g directories: --use-library-name.
  • Loading branch information
eudesbarbosa committed Sep 8, 2022
1 parent dc5beec commit a33f876
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 42 deletions.
49 changes: 25 additions & 24 deletions cubi_tk/snappy/pull_raw_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class Config:
sodar_url: str
sodar_api_token: str = attr.ib(repr=lambda value: "***") # type: ignore
tsv_shortcut: str
sodar_directory: str
use_library_name: bool
overwrite: bool
dry_run: bool
first_batch: int
Expand Down Expand Up @@ -107,10 +107,13 @@ def setup_argparse(cls, parser: argparse.ArgumentParser) -> None:
)
parser.add_argument("--samples", help="Optional list of samples to pull")
parser.add_argument(
"--sodar-directory",
default=None,
required=False,
help="SODAR directory name expected for raw data path, e.g.: 'raw_data'.",
"--use-library-name",
default=False,
action="store_true",
help=(
"Flag to indicate that the search in SODAR directories should be based on library name "
"(e.g. 'P001-N1-DNA1-WGS1') instead of sample identifier (e.g.'P001') in the file name."
),
)
parser.add_argument(
"--assay-uuid",
Expand Down Expand Up @@ -158,7 +161,7 @@ def execute(self) -> typing.Optional[int]:

# Filter requested samples and folder directories
parser = ParseSampleSheet()
if self.config.sodar_directory:
if self.config.use_library_name:
selected_identifiers_tuples = list(
parser.yield_ngs_library_and_folder_names(
sheet=sheet,
Expand Down Expand Up @@ -198,8 +201,8 @@ def execute(self) -> typing.Optional[int]:
).perform()

# Filter based on identifiers and file type
if self.config.sodar_directory:
filtered_remote_files_dict = self.filter_irods_collection_plus_dir_name(
if self.config.use_library_name:
filtered_remote_files_dict = self.filter_irods_collection_by_library_name_in_path(
identifiers=selected_identifiers,
directory_name=self.config.sodar_directory,
remote_files_dict=remote_files_dict,
Expand Down Expand Up @@ -243,17 +246,17 @@ def execute(self) -> typing.Optional[int]:
logger.info("All done. Have a nice day!")
return 0

def filter_irods_collection_plus_dir_name(
self, identifiers, directory_name, remote_files_dict, file_type
def filter_irods_collection_by_library_name_in_path(
self, identifiers, remote_files_dict, file_type
):
"""Filter iRODS collection based on identifiers, directory name, and file type/extension.
"""Filter iRODS collection based on identifiers and file type/extension.
Assumes that SODAR directories follow the logic below to filter by library name:
/sodarZone/projects/../<PROJECT_UUID>/sample_data/study_<STUDY_UUID>/assay_<ASSAY_UUID>/<LIBRARY_NAME>
:param identifiers: List of sample identifiers or library names.
:type identifiers: list
:param directory_name: Directory name as defined in arguments.
:type directory_name: str
:param remote_files_dict: Dictionary with iRODS collection information. Key: file name as string (e.g.,
'P001-N1-DNA1-WES1.vcf.gz'); Value: iRODS data (``IrodsDataObject``).
:type remote_files_dict: dict
Expand Down Expand Up @@ -282,20 +285,18 @@ def filter_irods_collection_plus_dir_name(
# Piggyback loop for dir check
_irods_path_list.append(irods_obj.irods_path)
# Actual check
in_common_links = self._irods_path_in_common_links(irods_obj.irods_path)
if in_common_links:
if self._irods_path_in_common_links(irods_obj.irods_path):
in_common_links = True
break

# Check if requested directory is in iRODS path
directory_in_path = directory_name in sum(
[path_.split("/") for path_ in _irods_path_list], []
)

# Filter: not in common links; directory must be part of path
if directory_in_path and not in_common_links:
# Update output if: not in common links and any id is part of SODAR path
# Assumption: the path will include at most one library name
if not in_common_links:
all_directories = sum([path_.split("/") for path_ in _irods_path_list], [])
for id_ in identifiers:
if any([id_ in path_ for path_ in _irods_path_list]):
if any([id_ == dir_ for dir_ in all_directories]):
output_dict[id_].extend(value)
break

return output_dict

Expand Down
40 changes: 22 additions & 18 deletions tests/test_snappy_pull_raw_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def pull_raw_data():
"sodar_url": "https://sodar.bihealth.org/",
"dry_run": False,
"overwrite": False,
"sodar_directory": None,
"use_library_name": False,
"tsv_shortcut": "germline",
"first_batch": 0,
"last_batch": None,
Expand Down Expand Up @@ -133,8 +133,8 @@ def remote_files_all(remote_files_fastq, remote_files_vcf):


@pytest.fixture
def library_to_irods_dict():
"""Returns example of output from PullRawDataCommand.pair_ipath_with_folder_name()"""
def sample_to_irods_dict():
"""Returns example of output from PullRawDataCommand.pair_ipath_with_folder_name() based on sample names."""
p0001_sodar_path = (
"/sodar_path/.../assay_99999999-aaa-bbbb-cccc-99999999/P001-N1-DNA1-WES1/1999-09-09"
)
Expand Down Expand Up @@ -173,6 +173,16 @@ def library_to_irods_dict():
}


@pytest.fixture
def library_to_irods_dict(sample_to_irods_dict):
    """Return the ``sample_to_irods_dict`` fixture re-keyed by library name.

    Every key of the sample-based dictionary gets the library suffix
    ``-N1-DNA1-WES1`` appended; the values are reused unchanged.

    :param sample_to_irods_dict: Fixture dictionary keyed by sample name.
    :type sample_to_irods_dict: dict

    :return: New dictionary with the same values, keyed by library name.
    """
    library_suffix = "-N1-DNA1-WES1"
    return {
        sample_name + library_suffix: irods_objects
        for sample_name, irods_objects in sample_to_irods_dict.items()
    }


def test_run_snappy_pull_raw_help(capsys):
"""Test ``cubi-tk snappy pull-raw-data --help``"""
parser, _subparsers = setup_argparse()
Expand Down Expand Up @@ -225,25 +235,19 @@ def test_pull_raw_data_filter_irods_collection_plus_dir_name(
):
"""Tests PullRawDataCommand.filter_irods_collection_by_library_name_in_path() - FASTQ files"""
# Define input
absent_sample_list = ["P098", "P099"]
samples_list = ["P001", "P002"]
absent_sample_list = ["P098-N1-DNA1-WES1", "P099-N1-DNA1-WES1"]
samples_list = ["P001-N1-DNA1-WES1", "P002-N1-DNA1-WES1"]
file_type = "fastq"

# Call with library names as identifiers
actual = pull_raw_data.filter_irods_collection_plus_dir_name(
identifiers=samples_list,
directory_name="raw_data",
remote_files_dict=remote_files_all,
file_type=file_type,
actual = pull_raw_data.filter_irods_collection_by_library_name_in_path(
identifiers=samples_list, remote_files_dict=remote_files_all, file_type=file_type
)
assert actual == library_to_irods_dict

# Sanity check - should return empty dictionary, samples aren't present
actual = pull_raw_data.filter_irods_collection_plus_dir_name(
identifiers=absent_sample_list,
directory_name="raw_data",
remote_files_dict=remote_files_fastq,
file_type=file_type,
actual = pull_raw_data.filter_irods_collection_by_library_name_in_path(
identifiers=absent_sample_list, remote_files_dict=remote_files_fastq, file_type=file_type
)
assert len(actual) == 0

Expand All @@ -258,7 +262,7 @@ def test_pull_raw_data_get_library_to_irods_dict(pull_raw_data, remote_files_fas
assert all([str(irods.file_name).startswith(id_) for irods in actual.get(id_)])


def test_pull_raw_data_pair_ipath_with_folder_name(pull_raw_data, library_to_irods_dict):
def test_pull_raw_data_pair_ipath_with_folder_name(pull_raw_data, sample_to_irods_dict):
"""Tests PullRawDataCommand.pair_ipath_with_outdir()"""
# Define input
out_dir = "out_dir"
Expand Down Expand Up @@ -299,7 +303,7 @@ def test_pull_raw_data_pair_ipath_with_folder_name(pull_raw_data, library_to_iro

# Test with correct assay UUID - directory structure same as in SODAR
actual = pull_raw_data.pair_ipath_with_outdir(
library_to_irods_dict=library_to_irods_dict,
library_to_irods_dict=sample_to_irods_dict,
identifiers_tuples=identifiers_tup,
output_dir=out_dir,
assay_uuid=assay_uuid,
Expand All @@ -308,7 +312,7 @@ def test_pull_raw_data_pair_ipath_with_folder_name(pull_raw_data, library_to_iro

# Test with wrong assay UUID - all files copied to root of output directory
actual = pull_raw_data.pair_ipath_with_outdir(
library_to_irods_dict=library_to_irods_dict,
library_to_irods_dict=sample_to_irods_dict,
identifiers_tuples=identifiers_tup,
output_dir=out_dir,
assay_uuid=wrong_assay_uuid,
Expand Down

0 comments on commit a33f876

Please sign in to comment.