From f507fe122668981eabea9d19cb899d486ea64b7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Fri, 13 Dec 2024 16:59:25 +0000 Subject: [PATCH] test: adapt `TestCommonProteinCodingFeatureLogic` --- tests/gentropy/dataset/test_l2g_feature.py | 84 +++++++++++++--------- 1 file changed, 52 insertions(+), 32 deletions(-) diff --git a/tests/gentropy/dataset/test_l2g_feature.py b/tests/gentropy/dataset/test_l2g_feature.py index feb8e449a..9320b3aa4 100644 --- a/tests/gentropy/dataset/test_l2g_feature.py +++ b/tests/gentropy/dataset/test_l2g_feature.py @@ -238,9 +238,11 @@ def sample_variant_index_schema() -> StructType: ArrayType( StructType( [ + StructField("distanceFromFootprint", LongType(), True), StructField("distanceFromTss", LongType(), True), StructField("targetId", StringType(), True), StructField("isEnsemblCanonical", BooleanType(), True), + StructField("biotype", StringType(), True), ] ) ), @@ -624,13 +626,17 @@ def _setup( [ { "distanceFromTss": 10, + "distanceFromFootprint": 0, "targetId": "gene1", "isEnsemblCanonical": True, + "biotype": "protein_coding", }, { "distanceFromTss": 2, + "distanceFromFootprint": 0, "targetId": "gene2", "isEnsemblCanonical": True, + "biotype": "protein_coding", }, ], ), @@ -643,8 +649,10 @@ def _setup( [ { "distanceFromTss": 5, + "distanceFromFootprint": 0, "targetId": "gene1", "isEnsemblCanonical": True, + "biotype": "protein_coding", }, ], ), @@ -928,9 +936,8 @@ class TestCommonProteinCodingFeatureLogic: [ ( [ - {"studyLocusId": "1", "geneId": "gene1", "isProteinCoding500kb": 1}, - {"studyLocusId": "1", "geneId": "gene2", "isProteinCoding500kb": 1}, - {"studyLocusId": "1", "geneId": "gene3", "isProteinCoding500kb": 0}, + {"studyLocusId": "1", "geneId": "gene1", "isProteinCoding": 1.0}, + {"studyLocusId": "1", "geneId": "gene2", "isProteinCoding": 0.0}, ] ), ], @@ -944,17 +951,16 @@ def test_is_protein_coding_feature_logic( observed_df = ( is_protein_coding_feature_logic( study_loci_to_annotate=self.sample_study_locus, - gene_index=self.sample_gene_index, - feature_name="isProteinCoding500kb", - genomic_window=500000, + variant_index=self.sample_variant_index, + feature_name="isProteinCoding", ) - .select("studyLocusId", "geneId", "isProteinCoding500kb") + .select("studyLocusId", "geneId", "isProteinCoding") .orderBy("studyLocusId", "geneId") ) expected_df = ( spark.createDataFrame(expected_data) - .select("studyLocusId", "geneId", "isProteinCoding500kb") + .select("studyLocusId", "geneId", "isProteinCoding") .orderBy("studyLocusId", "geneId") ) assert ( @@ -962,7 +968,11 @@ def test_is_protein_coding_feature_logic( ), "Expected and observed DataFrames do not match." @pytest.fixture(autouse=True) - def _setup(self: TestCommonProteinCodingFeatureLogic, spark: SparkSession) -> None: + def _setup( + self: TestCommonProteinCodingFeatureLogic, + spark: SparkSession, + sample_variant_index_schema: StructType, + ) -> None: """Set up sample data for the test.""" # Sample study locus data self.sample_study_locus = StudyLocus( @@ -974,39 +984,47 @@ def _setup(self: TestCommonProteinCodingFeatureLogic, spark: SparkSession) -> No "studyId": "study1", "chromosome": "1", "position": 1000000, + "locus": [ + { + "variantId": "var1", + }, + ], }, ], StudyLocus.get_schema(), ), _schema=StudyLocus.get_schema(), ) - - # Sample gene index data with biotype - self.sample_gene_index = GeneIndex( + self.sample_variant_index = VariantIndex( _df=spark.createDataFrame( [ - { - "geneId": "gene1", - "chromosome": "1", - "tss": 950000, - "biotype": "protein_coding", - }, - { - "geneId": "gene2", - "chromosome": "1", - "tss": 1050000, - "biotype": "protein_coding", - }, - { - "geneId": "gene3", - "chromosome": "1", - "tss": 1010000, - "biotype": "non_coding", - }, + ( + "var1", + "chrom", + 1, + "A", + "T", + [ + { + "distanceFromFootprint": 0, + "distanceFromTss": 10, + "targetId": "gene1", + "biotype": "protein_coding", + "isEnsemblCanonical": True, + }, + { + "distanceFromFootprint": 0, + "distanceFromTss": 20, + "targetId": "gene2", + "biotype": "non_coding", + "isEnsemblCanonical": True, + }, + ], + ), ], - GeneIndex.get_schema(), + sample_variant_index_schema, ), - _schema=GeneIndex.get_schema(), + _schema=VariantIndex.get_schema(), ) @@ -1067,8 +1085,10 @@ def _setup( [ { "distanceFromTss": 10, + "distanceFromFootprint": 0, "targetId": "gene1", "isEnsemblCanonical": True, + "biotype": "protein_coding", }, ], )