diff --git a/.vscode/settings.json b/.vscode/settings.json
index aaafda97f..ecc456889 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -25,5 +25,8 @@
"python.testing.pytestEnabled": true,
"mypy-type-checker.severity": {
"error": "Information"
- }
+ },
+ "yaml.extension.recommendations": false,
+ "workbench.remoteIndicator.showExtensionRecommendations": false,
+ "extensions.ignoreRecommendations": true
}
diff --git a/config/datasets/ot_gcp.yaml b/config/datasets/ot_gcp.yaml
index c8f67d418..c61863b86 100644
--- a/config/datasets/ot_gcp.yaml
+++ b/config/datasets/ot_gcp.yaml
@@ -37,13 +37,13 @@ gnomad_public_bucket: gs://gcp-public-data--gnomad/release/
ld_matrix_template: ${datasets.gnomad_public_bucket}/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.adj.ld.bm
ld_index_raw_template: ${datasets.gnomad_public_bucket}/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.ld.variant_indices.ht
liftover_ht_path: ${datasets.gnomad_public_bucket}/2.1.1/liftover_grch38/ht/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.ht
-# variant_annotation
+# GnomAD variant set:
gnomad_genomes_path: ${datasets.gnomad_public_bucket}4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/
# Others
chain_38_37: gs://hail-common/references/grch38_to_grch37.over.chain.gz
chain_37_38: ${datasets.static_assets}/grch37_to_grch38.over.chain
-vep_consequences: ${datasets.static_assets}/vep_consequences.tsv
+vep_consequences: ${datasets.static_assets}/variant_consequence_to_score.tsv
anderson: ${datasets.static_assets}/andersson2014/enhancer_tss_associations.bed
javierre: ${datasets.static_assets}/javierre_2016_preprocessed
jung: ${datasets.static_assets}/jung2019_pchic_tableS3.csv
@@ -55,7 +55,7 @@ finngen_finemapping_results_path: ${datasets.inputs}/Finngen_susie_finemapping_r
finngen_finemapping_summaries_path: ${datasets.inputs}/Finngen_susie_finemapping_r10/Finngen_susie_credset_summary_r10.tsv
# Dev output datasets
-variant_annotation: ${datasets.outputs}/variant_annotation
+gnomad_variants: ${datasets.outputs}/gnomad_variants
study_locus: ${datasets.outputs}/study_locus
summary_statistics: ${datasets.outputs}/summary_statistics
study_locus_overlap: ${datasets.outputs}/study_locus_overlap
@@ -68,6 +68,8 @@ catalog_study_locus: ${datasets.study_locus}/catalog_study_locus
from_sumstats_study_locus: ${datasets.study_locus}/from_sumstats
from_sumstats_pics: ${datasets.credible_set}/from_sumstats
+vep_output_path: gs://genetics_etl_python_playground/vep/full_variant_index_vcf
+
# ETL output datasets:
l2g_gold_standard_curation: ${datasets.release_folder}/locus_to_gene_gold_standard.json
l2g_model: ${datasets.release_folder}/locus_to_gene_model/classifier.skops
@@ -78,4 +80,4 @@ study_index: ${datasets.release_folder}/study_index
variant_index: ${datasets.release_folder}/variant_index
credible_set: ${datasets.release_folder}/credible_set
gene_index: ${datasets.release_folder}/gene_index
-v2g: ${datasets.release_folder}/variant_to_gene
+variant_to_gene: ${datasets.release_folder}/variant_to_gene
diff --git a/config/step/ot_variant_annotation.yaml b/config/step/ot_variant_annotation.yaml
deleted file mode 100644
index 55a9503ce..000000000
--- a/config/step/ot_variant_annotation.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-defaults:
- - variant_annotation
-
-variant_annotation_path: ${datasets.variant_annotation}
-gnomad_genomes_path: ${datasets.gnomad_genomes_path}
-chain_38_37: ${datasets.chain_38_37}
-gnomad_variant_populations:
- - afr # African-American
- - amr # American Admixed/Latino
- - ami # Amish ancestry
- - asj # Ashkenazi Jewish
- - eas # East Asian
- - fin # Finnish
- - nfe # Non-Finnish European
- - mid # Middle Eastern
- - sas # South Asian
- - remaining # Other
-# The version will of the gnomad will be inferred from ld_matrix_template and appended to the ld_index_out.
-use_version_from_input: true
diff --git a/config/step/ot_variant_index.yaml b/config/step/ot_variant_index.yaml
index 3834196b2..00b6b1602 100644
--- a/config/step/ot_variant_index.yaml
+++ b/config/step/ot_variant_index.yaml
@@ -1,6 +1,6 @@
defaults:
- variant_index
-variant_annotation_path: ${datasets.variant_annotation}
-credible_set_path: ${datasets.credible_set}
+vep_output_json_path: ${datasets.vep_output_path}
+gnomad_variant_annotations_path: ${datasets.gnomad_variants}
variant_index_path: ${datasets.variant_index}
diff --git a/config/step/ot_variant_to_gene.yaml b/config/step/ot_variant_to_gene.yaml
index 1ac6d2fbe..7187a0625 100644
--- a/config/step/ot_variant_to_gene.yaml
+++ b/config/step/ot_variant_to_gene.yaml
@@ -2,7 +2,6 @@ defaults:
- variant_to_gene
variant_index_path: ${datasets.variant_index}
-variant_annotation_path: ${datasets.variant_annotation}
gene_index_path: ${datasets.gene_index}
vep_consequences_path: ${datasets.vep_consequences}
liftover_chain_file_path: ${datasets.chain_37_38}
@@ -11,4 +10,4 @@ interval_sources:
javierre: ${datasets.javierre}
jung: ${datasets.jung}
thurman: ${datasets.thurman}
-v2g_path: ${datasets.v2g}
+v2g_path: ${datasets.variant_to_gene}
diff --git a/docs/assets/imgs/ensembl_logo.png b/docs/assets/imgs/ensembl_logo.png
new file mode 100644
index 000000000..83fd25751
Binary files /dev/null and b/docs/assets/imgs/ensembl_logo.png differ
diff --git a/docs/python_api/datasets/variant_annotation.md b/docs/python_api/datasets/variant_annotation.md
deleted file mode 100644
index f82210b6e..000000000
--- a/docs/python_api/datasets/variant_annotation.md
+++ /dev/null
@@ -1,9 +0,0 @@
----
-title: Variant annotation
----
-
-::: gentropy.dataset.variant_annotation.VariantAnnotation
-
-## Schema
-
---8<-- "assets/schemas/variant_annotation.md"
diff --git a/docs/python_api/datasources/_datasources.md b/docs/python_api/datasources/_datasources.md
index f05a2b834..e6e081b21 100644
--- a/docs/python_api/datasources/_datasources.md
+++ b/docs/python_api/datasources/_datasources.md
@@ -23,7 +23,8 @@ This section contains information about the data source harmonisation tools avai
## Variant annotation/validation
1. [GnomAD](gnomad/_gnomad.md) v4.0
-1. GWAS catalog harmonisation pipeline [more info](https://www.ebi.ac.uk/gwas/docs/methods/summary-statistics#_harmonised_summary_statistics_data)
+2. GWAS catalog's [harmonisation pipeline](https://www.ebi.ac.uk/gwas/docs/methods/summary-statistics#_harmonised_summary_statistics_data)
+3. Ensembl's [Variant Effect Predictor](https://www.ensembl.org/info/docs/tools/vep/index.html)
## Linkage desiquilibrium
diff --git a/docs/python_api/datasources/ensembl/_ensembl.md b/docs/python_api/datasources/ensembl/_ensembl.md
new file mode 100644
index 000000000..56c49870a
--- /dev/null
+++ b/docs/python_api/datasources/ensembl/_ensembl.md
@@ -0,0 +1,10 @@
+---
+title: Ensembl annotations
+---
+
+
+
+
+Ensembl
+
+
+[Ensembl](https://www.ensembl.org/index.html) provides a diverse set of genetic data that Gentropy takes advantage of, including gene sets and variant annotations.
diff --git a/docs/python_api/datasources/ensembl/variant_effect_predictor_parser.md b/docs/python_api/datasources/ensembl/variant_effect_predictor_parser.md
new file mode 100644
index 000000000..c956ed3e1
--- /dev/null
+++ b/docs/python_api/datasources/ensembl/variant_effect_predictor_parser.md
@@ -0,0 +1,5 @@
+---
+title: Variant effect predictor parser
+---
+
+::: gentropy.datasource.ensembl.vep_parser.VariantEffectPredictorParser
diff --git a/docs/python_api/steps/ld_index.md b/docs/python_api/steps/ld_index.md
index bf8b9b58e..d8f61a528 100644
--- a/docs/python_api/steps/ld_index.md
+++ b/docs/python_api/steps/ld_index.md
@@ -1,5 +1,5 @@
---
-title: ld_index
+title: GnomAD Linkage data ingestion
---
-::: gentropy.ld_index.LDIndexStep
+::: gentropy.gnomad_ingestion.LDIndexStep
diff --git a/docs/python_api/steps/variant_annotation_step.md b/docs/python_api/steps/variant_annotation_step.md
index e65a071b2..2b5582df4 100644
--- a/docs/python_api/steps/variant_annotation_step.md
+++ b/docs/python_api/steps/variant_annotation_step.md
@@ -1,5 +1,5 @@
---
-title: variant_annotation
+title: GnomAD variant data ingestion
---
-::: gentropy.variant_annotation.VariantAnnotationStep
+::: gentropy.gnomad_ingestion.GnomadVariantIndexStep
diff --git a/docs/python_api/steps/variant_to_gene_step.md b/docs/python_api/steps/variant_to_gene_step.md
index 1a3e56af8..db1c1fd20 100644
--- a/docs/python_api/steps/variant_to_gene_step.md
+++ b/docs/python_api/steps/variant_to_gene_step.md
@@ -2,4 +2,4 @@
title: variant_to_gene
---
-::: gentropy.v2g.V2GStep
+::: gentropy.variant_to_gene.V2GStep
diff --git a/poetry.lock b/poetry.lock
index b181572d4..432c3d4cc 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -6881,6 +6881,7 @@ files = [
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+ {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
diff --git a/src/airflow/dags/gnomad_preprocess.py b/src/airflow/dags/gnomad_preprocess.py
index 22e7fa056..03962bec5 100644
--- a/src/airflow/dags/gnomad_preprocess.py
+++ b/src/airflow/dags/gnomad_preprocess.py
@@ -1,4 +1,4 @@
-"""Airflow DAG for the Preprocess part of the pipeline."""
+"""Airflow DAG for the Preprocess GnomAD datasets - LD index and GnomAD variant set."""
from __future__ import annotations
@@ -11,13 +11,13 @@
ALL_STEPS = [
"ot_ld_index",
- "ot_variant_annotation",
+ "ot_gnomad_variants",
]
with DAG(
dag_id=Path(__file__).stem,
- description="Open Targets Genetics — Preprocess",
+ description="Open Targets Genetics — GnomAD Preprocess",
default_args=common.shared_dag_args,
**common.shared_dag_kwargs,
):
diff --git a/src/gentropy/assets/data/so_mappings.json b/src/gentropy/assets/data/so_mappings.json
new file mode 100644
index 000000000..8a087b6f7
--- /dev/null
+++ b/src/gentropy/assets/data/so_mappings.json
@@ -0,0 +1,43 @@
+{
+ "transcript_ablation": "SO_0001893",
+ "splice_acceptor_variant": "SO_0001574",
+ "splice_donor_variant": "SO_0001575",
+ "stop_gained": "SO_0001587",
+ "frameshift_variant": "SO_0001589",
+ "stop_lost": "SO_0001578",
+ "start_lost": "SO_0002012",
+ "transcript_amplification": "SO_0001889",
+ "feature_elongation": "SO_0001907",
+ "feature_truncation": "SO_0001906",
+ "inframe_insertion": "SO_0001821",
+ "inframe_deletion": "SO_0001822",
+ "missense_variant": "SO_0001583",
+ "protein_altering_variant": "SO_0001818",
+ "splice_donor_5th_base_variant": "SO_0001787",
+ "splice_region_variant": "SO_0001630",
+ "splice_donor_region_variant": "SO_0002170",
+ "splice_polypyrimidine_tract_variant": "SO_0002169",
+ "incomplete_terminal_codon_variant": "SO_0001626",
+ "start_retained_variant": "SO_0002019",
+ "stop_retained_variant": "SO_0001567",
+ "synonymous_variant": "SO_0001819",
+ "coding_sequence_variant": "SO_0001580",
+ "mature_miRNA_variant": "SO_0001620",
+ "5_prime_UTR_variant": "SO_0001623",
+ "3_prime_UTR_variant": "SO_0001624",
+ "non_coding_transcript_exon_variant": "SO_0001792",
+ "intron_variant": "SO_0001627",
+ "NMD_transcript_variant": "SO_0001621",
+ "non_coding_transcript_variant": "SO_0001619",
+ "coding_transcript_variant": "SO_0001968",
+ "upstream_gene_variant": "SO_0001631",
+ "downstream_gene_variant": "SO_0001632",
+ "TFBS_ablation": "SO_0001895",
+ "TFBS_amplification": "SO_0001892",
+ "TF_binding_site_variant": "SO_0001782",
+ "regulatory_region_ablation": "SO_0001894",
+ "regulatory_region_amplification": "SO_0001891",
+ "regulatory_region_variant": "SO_0001566",
+ "intergenic_variant": "SO_0001628",
+ "sequence_variant": "SO_0001060"
+}
diff --git a/src/gentropy/assets/schemas/variant_annotation.json b/src/gentropy/assets/schemas/variant_annotation.json
deleted file mode 100644
index ab8767389..000000000
--- a/src/gentropy/assets/schemas/variant_annotation.json
+++ /dev/null
@@ -1,215 +0,0 @@
-{
- "type": "struct",
- "fields": [
- {
- "name": "variantId",
- "type": "string",
- "nullable": false,
- "metadata": {}
- },
- {
- "name": "chromosome",
- "type": "string",
- "nullable": false,
- "metadata": {}
- },
- {
- "name": "position",
- "type": "integer",
- "nullable": false,
- "metadata": {}
- },
- {
- "name": "referenceAllele",
- "type": "string",
- "nullable": false,
- "metadata": {}
- },
- {
- "name": "alternateAllele",
- "type": "string",
- "nullable": false,
- "metadata": {}
- },
- {
- "name": "chromosomeB37",
- "type": "string",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "positionB37",
- "type": "integer",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "alleleType",
- "type": "string",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "rsIds",
- "type": {
- "type": "array",
- "elementType": "string",
- "containsNull": true
- },
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "alleleFrequencies",
- "type": {
- "type": "array",
- "elementType": {
- "type": "struct",
- "fields": [
- {
- "name": "populationName",
- "type": "string",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "alleleFrequency",
- "type": "double",
- "nullable": true,
- "metadata": {}
- }
- ]
- },
- "containsNull": true
- },
- "nullable": false,
- "metadata": {}
- },
- {
- "name": "inSilicoPredictors",
- "nullable": false,
- "metadata": {},
- "type": {
- "type": "struct",
- "fields": [
- {
- "name": "cadd",
- "nullable": true,
- "metadata": {},
- "type": {
- "type": "struct",
- "fields": [
- {
- "name": "raw",
- "type": "float",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "phred",
- "type": "float",
- "nullable": true,
- "metadata": {}
- }
- ]
- }
- },
- {
- "name": "revelMax",
- "type": "double",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "spliceaiDsMax",
- "type": "float",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "pangolinLargestDs",
- "type": "double",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "phylop",
- "type": "double",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "siftMax",
- "type": "double",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "polyphenMax",
- "type": "double",
- "nullable": true,
- "metadata": {}
- }
- ]
- }
- },
- {
- "name": "vep",
- "type": {
- "type": "struct",
- "fields": [
- {
- "name": "mostSevereConsequence",
- "type": "string",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "transcriptConsequences",
- "type": {
- "type": "array",
- "elementType": {
- "type": "struct",
- "fields": [
- {
- "name": "aminoAcids",
- "type": "string",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "consequenceTerms",
- "type": {
- "type": "array",
- "elementType": "string",
- "containsNull": true
- },
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "geneId",
- "type": "string",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "lof",
- "type": "string",
- "nullable": true,
- "metadata": {}
- }
- ]
- },
- "containsNull": true
- },
- "nullable": true,
- "metadata": {}
- }
- ]
- },
- "nullable": false,
- "metadata": {}
- }
- ]
-}
diff --git a/src/gentropy/assets/schemas/variant_index.json b/src/gentropy/assets/schemas/variant_index.json
index c6a3702c9..d9be4e50f 100644
--- a/src/gentropy/assets/schemas/variant_index.json
+++ b/src/gentropy/assets/schemas/variant_index.json
@@ -1,53 +1,188 @@
{
- "type": "struct",
"fields": [
{
+ "metadata": {},
"name": "variantId",
- "type": "string",
"nullable": false,
- "metadata": {}
+ "type": "string"
},
{
+ "metadata": {},
"name": "chromosome",
- "type": "string",
"nullable": false,
- "metadata": {}
+ "type": "string"
},
{
+ "metadata": {},
"name": "position",
- "type": "integer",
"nullable": false,
- "metadata": {}
+ "type": "integer"
},
{
+ "metadata": {},
"name": "referenceAllele",
- "type": "string",
"nullable": false,
- "metadata": {}
+ "type": "string"
},
{
+ "metadata": {},
"name": "alternateAllele",
- "type": "string",
"nullable": false,
- "metadata": {}
+ "type": "string"
},
{
- "name": "chromosomeB37",
- "type": "string",
- "nullable": true,
- "metadata": {}
+ "metadata": {},
+ "name": "inSilicoPredictors",
+ "nullable": false,
+ "type": {
+ "containsNull": true,
+ "elementType": {
+ "fields": [
+ {
+ "metadata": {},
+ "name": "method",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "assessment",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "score",
+ "nullable": true,
+ "type": "float"
+ },
+ {
+ "metadata": {},
+ "name": "assessmentFlag",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "targetId",
+ "nullable": true,
+ "type": "string"
+ }
+ ],
+ "type": "struct"
+ },
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "mostSevereConsequenceId",
+ "nullable": false,
+ "type": "string"
},
{
- "name": "positionB37",
- "type": "integer",
+ "metadata": {},
+ "name": "transcriptConsequences",
"nullable": true,
- "metadata": {}
+ "type": {
+ "containsNull": false,
+ "elementType": {
+ "fields": [
+ {
+ "metadata": {},
+ "name": "variantFunctionalConsequenceIds",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "aminoAcidChange",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "uniprotAccessions",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "isEnsemblCanonical",
+ "nullable": false,
+ "type": "boolean"
+ },
+ {
+ "metadata": {},
+ "name": "codons",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "distance",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "targetId",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "impact",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "lofteePrediction",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "siftPrediction",
+ "nullable": true,
+ "type": "float"
+ },
+ {
+ "metadata": {},
+ "name": "polyphenPrediction",
+ "nullable": true,
+ "type": "float"
+ },
+ {
+ "metadata": {},
+ "name": "transcriptId",
+ "nullable": true,
+ "type": "string"
+ }
+ ],
+ "type": "struct"
+ },
+ "type": "array"
+ }
},
{
- "name": "alleleType",
- "type": "string",
- "nullable": false,
- "metadata": {}
+ "metadata": {},
+ "name": "rsIds",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
},
{
"name": "alleleFrequencies",
@@ -76,88 +211,31 @@
"metadata": {}
},
{
- "name": "inSilicoPredictors",
- "nullable": false,
"metadata": {},
+ "name": "dbXrefs",
+ "nullable": false,
"type": {
- "type": "struct",
- "fields": [
- {
- "name": "cadd",
- "nullable": true,
- "metadata": {},
- "type": {
- "type": "struct",
- "fields": [
- {
- "name": "raw",
- "type": "float",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "phred",
- "type": "float",
- "nullable": true,
- "metadata": {}
- }
- ]
+ "containsNull": true,
+ "elementType": {
+ "fields": [
+ {
+ "metadata": {},
+ "name": "id",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "source",
+ "nullable": true,
+ "type": "string"
}
- },
- {
- "name": "revelMax",
- "type": "double",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "spliceaiDsMax",
- "type": "float",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "pangolinLargestDs",
- "type": "double",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "phylop",
- "type": "double",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "siftMax",
- "type": "double",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "polyphenMax",
- "type": "double",
- "nullable": true,
- "metadata": {}
- }
- ]
+ ],
+ "type": "struct"
+ },
+ "type": "array"
}
- },
- {
- "name": "mostSevereConsequence",
- "type": "string",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "rsIds",
- "type": {
- "type": "array",
- "elementType": "string",
- "containsNull": true
- },
- "nullable": true,
- "metadata": {}
}
- ]
+ ],
+ "type": "struct"
}
diff --git a/src/gentropy/assets/schemas/vep_json_output.json b/src/gentropy/assets/schemas/vep_json_output.json
new file mode 100644
index 000000000..ecad3ea1e
--- /dev/null
+++ b/src/gentropy/assets/schemas/vep_json_output.json
@@ -0,0 +1,531 @@
+{
+ "fields": [
+ {
+ "metadata": {},
+ "name": "allele_string",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "assembly_name",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "intergenic_consequences",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": {
+ "fields": [
+ {
+ "metadata": {},
+ "name": "cadd_phred",
+ "nullable": true,
+ "type": "double"
+ },
+ {
+ "metadata": {},
+ "name": "cadd_raw",
+ "nullable": true,
+ "type": "double"
+ },
+ {
+ "metadata": {},
+ "name": "consequence_terms",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "impact",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "variant_allele",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "gene_id",
+ "nullable": true,
+ "type": "string"
+ }
+ ],
+ "type": "struct"
+ },
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "colocated_variants",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": {
+ "fields": [
+ {
+ "metadata": {},
+ "name": "allele_string",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "clin_sig",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "clin_sig_allele",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "end",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "id",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "phenotype_or_disease",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "pubmed",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "long",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "seq_region_name",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "start",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "strand",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "var_synonyms",
+ "nullable": true,
+ "type": {
+ "fields": [
+ {
+ "metadata": {},
+ "name": "ClinVar",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "LMDD",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "OIVD",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "OMIM",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "double",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "PharmGKB",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "PhenCode",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "UniProt",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "dbPEX",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ }
+ ],
+ "type": "struct"
+ }
+ }
+ ],
+ "type": "struct"
+ },
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "end",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "id",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "input",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "most_severe_consequence",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "seq_region_name",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "start",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "strand",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "transcript_consequences",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": {
+ "fields": [
+ {
+ "metadata": {},
+ "name": "alphamissense",
+ "nullable": true,
+ "type": {
+ "fields": [
+ {
+ "metadata": {},
+ "name": "am_class",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "am_pathogenicity",
+ "nullable": true,
+ "type": "double"
+ }
+ ],
+ "type": "struct"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "amino_acids",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "cadd_phred",
+ "nullable": true,
+ "type": "double"
+ },
+ {
+ "metadata": {},
+ "name": "cadd_raw",
+ "nullable": true,
+ "type": "double"
+ },
+ {
+ "metadata": {},
+ "name": "canonical",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "cdna_end",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "cdna_start",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "cds_end",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "cds_start",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "codons",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "consequence_terms",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "distance",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "flags",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "gene_id",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "impact",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "lof",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "lof_filter",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "lof_flags",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "lof_info",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "polyphen_prediction",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "polyphen_score",
+ "nullable": true,
+ "type": "double"
+ },
+ {
+ "metadata": {},
+ "name": "protein_end",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "protein_start",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "sift_prediction",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "sift_score",
+ "nullable": true,
+ "type": "double"
+ },
+ {
+ "metadata": {},
+ "name": "strand",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "swissprot",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "transcript_id",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "trembl",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "uniparc",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "uniprot_isoform",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "variant_allele",
+ "nullable": true,
+ "type": "string"
+ }
+ ],
+ "type": "struct"
+ },
+ "type": "array"
+ }
+ }
+ ],
+ "type": "struct"
+}
diff --git a/src/gentropy/config.py b/src/gentropy/config.py
index 4ebf4cbe8..f7ba73e98 100644
--- a/src/gentropy/config.py
+++ b/src/gentropy/config.py
@@ -180,7 +180,7 @@ class LDIndexConfig(StepConfig):
]
)
use_version_from_input: bool = False
- _target_: str = "gentropy.ld_index.LDIndexStep"
+ _target_: str = "gentropy.gnomad_ingestion.LDIndexStep"
@dataclass
@@ -302,8 +302,8 @@ class UkbPppEurConfig(StepConfig):
@dataclass
-class VariantAnnotationConfig(StepConfig):
- """Variant annotation step configuration."""
+class GnomadVariantConfig(StepConfig):
+ """Gnomad variant ingestion step configuration."""
session: Any = field(
default_factory=lambda: {
@@ -312,7 +312,6 @@ class VariantAnnotationConfig(StepConfig):
)
variant_annotation_path: str = MISSING
gnomad_genomes_path: str = "gs://gcp-public-data--gnomad/release/4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/"
- chain_38_37: str = "gs://hail-common/references/grch38_to_grch37.over.chain.gz"
gnomad_variant_populations: list[str] = field(
default_factory=lambda: [
"afr", # African-American
@@ -328,16 +327,16 @@ class VariantAnnotationConfig(StepConfig):
]
)
use_version_from_input: bool = False
- _target_: str = "gentropy.variant_annotation.VariantAnnotationStep"
+ _target_: str = "gentropy.gnomad_ingestion.GnomadVariantIndexStep"
@dataclass
class VariantIndexConfig(StepConfig):
"""Variant index step configuration."""
- variant_annotation_path: str = MISSING
- credible_set_path: str = MISSING
+ vep_output_json_path: str = MISSING
variant_index_path: str = MISSING
+ gnomad_variant_annotations_path: str | None = None
_target_: str = "gentropy.variant_index.VariantIndexStep"
@@ -346,7 +345,6 @@ class VariantToGeneConfig(StepConfig):
"""V2G step configuration."""
variant_index_path: str = MISSING
- variant_annotation_path: str = MISSING
gene_index_path: str = MISSING
vep_consequences_path: str = MISSING
liftover_chain_file_path: str = MISSING
@@ -371,7 +369,7 @@ class VariantToGeneConfig(StepConfig):
)
interval_sources: Dict[str, str] = field(default_factory=dict)
v2g_path: str = MISSING
- _target_: str = "gentropy.v2g.V2GStep"
+ _target_: str = "gentropy.variant_to_gene.V2GStep"
@dataclass
@@ -491,8 +489,8 @@ def register_config() -> None:
)
cs.store(group="step", name="pics", node=PICSConfig)
+ cs.store(group="step", name="variant_annotation", node=GnomadVariantConfig)
cs.store(group="step", name="ukb_ppp_eur_sumstat_preprocess", node=UkbPppEurConfig)
- cs.store(group="step", name="variant_annotation", node=VariantAnnotationConfig)
cs.store(group="step", name="variant_index", node=VariantIndexConfig)
cs.store(group="step", name="variant_to_gene", node=VariantToGeneConfig)
cs.store(
diff --git a/src/gentropy/dataset/variant_index.py b/src/gentropy/dataset/variant_index.py
index 4bafbc288..a9e24dcab 100644
--- a/src/gentropy/dataset/variant_index.py
+++ b/src/gentropy/dataset/variant_index.py
@@ -1,78 +1,295 @@
-"""Variant index dataset."""
+"""Dataset definition for variant annotation."""
+
from __future__ import annotations
from dataclasses import dataclass
from typing import TYPE_CHECKING
-import pyspark.sql.functions as f
+from pyspark.sql import functions as f
from gentropy.common.schemas import parse_spark_schema
-from gentropy.common.spark_helpers import nullify_empty_array
+from gentropy.common.spark_helpers import (
+ get_record_with_maximum_value,
+ normalise_column,
+)
from gentropy.dataset.dataset import Dataset
-from gentropy.dataset.study_locus import StudyLocus
+from gentropy.dataset.gene_index import GeneIndex
+from gentropy.dataset.v2g import V2G
if TYPE_CHECKING:
+ from pyspark.sql import Column, DataFrame
from pyspark.sql.types import StructType
- from gentropy.dataset.variant_annotation import VariantAnnotation
-
@dataclass
class VariantIndex(Dataset):
- """Variant index dataset.
-
- Variant index dataset is the result of intersecting the variant annotation dataset with the variants with V2D available information.
- """
+ """Dataset for representing variants and methods applied on them."""
@classmethod
def get_schema(cls: type[VariantIndex]) -> StructType:
- """Provides the schema for the VariantIndex dataset.
+ """Provides the schema for the variant index dataset.
Returns:
StructType: Schema for the VariantIndex dataset
"""
return parse_spark_schema("variant_index.json")
- @classmethod
- def from_variant_annotation(
- cls: type[VariantIndex],
- variant_annotation: VariantAnnotation,
- study_locus: StudyLocus,
+ def add_annotation(
+ self: VariantIndex, annotation_source: VariantIndex
) -> VariantIndex:
- """Initialise VariantIndex from pre-existing variant annotation dataset.
+ """Import annotation from another variant index dataset.
+
+ At this point the annotation can be extended with extra cross-references,
+ in-silico predictions and allele frequencies.
Args:
- variant_annotation (VariantAnnotation): Variant annotation dataset
- study_locus (StudyLocus): Study locus dataset with the variants to intersect with the variant annotation dataset
+ annotation_source (VariantIndex): Annotation to add to the dataset
+
+ Returns:
+ VariantIndex: VariantIndex dataset with the annotation added
+ """
+ # Columns in the source dataset:
+ variant_index_columns = [
+ # Combining cross-references:
+ f.array_union(f.col("dbXrefs"), f.col("annotation_dbXrefs"))
+ if row == "dbXrefs"
+ # Combining in silico predictors:
+ else f.array_union(
+ f.col("inSilicoPredictors"), f.col("annotation_inSilicoPredictors")
+ )
+ if row == "inSilicoPredictors"
+ # Combining allele frequencies:
+ else f.array_union(
+ f.col("alleleFrequencies"), f.col("annotation_alleleFrequencies")
+ )
+ if row == "alleleFrequencies"
+ # Carrying over all other columns:
+ else row
+ for row in self.df.columns
+ ]
+
+ # Rename columns in the annotation source to avoid conflicts:
+ annotation = annotation_source.df.select(
+ *[
+ f.col(col).alias(f"annotation_{col}") if col != "variantId" else col
+ for col in annotation_source.df.columns
+ ]
+ )
+
+ # Join the annotation to the dataset:
+ return VariantIndex(
+ _df=(
+ annotation.join(
+ f.broadcast(self.df), on="variantId", how="right"
+ ).select(*variant_index_columns)
+ ),
+ _schema=self.schema,
+ )
+
+ def max_maf(self: VariantIndex) -> Column:
+ """Maximum minor allele frequency across all populations assuming all variants biallelic.
Returns:
- VariantIndex: Variant index dataset
+ Column: Maximum minor allele frequency across all populations.
+
+ Raises:
+ ValueError: Allele frequencies are not present in the dataset.
"""
- unchanged_cols = [
+ if "alleleFrequencies" not in self.df.columns:
+ raise ValueError("Allele frequencies are not present in the dataset.")
+
+ return f.array_max(
+ f.transform(
+ self.df.alleleFrequencies,
+ lambda af: f.when(
+ af.alleleFrequency > 0.5, 1 - af.alleleFrequency
+ ).otherwise(af.alleleFrequency),
+ )
+ )
+
+ def filter_by_variant(self: VariantIndex, df: DataFrame) -> VariantIndex:
+ """Filter variant annotation dataset by a variant dataframe.
+
+ Args:
+ df (DataFrame): A dataframe of variants
+
+ Returns:
+ VariantIndex: A filtered variant annotation dataset
+ """
+ join_columns = ["variantId", "chromosome"]
+
+ assert all(
+ col in df.columns for col in join_columns
+ ), "The variant dataframe must contain the columns 'variantId' and 'chromosome'."
+
+ return VariantIndex(
+ _df=self._df.join(
+ f.broadcast(df.select(*join_columns).distinct()),
+ on=join_columns,
+ how="inner",
+ ),
+ _schema=self.schema,
+ )
+
+ def get_transcript_consequence_df(
+ self: VariantIndex, gene_index: GeneIndex | None = None
+ ) -> DataFrame:
+ """Dataframe of exploded transcript consequences.
+
+ Optionally the transcript consequences can be reduced to the universe of a gene index.
+
+ Args:
+ gene_index (GeneIndex | None): A gene index. Defaults to None.
+
+ Returns:
+ DataFrame: A dataframe exploded by transcript consequences with the columns variantId, chromosome, transcriptConsequence
+ """
+ # exploding the array removes records without VEP annotation
+ transript_consequences = self.df.withColumn(
+ "transcriptConsequence", f.explode("transcriptConsequences")
+ ).select(
"variantId",
"chromosome",
"position",
- "referenceAllele",
- "alternateAllele",
- "chromosomeB37",
- "positionB37",
- "alleleType",
- "alleleFrequencies",
- "inSilicoPredictors",
- ]
- va_slimmed = variant_annotation.filter_by_variant_df(
- study_locus.unique_variants_in_locus()
+ "transcriptConsequence",
+ f.col("transcriptConsequence.targetId").alias("geneId"),
)
- return cls(
+ if gene_index:
+ transript_consequences = transript_consequences.join(
+ f.broadcast(gene_index.df),
+ on=["chromosome", "geneId"],
+ )
+ return transript_consequences
+
+ def get_distance_to_tss(
+ self: VariantIndex,
+ gene_index: GeneIndex,
+ max_distance: int = 500_000,
+ ) -> V2G:
+ """Extracts variant to gene assignments for variants falling within a window of a gene's TSS.
+
+ Args:
+ gene_index (GeneIndex): A gene index to filter by.
+ max_distance (int): The maximum distance from the TSS to consider. Defaults to 500_000.
+
+ Returns:
+ V2G: variant to gene assignments with their distance to the TSS
+ """
+ return V2G(
+ _df=(
+ self.df.alias("variant")
+ .join(
+ f.broadcast(gene_index.locations_lut()).alias("gene"),
+ on=[
+ f.col("variant.chromosome") == f.col("gene.chromosome"),
+ f.abs(f.col("variant.position") - f.col("gene.tss"))
+ <= max_distance,
+ ],
+ how="inner",
+ )
+ .withColumn(
+ "distance", f.abs(f.col("variant.position") - f.col("gene.tss"))
+ )
+ .withColumn(
+ "inverse_distance",
+ max_distance - f.col("distance"),
+ )
+ .transform(lambda df: normalise_column(df, "inverse_distance", "score"))
+ .select(
+ "variantId",
+ f.col("variant.chromosome").alias("chromosome"),
+ "distance",
+ "geneId",
+ "score",
+ f.lit("distance").alias("datatypeId"),
+ f.lit("canonical_tss").alias("datasourceId"),
+ )
+ ),
+ _schema=V2G.get_schema(),
+ )
+
+ def get_plof_v2g(self: VariantIndex, gene_index: GeneIndex) -> V2G:
+ """Creates a dataset with variant to gene assignments with a flag indicating if the variant is predicted to be a loss-of-function variant by the LOFTEE algorithm.
+
+ Optionally the transcript consequences can be reduced to the universe of a gene index.
+
+ Args:
+ gene_index (GeneIndex): A gene index to filter by.
+
+ Returns:
+ V2G: variant to gene assignments from the LOFTEE algorithm
+ """
+ return V2G(
_df=(
- va_slimmed.df.select(
- *unchanged_cols,
- f.col("vep.mostSevereConsequence").alias("mostSevereConsequence"),
- # filters/rsid are arrays that can be empty, in this case we convert them to null
- nullify_empty_array(f.col("rsIds")).alias("rsIds"),
+ self.get_transcript_consequence_df(gene_index)
+ .filter(f.col("transcriptConsequence.lofteePrediction").isNotNull())
+ .withColumn(
+ "isHighQualityPlof",
+ f.when(
+ f.col("transcriptConsequence.lofteePrediction") == "HC", True
+ ).when(
+ f.col("transcriptConsequence.lofteePrediction") == "LC", False
+ ),
+ )
+ .withColumn(
+ "score",
+ f.when(f.col("isHighQualityPlof"), 1.0).when(
+ ~f.col("isHighQualityPlof"), 0
+ ),
+ )
+ .select(
+ "variantId",
+ "chromosome",
+ "geneId",
+ "isHighQualityPlof",
+ f.col("score"),
+ f.lit("vep").alias("datatypeId"),
+ f.lit("loftee").alias("datasourceId"),
+ )
+ ),
+ _schema=V2G.get_schema(),
+ )
+
+ def get_most_severe_transcript_consequence(
+ self: VariantIndex,
+ vep_consequences: DataFrame,
+ gene_index: GeneIndex,
+ ) -> V2G:
+ """Creates a dataset with variant to gene assignments based on VEP's predicted consequence of the transcript.
+
+ Optionally the transcript consequences can be reduced to the universe of a gene index.
+
+ Args:
+ vep_consequences (DataFrame): A dataframe of VEP consequences
+ gene_index (GeneIndex): A gene index to filter by.
+
+ Returns:
+ V2G: High and medium severity variant to gene assignments
+ """
+ return V2G(
+ _df=self.get_transcript_consequence_df(gene_index)
+ .select(
+ "variantId",
+ "chromosome",
+ f.col("transcriptConsequence.targetId").alias("geneId"),
+ f.explode(
+ "transcriptConsequence.variantFunctionalConsequenceIds"
+ ).alias("variantFunctionalConsequenceId"),
+ f.lit("vep").alias("datatypeId"),
+ f.lit("variantConsequence").alias("datasourceId"),
+ )
+ .join(
+ f.broadcast(vep_consequences),
+ on="variantFunctionalConsequenceId",
+ how="inner",
+ )
+ .drop("label")
+ .filter(f.col("score") != 0)
+ # A variant can have multiple predicted consequences on a transcript, the most severe one is selected
+ .transform(
+ lambda df: get_record_with_maximum_value(
+ df, ["variantId", "geneId"], "score"
)
- .repartition(400, "chromosome")
- .sortWithinPartitions("chromosome", "position")
),
- _schema=cls.get_schema(),
+ _schema=V2G.get_schema(),
)
diff --git a/src/gentropy/datasource/ensembl/__init__.py b/src/gentropy/datasource/ensembl/__init__.py
new file mode 100644
index 000000000..e20ce9a95
--- /dev/null
+++ b/src/gentropy/datasource/ensembl/__init__.py
@@ -0,0 +1,3 @@
+"""Ensembl's Variant Effect Predictor datasource."""
+
+from __future__ import annotations
diff --git a/src/gentropy/datasource/ensembl/vep_parser.py b/src/gentropy/datasource/ensembl/vep_parser.py
new file mode 100644
index 000000000..fcade35b1
--- /dev/null
+++ b/src/gentropy/datasource/ensembl/vep_parser.py
@@ -0,0 +1,784 @@
+"""Generating variant index based on Ensembl's Variant Effect Predictor output."""
+
+from __future__ import annotations
+
+import importlib.resources as pkg_resources
+import json
+from itertools import chain
+from typing import TYPE_CHECKING, Dict, List
+
+from pyspark.sql import SparkSession
+from pyspark.sql import functions as f
+from pyspark.sql import types as t
+
+from gentropy.assets import data
+from gentropy.common.schemas import parse_spark_schema
+from gentropy.common.spark_helpers import (
+ enforce_schema,
+ order_array_of_structs_by_field,
+)
+from gentropy.dataset.variant_index import VariantIndex
+
+if TYPE_CHECKING:
+ from pyspark.sql import Column, DataFrame
+
+
+class VariantEffectPredictorParser:
+ """Collection of methods to parse VEP output in json format."""
+
+ # Schema description of the dbXref object:
+ DBXREF_SCHEMA = t.ArrayType(
+ t.StructType(
+ [
+ t.StructField("id", t.StringType(), True),
+ t.StructField("source", t.StringType(), True),
+ ]
+ )
+ )
+
+ # Schema description of the in silico predictor object:
+ IN_SILICO_PREDICTOR_SCHEMA = t.StructType(
+ [
+ t.StructField("method", t.StringType(), True),
+ t.StructField("assessment", t.StringType(), True),
+ t.StructField("score", t.FloatType(), True),
+ t.StructField("assessmentFlag", t.StringType(), True),
+ t.StructField("targetId", t.StringType(), True),
+ ]
+ )
+
+ # Schema for the allele frequency column:
+ ALLELE_FREQUENCY_SCHEMA = t.ArrayType(
+ t.StructType(
+ [
+ t.StructField("populationName", t.StringType(), True),
+ t.StructField("alleleFrequency", t.DoubleType(), True),
+ ]
+ ),
+ False,
+ )
+
+ @staticmethod
+ def get_schema() -> t.StructType:
+ """Return the schema of the VEP output.
+
+ Returns:
+ t.StructType: VEP output schema.
+
+ Examples:
+ >>> type(VariantEffectPredictorParser.get_schema())
+
+ """
+ return parse_spark_schema("vep_json_output.json")
+
+ @classmethod
+ def extract_variant_index_from_vep(
+ cls: type[VariantEffectPredictorParser],
+ spark: SparkSession,
+ vep_output_path: str | list[str],
+ **kwargs: bool | float | int | str | None,
+ ) -> VariantIndex:
+ """Extract variant index from VEP output.
+
+ Args:
+ spark (SparkSession): Spark session.
+ vep_output_path (str | list[str]): Path to the VEP output.
+ **kwargs (bool | float | int | str | None): Additional arguments to pass to spark.read.json.
+
+ Returns:
+ VariantIndex: Variant index dataset.
+
+ Raises:
+ ValueError: Failed reading file.
+ ValueError: The dataset is empty.
+ """
+ # To speed things up and simplify the json structure, read data following an expected schema:
+ vep_schema = cls.get_schema()
+
+ try:
+ vep_data = spark.read.json(vep_output_path, schema=vep_schema, **kwargs)
+ except ValueError as e:
+ raise ValueError(f"Failed reading file: {vep_output_path}.") from e
+
+ if vep_data.isEmpty():
+ raise ValueError(f"The dataset is empty: {vep_output_path}")
+
+ # Convert to VariantAnnotation dataset:
+ return VariantIndex(
+ _df=VariantEffectPredictorParser.process_vep_output(vep_data),
+ _schema=VariantIndex.get_schema(),
+ )
+
+ @staticmethod
+ def _extract_ensembl_xrefs(colocated_variants: Column) -> Column:
+ """Extract rs identifiers and build cross reference to Ensembl's variant page.
+
+ Args:
+ colocated_variants (Column): Colocated variants field from VEP output.
+
+ Returns:
+ Column: List of dbXrefs for rs identifiers.
+ """
+ return VariantEffectPredictorParser._generate_dbxrefs(
+ VariantEffectPredictorParser._colocated_variants_to_rsids(
+ colocated_variants
+ ),
+ "ensembl_variation",
+ )
+
+ @enforce_schema(DBXREF_SCHEMA)
+ @staticmethod
+ def _generate_dbxrefs(ids: Column, source: str) -> Column:
+ """Convert a list of variant identifiers to dbXrefs given the source label.
+
+ Identifiers are cast to strings, then Null values are filtered out of the id list.
+
+ Args:
+ ids (Column): List of variant identifiers.
+ source (str): Source label for the dbXrefs.
+
+ Returns:
+ Column: List of dbXrefs.
+
+ Examples:
+ >>> (
+ ... spark.createDataFrame([('rs12',),(None,)], ['test_id'])
+ ... .select(VariantEffectPredictorParser._generate_dbxrefs(f.array(f.col('test_id')), "ensemblVariation").alias('col'))
+ ... .show(truncate=False)
+ ... )
+ +--------------------------+
+ |col |
+ +--------------------------+
+ |[{rs12, ensemblVariation}]|
+ |[] |
+ +--------------------------+
+
+ >>> (
+ ... spark.createDataFrame([('rs12',),(None,)], ['test_id'])
+ ... .select(VariantEffectPredictorParser._generate_dbxrefs(f.array(f.col('test_id')), "ensemblVariation").alias('col'))
+ ... .first().col[0].asDict()
+ ... )
+ {'id': 'rs12', 'source': 'ensemblVariation'}
+ """
+ ids = f.filter(ids, lambda id: id.isNotNull())
+ xref_column = f.transform(
+ ids,
+ lambda id: f.struct(
+ id.cast(t.StringType()).alias("id"), f.lit(source).alias("source")
+ ),
+ )
+
+ return f.when(xref_column.isNull(), f.array()).otherwise(xref_column)
+
+ @staticmethod
+ def _colocated_variants_to_rsids(colocated_variants: Column) -> Column:
+ """Extract rs identifiers from the colocated variants VEP field.
+
+ Args:
+ colocated_variants (Column): Colocated variants field from VEP output.
+
+ Returns:
+ Column: List of rs identifiers.
+
+ Examples:
+ >>> data = [('s1', 'rs1'),('s1', 'rs2'),('s1', 'rs3'),('s2', None),]
+ >>> (
+ ... spark.createDataFrame(data, ['s','v'])
+ ... .groupBy('s')
+ ... .agg(f.collect_list(f.struct(f.col('v').alias('id'))).alias('cv'))
+ ... .select(VariantEffectPredictorParser._colocated_variants_to_rsids(f.col('cv')).alias('rsIds'),)
+ ... .show(truncate=False)
+ ... )
+ +---------------+
+ |rsIds |
+ +---------------+
+ |[rs1, rs2, rs3]|
+ |[null] |
+ +---------------+
+
+ """
+ return f.when(
+ colocated_variants.isNotNull(),
+ f.transform(
+ colocated_variants, lambda variant: variant.getItem("id").alias("id")
+ ),
+ ).alias("rsIds")
+
+ @staticmethod
+ def _extract_omim_xrefs(colocated_variants: Column) -> Column:
+ """Build dbXrefs for OMIM identifiers.
+
+ OMIM identifiers are extracted from the colocated variants field, cast to strings and formatted as dbXrefs.
+
+ Args:
+ colocated_variants (Column): Colocated variants field from VEP output.
+
+ Returns:
+ Column: List of dbXrefs for OMIM identifiers.
+
+ Examples:
+ >>> data = [('234234.32', 'rs1', 's1',),(None, 'rs1', 's1',),]
+ >>> (
+ ... spark.createDataFrame(data, ['id', 'rsid', 's'])
+ ... .groupBy('s')
+ ... .agg(f.collect_list(f.struct(f.struct(f.array(f.col('id')).alias('OMIM')).alias('var_synonyms'),f.col('rsid').alias('id'),),).alias('cv'),).select(VariantEffectPredictorParser._extract_omim_xrefs(f.col('cv')).alias('dbXrefs'))
+ ... .show(truncate=False)
+ ... )
+ +-------------------+
+ |dbXrefs |
+ +-------------------+
+ |[{234234#32, omim}]|
+ +-------------------+
+
+ """
+ variants_w_omim_ref = f.filter(
+ colocated_variants,
+ lambda variant: variant.getItem("var_synonyms").getItem("OMIM").isNotNull(),
+ )
+
+ omim_var_ids = f.transform(
+ variants_w_omim_ref,
+ lambda var: f.transform(
+ var.getItem("var_synonyms").getItem("OMIM"),
+ lambda var: f.regexp_replace(var.cast(t.StringType()), r"\.", r"#"),
+ ),
+ )
+
+ return VariantEffectPredictorParser._generate_dbxrefs(
+ f.flatten(omim_var_ids), "omim"
+ )
+
+ @staticmethod
+ def _extract_clinvar_xrefs(colocated_variants: Column) -> Column:
+ """Build dbXrefs for VCV ClinVar identifiers.
+
+ ClinVar identifiers are extracted from the colocated variants field.
+ VCV-identifiers are filtered out to generate cross-references.
+
+ Args:
+ colocated_variants (Column): Colocated variants field from VEP output.
+
+ Returns:
+ Column: List of dbXrefs for VCV ClinVar identifiers.
+
+ Examples:
+ >>> data = [('VCV2323,RCV3423', 'rs1', 's1',),(None, 'rs1', 's1',),]
+ >>> (
+ ... spark.createDataFrame(data, ['id', 'rsid', 's'])
+ ... .groupBy('s')
+ ... .agg(f.collect_list(f.struct(f.struct(f.split(f.col('id'),',').alias('ClinVar')).alias('var_synonyms'),f.col('rsid').alias('id'),),).alias('cv'),).select(VariantEffectPredictorParser._extract_clinvar_xrefs(f.col('cv')).alias('dbXrefs'))
+ ... .show(truncate=False)
+ ... )
+ +--------------------+
+ |dbXrefs |
+ +--------------------+
+ |[{VCV2323, clinvar}]|
+ +--------------------+
+
+ """
+ variants_w_clinvar_ref = f.filter(
+ colocated_variants,
+ lambda variant: variant.getItem("var_synonyms")
+ .getItem("ClinVar")
+ .isNotNull(),
+ )
+
+ clin_var_ids = f.transform(
+ variants_w_clinvar_ref,
+ lambda var: f.filter(
+ var.getItem("var_synonyms").getItem("ClinVar"),
+ lambda x: x.startswith("VCV"),
+ ),
+ )
+
+ return VariantEffectPredictorParser._generate_dbxrefs(
+ f.flatten(clin_var_ids), "clinvar"
+ )
+
+ @staticmethod
+ def _get_most_severe_transcript(
+ transcript_column_name: str, score_field_name: str
+ ) -> Column:
+ """Return transcript with the highest in silico predicted score.
+
+ This method assumes the higher the score, the more severe the consequence of the variant is.
+ Hence, by selecting the transcript with the highest score, we are selecting the most severe consequence.
+
+ Args:
+ transcript_column_name (str): Name of the column containing the list of transcripts.
+ score_field_name (str): Name of the field containing the severity score.
+
+ Returns:
+ Column: Most severe transcript.
+
+ Examples:
+ >>> data = [("v1", 0.2, 'transcript1'),("v1", None, 'transcript2'),("v1", 0.6, 'transcript3'),("v1", 0.4, 'transcript4'),]
+ >>> (
+ ... spark.createDataFrame(data, ['v', 'score', 'transcriptId'])
+ ... .groupBy('v')
+ ... .agg(f.collect_list(f.struct(f.col('score'),f.col('transcriptId'))).alias('transcripts'))
+ ... .select(VariantEffectPredictorParser._get_most_severe_transcript('transcripts', 'score').alias('most_severe_transcript'))
+ ... .show(truncate=False)
+ ... )
+ +----------------------+
+ |most_severe_transcript|
+ +----------------------+
+ |{0.6, transcript3} |
+ +----------------------+
+
+ """
+ assert isinstance(
+ transcript_column_name, str
+ ), "transcript_column_name must be a string and not a column."
+ # Order transcripts by severity score:
+ ordered_transcripts = order_array_of_structs_by_field(
+ transcript_column_name, score_field_name
+ )
+
+ # Drop transcripts with no severity score and return the most severe one:
+ return f.filter(
+ ordered_transcripts,
+ lambda transcript: transcript.getItem(score_field_name).isNotNull(),
+ )[0]
+
+ @enforce_schema(IN_SILICO_PREDICTOR_SCHEMA)
+ @staticmethod
+ def _get_max_alpha_missense(transcripts: Column) -> Column:
+ """Return the most severe alpha missense prediction from all transcripts.
+
+ This function assumes one variant can fall onto only one gene with AlphaMissense prediction available on the canonical transcript.
+
+ Args:
+ transcripts (Column): List of transcripts.
+
+ Returns:
+ Column: Most severe alpha missense prediction.
+
+ Examples:
+ >>> data = [('v1', 0.4, 'assessment 1'), ('v1', None, None), ('v1', None, None),('v2', None, None),]
+ >>> (
+ ... spark.createDataFrame(data, ['v', 'a', 'b'])
+ ... .groupBy('v')
+ ... .agg(f.collect_list(f.struct(f.struct(
+ ... f.col('a').alias('am_pathogenicity'),
+ ... f.col('b').alias('am_class')).alias('alphamissense'),
+ ... f.lit('gene1').alias('gene_id'))).alias('transcripts')
+ ... )
+ ... .select(VariantEffectPredictorParser._get_max_alpha_missense(f.col('transcripts')).alias('am'))
+ ... .show(truncate=False)
+ ... )
+ +----------------------------------------------------+
+ |am |
+ +----------------------------------------------------+
+ |{max alpha missense, assessment 1, 0.4, null, gene1}|
+ |{max alpha missense, null, null, null, gene1} |
+ +----------------------------------------------------+
+
+ """
+ return f.transform(
+ # Extract transcripts with alpha missense values:
+ f.filter(
+ transcripts,
+ lambda transcript: transcript.getItem("alphamissense").isNotNull(),
+ ),
+ # Extract alpha missense values:
+ lambda transcript: f.struct(
+ transcript.getItem("alphamissense")
+ .getItem("am_pathogenicity")
+ .cast(t.FloatType())
+ .alias("score"),
+ transcript.getItem("alphamissense")
+ .getItem("am_class")
+ .alias("assessment"),
+ f.lit("max alpha missense").alias("method"),
+ transcript.getItem("gene_id").alias("targetId"),
+ ),
+ )[0]
+
+ @enforce_schema(IN_SILICO_PREDICTOR_SCHEMA)
+ @staticmethod
+ def _vep_in_silico_prediction_extractor(
+ transcript_column_name: str,
+ method_name: str,
+ score_column_name: str | None = None,
+ assessment_column_name: str | None = None,
+ assessment_flag_column_name: str | None = None,
+ ) -> Column:
+ """Extract in silico prediction from VEP output.
+
+ Args:
+ transcript_column_name (str): Name of the column containing the list of transcripts.
+ method_name (str): Name of the in silico predictor.
+ score_column_name (str | None): Name of the column containing the score.
+ assessment_column_name (str | None): Name of the column containing the assessment.
+ assessment_flag_column_name (str | None): Name of the column containing the assessment flag.
+
+ Returns:
+ Column: In silico predictor.
+ """
+ # Get highest score:
+ most_severe_transcript: Column = (
+ # Getting the most severe transcript:
+ VariantEffectPredictorParser._get_most_severe_transcript(
+ transcript_column_name, score_column_name
+ )
+ if score_column_name is not None
+ # If we don't have score, just pick one of the transcript where assessment is available:
+ else f.filter(
+ f.col(transcript_column_name),
+ lambda transcript: transcript.getItem(
+ assessment_column_name
+ ).isNotNull(),
+ )
+ )
+
+ return f.when(
+ most_severe_transcript.isNotNull(),
+ f.struct(
+ # Adding method name:
+ f.lit(method_name).cast(t.StringType()).alias("method"),
+ # Adding assessment:
+ f.lit(None).cast(t.StringType()).alias("assessment")
+ if assessment_column_name is None
+ else most_severe_transcript.getField(assessment_column_name).alias(
+ "assessment"
+ ),
+ # Adding score:
+ f.lit(None).cast(t.FloatType()).alias("score")
+ if score_column_name is None
+ else most_severe_transcript.getField(score_column_name)
+ .cast(t.FloatType())
+ .alias("score"),
+ # Adding assessment flag:
+ f.lit(None).cast(t.StringType()).alias("assessmentFlag")
+ if assessment_flag_column_name is None
+ else most_severe_transcript.getField(assessment_flag_column_name)
+ .cast(t.FloatType())
+ .alias("assessmentFlag"),
+ # Adding target id if present:
+ most_severe_transcript.getItem("gene_id").alias("targetId"),
+ ),
+ )
+
+ @staticmethod
+ def _parser_amino_acid_change(amino_acids: Column, protein_end: Column) -> Column:
+ """Convert VEP amino acid change information to one letter code aa substitution code.
+
+ The logic assumes that the amino acid change is given in the format "from/to" and the protein end is given also.
+ If any of the information is missing, the amino acid change will be set to None.
+
+ Args:
+ amino_acids (Column): Amino acid change information.
+ protein_end (Column): Protein end information.
+
+ Returns:
+ Column: Amino acid change in one letter code.
+
+ Examples:
+ >>> data = [('A/B', 1),('A/B', None),(None, 1),]
+ >>> (
+ ... spark.createDataFrame(data, ['amino_acids', 'protein_end'])
+ ... .select(VariantEffectPredictorParser._parser_amino_acid_change(f.col('amino_acids'), f.col('protein_end')).alias('amino_acid_change'))
+ ... .show()
+ ... )
+ +-----------------+
+ |amino_acid_change|
+ +-----------------+
+ | A1B|
+ | null|
+ | null|
+ +-----------------+
+
+ """
+ return f.when(
+ amino_acids.isNotNull() & protein_end.isNotNull(),
+ f.concat(
+ f.split(amino_acids, r"\/")[0],
+ protein_end,
+ f.split(amino_acids, r"\/")[1],
+ ),
+ ).otherwise(f.lit(None))
+
+ @staticmethod
+ def _collect_uniprot_accessions(trembl: Column, swissprot: Column) -> Column:
+ """Flatten arrays containing Uniprot accessions.
+
+ Args:
+ trembl (Column): List of TrEMBL protein accessions.
+ swissprot (Column): List of SwissProt protein accessions.
+
+ Returns:
+ Column: List of unique Uniprot accessions extracted from swissprot and trembl arrays, split from version numbers.
+
+ Examples:
+ >>> data = [('v1', ["sp_1"], ["tr_1"]), ('v1', None, None), ('v1', None, ["tr_2"]),]
+ >>> (
+ ... spark.createDataFrame(data, ['v', 'sp', 'tr'])
+ ... .select(VariantEffectPredictorParser._collect_uniprot_accessions(f.col('sp'), f.col('tr')).alias('proteinIds'))
+ ... .show()
+ ... )
+ +------------+
+ | proteinIds|
+ +------------+
+ |[sp_1, tr_1]|
+ | []|
+ | [tr_2]|
+ +------------+
+
+ """
+ # Dropping Null values and flattening the arrays:
+ return f.filter(
+ f.array_distinct(
+ f.transform(
+ f.flatten(
+ f.filter(
+ f.array(trembl, swissprot),
+ lambda x: x.isNotNull(),
+ )
+ ),
+ lambda x: f.split(x, r"\.")[0],
+ )
+ ),
+ lambda x: x.isNotNull(),
+ )
+
+ @staticmethod
+ def _consequence_to_sequence_ontology(
+ col: Column, so_dict: Dict[str, str]
+ ) -> Column:
+ """Convert VEP consequence terms to sequence ontology identifiers.
+
+ Missing consequence label will be converted to None, unmapped consequences will be mapped as None.
+
+ Args:
+ col (Column): Column containing VEP consequence terms.
+ so_dict (Dict[str, str]): Dictionary mapping VEP consequence terms to sequence ontology identifiers.
+
+ Returns:
+ Column: Column containing sequence ontology identifiers.
+
+ Examples:
+ >>> data = [('consequence_1',),('unmapped_consequence',),(None,)]
+ >>> m = {'consequence_1': 'SO:000000'}
+ >>> (
+ ... spark.createDataFrame(data, ['label'])
+ ... .select('label',VariantEffectPredictorParser._consequence_to_sequence_ontology(f.col('label'),m).alias('id'))
+ ... .show()
+ ... )
+ +--------------------+---------+
+ | label| id|
+ +--------------------+---------+
+ | consequence_1|SO:000000|
+ |unmapped_consequence| null|
+ | null| null|
+ +--------------------+---------+
+
+ """
+ map_expr = f.create_map(*[f.lit(x) for x in chain(*so_dict.items())])
+
+ return map_expr[col].alias("ancestry")
+
+ @staticmethod
+ def _parse_variant_location_id(vep_input_field: Column) -> List[Column]:
+ r"""Parse variant identifier, chromosome, position, reference allele and alternate allele from VEP input field.
+
+ Args:
+ vep_input_field (Column): Column containing variant vcf string used as VEP input.
+
+ Returns:
+ List[Column]: List of columns containing chromosome, position, reference allele and alternate allele.
+ """
+ variant_fields = f.split(vep_input_field, r"\t")
+ return [
+ f.concat_ws(
+ "_",
+ f.array(
+ variant_fields[0],
+ variant_fields[1],
+ variant_fields[3],
+ variant_fields[4],
+ ),
+ ).alias("variantId"),
+ variant_fields[0].cast(t.StringType()).alias("chromosome"),
+ variant_fields[1].cast(t.IntegerType()).alias("position"),
+ variant_fields[3].cast(t.StringType()).alias("referenceAllele"),
+ variant_fields[4].cast(t.StringType()).alias("alternateAllele"),
+ ]
+
+ @classmethod
+ def process_vep_output(cls, vep_output: DataFrame) -> DataFrame:
+ """Process and format a VEP output in JSON format.
+
+ Args:
+ vep_output (DataFrame): raw VEP output, read as spark DataFrame.
+
+ Returns:
+ DataFrame: processed data in the right shape.
+ """
+ # Reading consequence to sequence ontology map:
+ sequence_ontology_map = json.loads(
+ pkg_resources.read_text(data, "so_mappings.json", encoding="utf-8")
+ )
+ # Processing VEP output:
+ return (
+ vep_output
+ # Dropping non-canonical transcript consequences: # TODO: parametrize this.
+ .withColumn(
+ "transcript_consequences",
+ f.filter(
+ f.col("transcript_consequences"),
+ lambda consequence: consequence.getItem("canonical") == 1,
+ ),
+ )
+ .select(
+ # Parse id and chr:pos:alt:ref:
+ *cls._parse_variant_location_id(f.col("input")),
+ # Extracting cross-references from colocated variants:
+ cls._extract_ensembl_xrefs(f.col("colocated_variants")).alias(
+ "ensembl_xrefs"
+ ),
+ cls._extract_omim_xrefs(f.col("colocated_variants")).alias(
+ "omim_xrefs"
+ ),
+ cls._extract_clinvar_xrefs(f.col("colocated_variants")).alias(
+ "clinvar_xrefs"
+ ),
+ # Extracting in silico predictors
+ f.when(
+ # The following in-silico predictors are only available for variants with transcript consequences:
+ f.col("transcript_consequences").isNotNull(),
+ f.filter(
+ f.array(
+ # Extract CADD scores:
+ cls._vep_in_silico_prediction_extractor(
+ transcript_column_name="transcript_consequences",
+ method_name="phred scaled CADD",
+ score_column_name="cadd_phred",
+ ),
+ # Extract polyphen scores:
+ cls._vep_in_silico_prediction_extractor(
+ transcript_column_name="transcript_consequences",
+ method_name="polyphen",
+ score_column_name="polyphen_score",
+ assessment_column_name="polyphen_prediction",
+ ),
+ # Extract sift scores:
+ cls._vep_in_silico_prediction_extractor(
+ transcript_column_name="transcript_consequences",
+ method_name="sift",
+ score_column_name="sift_score",
+ assessment_column_name="sift_prediction",
+ ),
+ # Extract loftee scores:
+ cls._vep_in_silico_prediction_extractor(
+ method_name="loftee",
+ transcript_column_name="transcript_consequences",
+ score_column_name="lof",
+ assessment_column_name="lof",
+ assessment_flag_column_name="lof_filter",
+ ),
+ # Extract max alpha missense:
+ cls._get_max_alpha_missense(
+ f.col("transcript_consequences")
+ ),
+ ),
+ lambda predictor: predictor.isNotNull(),
+ ),
+ )
+ .otherwise(
+ # Extract CADD scores from intergenic object:
+ f.array(
+ cls._vep_in_silico_prediction_extractor(
+ transcript_column_name="intergenic_consequences",
+ method_name="phred scaled CADD",
+ score_column_name="cadd_phred",
+ ),
+ )
+ )
+ .alias("inSilicoPredictors"),
+ # Convert consequence to SO:
+ cls._consequence_to_sequence_ontology(
+ f.col("most_severe_consequence"), sequence_ontology_map
+ ).alias("mostSevereConsequenceId"),
+ # Collect transcript consequence:
+ f.when(
+ f.col("transcript_consequences").isNotNull(),
+ f.transform(
+ f.col("transcript_consequences"),
+ lambda transcript: f.struct(
+ # Convert consequence terms to SO identifier:
+ f.transform(
+ transcript.consequence_terms,
+ lambda y: cls._consequence_to_sequence_ontology(
+ y, sequence_ontology_map
+ ),
+ ).alias("variantFunctionalConsequenceIds"),
+ # Format amino acid change:
+ cls._parser_amino_acid_change(
+ transcript.amino_acids, transcript.protein_end
+ ).alias("aminoAcidChange"),
+ # Extract and clean uniprot identifiers:
+ cls._collect_uniprot_accessions(
+ transcript.swissprot,
+ transcript.trembl,
+ ).alias("uniprotAccessions"),
+ # Add canonical flag:
+ f.when(transcript.canonical == 1, f.lit(True))
+ .otherwise(f.lit(False))
+ .alias("isEnsemblCanonical"),
+ # Extract other fields as is:
+ transcript.codons.alias("codons"),
+ transcript.distance.alias("distance"),
+ transcript.gene_id.alias("targetId"),
+ transcript.impact.alias("impact"),
+ transcript.transcript_id.alias("transcriptId"),
+ transcript.lof.cast(t.StringType()).alias(
+ "lofteePrediction"
+ ),
+ transcript.lof.cast(t.FloatType()).alias("siftPrediction"),
+ transcript.lof.cast(t.FloatType()).alias(
+ "polyphenPrediction"
+ ),
+ ),
+ ),
+ ).alias("transcriptConsequences"),
+ # Extracting rsids:
+ cls._colocated_variants_to_rsids(f.col("colocated_variants")).alias(
+ "rsIds"
+ ),
+ # Adding empty array for allele frequencies - now this piece of data is not coming from the VEP data:
+ f.array().cast(cls.ALLELE_FREQUENCY_SCHEMA).alias("alleleFrequencies"),
+ )
+ # Adding protvar xref for missense variants: # TODO: making and extendable list of consequences
+ .withColumn(
+ "protvar_xrefs",
+ f.when(
+ f.size(
+ f.filter(
+ f.col("transcriptConsequences"),
+ lambda x: f.array_contains(
+ x.variantFunctionalConsequenceIds, "SO_0001583"
+ ),
+ )
+ )
+ > 0,
+ cls._generate_dbxrefs(f.array(f.col("variantId")), "protvar"),
+ ),
+ )
+ .withColumn(
+ "dbXrefs",
+ f.flatten(
+ f.filter(
+ f.array(
+ f.col("ensembl_xrefs"),
+ f.col("omim_xrefs"),
+ f.col("clinvar_xrefs"),
+ f.col("protvar_xrefs"),
+ ),
+ lambda x: x.isNotNull(),
+ )
+ ),
+ )
+ # Dropping intermediate xref columns:
+ .drop(*["ensembl_xrefs", "omim_xrefs", "clinvar_xrefs", "protvar_xrefs"])
+ )
diff --git a/src/gentropy/datasource/gnomad/variants.py b/src/gentropy/datasource/gnomad/variants.py
index fdc67a7cb..98e7013f5 100644
--- a/src/gentropy/datasource/gnomad/variants.py
+++ b/src/gentropy/datasource/gnomad/variants.py
@@ -7,8 +7,8 @@
import hail as hl
from gentropy.common.types import VariantPopulation
-from gentropy.config import VariantAnnotationConfig
-from gentropy.dataset.variant_annotation import VariantAnnotation
+from gentropy.config import GnomadVariantConfig
+from gentropy.dataset.variant_index import VariantIndex
if TYPE_CHECKING:
pass
@@ -19,26 +19,23 @@ class GnomADVariants:
def __init__(
self,
- gnomad_genomes_path: str = VariantAnnotationConfig().gnomad_genomes_path,
- chain_38_37: str = VariantAnnotationConfig().chain_38_37,
+ gnomad_genomes_path: str = GnomadVariantConfig().gnomad_genomes_path,
gnomad_variant_populations: list[
VariantPopulation | str
- ] = VariantAnnotationConfig().gnomad_variant_populations,
+ ] = GnomadVariantConfig().gnomad_variant_populations,
):
"""Initialize.
Args:
gnomad_genomes_path (str): Path to gnomAD genomes hail table.
- chain_38_37 (str): Path to GRCh38 to GRCh37 chain file.
gnomad_variant_populations (list[VariantPopulation | str]): List of populations to include.
- All defaults are stored in VariantAnnotationConfig.
+ All defaults are stored in GnomadVariantConfig.
"""
self.gnomad_genomes_path = gnomad_genomes_path
- self.chain_38_37 = chain_38_37
self.gnomad_variant_populations = gnomad_variant_populations
- def as_variant_annotation(self: GnomADVariants) -> VariantAnnotation:
+ def as_variant_index(self: GnomADVariants) -> VariantIndex:
"""Generate variant annotation dataset from gnomAD.
Some relevant modifications to the original dataset are:
@@ -48,7 +45,7 @@ def as_variant_annotation(self: GnomADVariants) -> VariantAnnotation:
3. Field names are converted to camel case to follow the convention.
Returns:
- VariantAnnotation: Variant annotation dataset
+            VariantIndex: GnomAD variants dataset.
"""
# Load variants dataset
ht = hl.read_table(
@@ -56,19 +53,14 @@ def as_variant_annotation(self: GnomADVariants) -> VariantAnnotation:
_load_refs=False,
)
- # Liftover
- grch37 = hl.get_reference("GRCh37")
- grch38 = hl.get_reference("GRCh38")
- grch38.add_liftover(self.chain_38_37, grch37)
-
# Drop non biallelic variants
ht = ht.filter(ht.alleles.length() == 2)
- # Liftover
- ht = ht.annotate(locus_GRCh37=hl.liftover(ht.locus, "GRCh37"))
+
# Select relevant fields and nested records to create class
- return VariantAnnotation(
+ return VariantIndex(
_df=(
ht.select(
+ # Extract mandatory fields:
variantId=hl.str("_").join(
[
ht.locus.contig.replace("chr", ""),
@@ -79,12 +71,9 @@ def as_variant_annotation(self: GnomADVariants) -> VariantAnnotation:
),
chromosome=ht.locus.contig.replace("chr", ""),
position=ht.locus.position,
- chromosomeB37=ht.locus_GRCh37.contig.replace("chr", ""),
- positionB37=ht.locus_GRCh37.position,
referenceAllele=ht.alleles[0],
alternateAllele=ht.alleles[1],
- rsIds=ht.rsid,
- alleleType=ht.allele_info.allele_type,
+ # Extract allele frequencies from populations of interest:
alleleFrequencies=hl.set(
[f"{pop}_adj" for pop in self.gnomad_variant_populations]
).map(
@@ -93,33 +82,46 @@ def as_variant_annotation(self: GnomADVariants) -> VariantAnnotation:
alleleFrequency=ht.freq[ht.globals.freq_index_dict[p]].AF,
)
),
- vep=hl.struct(
- mostSevereConsequence=ht.vep.most_severe_consequence,
- transcriptConsequences=hl.map(
- lambda x: hl.struct(
- aminoAcids=x.amino_acids,
- consequenceTerms=x.consequence_terms,
- geneId=x.gene_id,
- lof=x.lof,
+ # Extract most severe consequence:
+ mostSevereConsequence=ht.vep.most_severe_consequence,
+ # Extract in silico predictors:
+ inSilicoPredictors=hl.array(
+ [
+ hl.struct(
+ method=hl.str("spliceai"),
+ assessment=hl.missing(hl.tstr),
+ score=hl.expr.functions.float32(
+ ht.in_silico_predictors.spliceai_ds_max
+ ),
+ assessmentFlag=hl.missing(hl.tstr),
+ targetId=hl.missing(hl.tstr),
),
- # Only keeping canonical transcripts
- ht.vep.transcript_consequences.filter(
- lambda x: (x.canonical == 1)
- & (x.gene_symbol_source == "HGNC")
+ hl.struct(
+ method=hl.str("pangolin"),
+ assessment=hl.missing(hl.tstr),
+ score=hl.expr.functions.float32(
+ ht.in_silico_predictors.pangolin_largest_ds
+ ),
+ assessmentFlag=hl.missing(hl.tstr),
+ targetId=hl.missing(hl.tstr),
),
- ),
+ ]
),
- inSilicoPredictors=hl.struct(
- cadd=hl.struct(
- phred=ht.in_silico_predictors.cadd.phred,
- raw=ht.in_silico_predictors.cadd.raw_score,
- ),
- revelMax=ht.in_silico_predictors.revel_max,
- spliceaiDsMax=ht.in_silico_predictors.spliceai_ds_max,
- pangolinLargestDs=ht.in_silico_predictors.pangolin_largest_ds,
- phylop=ht.in_silico_predictors.phylop,
- siftMax=ht.in_silico_predictors.sift_max,
- polyphenMax=ht.in_silico_predictors.polyphen_max,
+ # Extract cross references to GnomAD:
+ dbXrefs=hl.array(
+ [
+ hl.struct(
+ id=hl.str("-").join(
+ [
+ ht.locus.contig.replace("chr", ""),
+ hl.str(ht.locus.position),
+ ht.alleles[0],
+ ht.alleles[1],
+ ]
+ ),
+ source=hl.str("gnomad"),
+ )
+ ]
),
)
.key_by("chromosome", "position")
@@ -127,5 +129,5 @@ def as_variant_annotation(self: GnomADVariants) -> VariantAnnotation:
.select_globals()
.to_spark(flatten=False)
),
- _schema=VariantAnnotation.get_schema(),
+ _schema=VariantIndex.get_schema(),
)
diff --git a/src/gentropy/datasource/gwas_catalog/associations.py b/src/gentropy/datasource/gwas_catalog/associations.py
index d6763cb35..b4e9a2d45 100644
--- a/src/gentropy/datasource/gwas_catalog/associations.py
+++ b/src/gentropy/datasource/gwas_catalog/associations.py
@@ -26,7 +26,7 @@
if TYPE_CHECKING:
from pyspark.sql import Column, DataFrame
- from gentropy.dataset.variant_annotation import VariantAnnotation
+ from gentropy.dataset.variant_index import VariantIndex
@dataclass
@@ -197,14 +197,14 @@ def _collect_rsids(
return f.array_distinct(f.array(snp_id, snp_id_current, risk_allele))
@staticmethod
- def _map_to_variant_annotation_variants(
- gwas_associations: DataFrame, variant_annotation: VariantAnnotation
+ def _map_variants_to_variant_index(
+ gwas_associations: DataFrame, variant_index: VariantIndex
) -> DataFrame:
"""Add variant metadata in associations.
Args:
- gwas_associations (DataFrame): raw GWAS Catalog associations
- variant_annotation (VariantAnnotation): variant annotation dataset
+ gwas_associations (DataFrame): raw GWAS Catalog associations.
+            variant_index (VariantIndex): GnomAD variants dataset with allele frequencies.
Returns:
DataFrame: GWAS Catalog associations data including `variantId`, `referenceAllele`,
@@ -228,7 +228,7 @@ def _map_to_variant_annotation_variants(
)
# Subset of variant annotation required for GWAS Catalog annotations:
- va_subset = variant_annotation.df.select(
+ va_subset = variant_index.df.select(
"variantId",
"chromosome",
# Calculate the position in Ensembl coordinates for indels:
@@ -241,7 +241,7 @@ def _map_to_variant_annotation_variants(
"referenceAllele",
"alternateAllele",
"alleleFrequencies",
- variant_annotation.max_maf().alias("maxMaf"),
+ variant_index.max_maf().alias("maxMaf"),
).join(
f.broadcast(
gwas_associations_subset.select(
@@ -1035,7 +1035,7 @@ def _qc_palindromic_alleles(
def from_source(
cls: type[GWASCatalogCuratedAssociationsParser],
gwas_associations: DataFrame,
- variant_annotation: VariantAnnotation,
+ variant_index: VariantIndex,
pvalue_threshold: float = WindowBasedClumpingStepConfig.gwas_significance,
) -> StudyLocusGWASCatalog:
"""Read GWASCatalog associations.
@@ -1044,9 +1044,9 @@ def from_source(
applies some pre-defined filters on the data:
Args:
- gwas_associations (DataFrame): GWAS Catalog raw associations dataset
- variant_annotation (VariantAnnotation): Variant annotation dataset
- pvalue_threshold (float): P-value threshold for flagging associations
+ gwas_associations (DataFrame): GWAS Catalog raw associations dataset.
+ variant_index (VariantIndex): Variant index dataset with available allele frequencies.
+ pvalue_threshold (float): P-value threshold for flagging associations.
Returns:
StudyLocusGWASCatalog: GWASCatalogAssociations dataset
@@ -1060,8 +1060,8 @@ def from_source(
.transform(
# Map/harmonise variants to variant annotation dataset:
# This function adds columns: variantId, referenceAllele, alternateAllele, chromosome, position
- lambda df: GWASCatalogCuratedAssociationsParser._map_to_variant_annotation_variants(
- df, variant_annotation
+ lambda df: GWASCatalogCuratedAssociationsParser._map_variants_to_variant_index(
+ df, variant_index
)
)
.withColumn(
diff --git a/src/gentropy/datasource/ukbiobank/__init__.py b/src/gentropy/datasource/ukbiobank/__init__.py
index 544779b18..910603703 100644
--- a/src/gentropy/datasource/ukbiobank/__init__.py
+++ b/src/gentropy/datasource/ukbiobank/__init__.py
@@ -1,3 +1,3 @@
-"""GWAS Catalog Data Source."""
+"""UK biobank."""
from __future__ import annotations
diff --git a/src/gentropy/ld_index.py b/src/gentropy/gnomad_ingestion.py
similarity index 54%
rename from src/gentropy/ld_index.py
rename to src/gentropy/gnomad_ingestion.py
index 0cc00cf14..07b7fd58b 100644
--- a/src/gentropy/ld_index.py
+++ b/src/gentropy/gnomad_ingestion.py
@@ -1,14 +1,15 @@
-"""Step to dump a filtered version of a LD matrix (block matrix) as Parquet files."""
+"""Step to dump a filtered version of a LD matrix (block matrix) and GnomAD variants."""
from __future__ import annotations
import hail as hl
from gentropy.common.session import Session
-from gentropy.common.types import LD_Population
+from gentropy.common.types import LD_Population, VariantPopulation
from gentropy.common.version_engine import VersionEngine
-from gentropy.config import LDIndexConfig
+from gentropy.config import GnomadVariantConfig, LDIndexConfig
from gentropy.datasource.gnomad.ld import GnomADLDMatrix
+from gentropy.datasource.gnomad.variants import GnomADVariants
class LDIndexStep:
@@ -69,3 +70,56 @@ def __init__(
.parquet(ld_index_out)
)
session.logger.info(ld_index_out)
+
+
+class GnomadVariantIndexStep:
+ """A step to generate variant index dataset from gnomad data.
+
+    Variant annotation step produces a dataset of the type `VariantIndex` derived from gnomAD's `gnomad.genomes.vX.X.X.sites.ht` Hail table.
+ This dataset is used to validate variants and as a source of annotation.
+ """
+
+ def __init__(
+ self,
+ session: Session,
+ gnomad_variant_output: str,
+ gnomad_genomes_path: str = GnomadVariantConfig().gnomad_genomes_path,
+ gnomad_variant_populations: list[
+ VariantPopulation | str
+ ] = GnomadVariantConfig().gnomad_variant_populations,
+ use_version_from_input: bool = GnomadVariantConfig().use_version_from_input,
+ ) -> None:
+ """Run Variant Annotation step.
+
+ Args:
+ session (Session): Session object.
+ gnomad_variant_output (str): Path to resulting dataset.
+ gnomad_genomes_path (str): Path to gnomAD genomes hail table, e.g. `gs://gcp-public-data--gnomad/release/4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/`.
+ gnomad_variant_populations (list[VariantPopulation | str]): List of populations to include.
+ use_version_from_input (bool): Append version derived from input gnomad_genomes_path to the output gnomad_variant_output. Defaults to False.
+
+ In case use_version_from_input is set to True,
+ data source version inferred from gnomad_genomes_path is appended as the last path segment to the output path.
+ All defaults are stored in the GnomadVariantConfig.
+ """
+ # amend data source version to output path
+ if use_version_from_input:
+ gnomad_variant_output = VersionEngine("gnomad").amend_version(
+ gnomad_genomes_path, gnomad_variant_output
+ )
+
+ # Initialise hail session.
+ hl.init(sc=session.spark.sparkContext, log="/dev/null")
+
+ # Parse variant info from source.
+ gnomad_variants = GnomADVariants(
+ gnomad_genomes_path=gnomad_genomes_path,
+ gnomad_variant_populations=gnomad_variant_populations,
+ ).as_variant_index()
+
+ # Write data partitioned by chromosome and position.
+ (
+ gnomad_variants.df.write.mode(session.write_mode).parquet(
+ gnomad_variant_output
+ )
+ )
diff --git a/src/gentropy/gwas_catalog_ingestion.py b/src/gentropy/gwas_catalog_ingestion.py
index 6930a2df9..725f1ca4d 100644
--- a/src/gentropy/gwas_catalog_ingestion.py
+++ b/src/gentropy/gwas_catalog_ingestion.py
@@ -1,8 +1,9 @@
"""Step to process GWAS Catalog associations and study table."""
+
from __future__ import annotations
from gentropy.common.session import Session
-from gentropy.dataset.variant_annotation import VariantAnnotation
+from gentropy.dataset.variant_index import VariantIndex
from gentropy.datasource.gwas_catalog.associations import (
GWASCatalogCuratedAssociationsParser,
)
@@ -26,7 +27,7 @@ def __init__(
catalog_ancestry_files: list[str],
catalog_sumstats_lut: str,
catalog_associations_file: str,
- variant_annotation_path: str,
+ gnomad_variant_path: str,
catalog_studies_out: str,
catalog_associations_out: str,
gwas_catalog_study_curation_file: str | None = None,
@@ -40,14 +41,14 @@ def __init__(
catalog_ancestry_files (list[str]): List of raw ancestry annotations files from GWAS Catalog.
catalog_sumstats_lut (str): GWAS Catalog summary statistics lookup table.
catalog_associations_file (str): Raw GWAS catalog associations file.
- variant_annotation_path (str): Input variant annotation path.
+ gnomad_variant_path (str): Path to GnomAD variants.
catalog_studies_out (str): Output GWAS catalog studies path.
catalog_associations_out (str): Output GWAS catalog associations path.
gwas_catalog_study_curation_file (str | None): file of the curation table. Optional.
inclusion_list_path (str | None): optional inclusion list (parquet)
"""
# Extract
- va = VariantAnnotation.from_parquet(session, variant_annotation_path)
+ gnomad_variants = VariantIndex.from_parquet(session, gnomad_variant_path)
catalog_studies = session.spark.read.csv(
list(catalog_study_files), sep="\t", header=True
)
@@ -69,7 +70,9 @@ def __init__(
StudyIndexGWASCatalogParser.from_source(
catalog_studies, ancestry_lut, sumstats_lut
).annotate_from_study_curation(gwas_catalog_study_curation),
- GWASCatalogCuratedAssociationsParser.from_source(catalog_associations, va),
+ GWASCatalogCuratedAssociationsParser.from_source(
+ catalog_associations, gnomad_variants
+ ),
)
# if inclusion list is provided apply filter:
diff --git a/src/gentropy/gwas_catalog_study_inclusion.py b/src/gentropy/gwas_catalog_study_inclusion.py
index 872177601..f07f851a7 100644
--- a/src/gentropy/gwas_catalog_study_inclusion.py
+++ b/src/gentropy/gwas_catalog_study_inclusion.py
@@ -1,4 +1,5 @@
"""Step to generate an GWAS Catalog study identifier inclusion and exclusion list."""
+
from __future__ import annotations
from typing import TYPE_CHECKING
@@ -6,7 +7,7 @@
from pyspark.sql import functions as f
from gentropy.common.session import Session
-from gentropy.dataset.variant_annotation import VariantAnnotation
+from gentropy.dataset.variant_index import VariantIndex
from gentropy.datasource.gwas_catalog.associations import (
GWASCatalogCuratedAssociationsParser,
)
@@ -84,7 +85,7 @@ def process_harmonised_list(studies: list[str], session: Session) -> DataFrame:
@staticmethod
def get_gwas_catalog_study_index(
session: Session,
- variant_annotation_path: str,
+ gnomad_variant_path: str,
catalog_study_files: list[str],
catalog_ancestry_files: list[str],
harmonised_study_file: str,
@@ -95,7 +96,7 @@ def get_gwas_catalog_study_index(
Args:
session (Session): Session object.
- variant_annotation_path (str): Input variant annotation path.
+ gnomad_variant_path (str): Path to GnomAD variant list.
catalog_study_files (list[str]): List of raw GWAS catalog studies file.
catalog_ancestry_files (list[str]): List of raw ancestry annotations files from GWAS Catalog.
harmonised_study_file (str): GWAS Catalog summary statistics lookup table.
@@ -106,7 +107,7 @@ def get_gwas_catalog_study_index(
StudyIndexGWASCatalog: Completely processed and fully annotated study index.
"""
# Extract
- va = VariantAnnotation.from_parquet(session, variant_annotation_path)
+ gnomad_variants = VariantIndex.from_parquet(session, gnomad_variant_path)
catalog_studies = session.spark.read.csv(
list(catalog_study_files), sep="\t", header=True
)
@@ -130,7 +131,9 @@ def get_gwas_catalog_study_index(
ancestry_lut,
sumstats_lut,
).annotate_from_study_curation(gwas_catalog_study_curation),
- GWASCatalogCuratedAssociationsParser.from_source(catalog_associations, va),
+ GWASCatalogCuratedAssociationsParser.from_source(
+ catalog_associations, gnomad_variants
+ ),
)
return study_index
@@ -142,7 +145,7 @@ def __init__(
catalog_ancestry_files: list[str],
catalog_associations_file: str,
gwas_catalog_study_curation_file: str,
- variant_annotation_path: str,
+ gnomad_variant_path: str,
harmonised_study_file: str,
criteria: str,
inclusion_list_path: str,
@@ -151,12 +154,12 @@ def __init__(
"""Run step.
Args:
- session (Session): Session object.
+            session (Session): Session object.
catalog_study_files (list[str]): List of raw GWAS catalog studies file.
catalog_ancestry_files (list[str]): List of raw ancestry annotations files from GWAS Catalog.
catalog_associations_file (str): Raw GWAS catalog associations file.
gwas_catalog_study_curation_file (str): file of the curation table. Optional.
- variant_annotation_path (str): Input variant annotation path.
+ gnomad_variant_path (str): Path to GnomAD variant list.
harmonised_study_file (str): GWAS Catalog summary statistics lookup table.
criteria (str): name of the filter set to be applied.
inclusion_list_path (str): Output path for the inclusion list.
@@ -165,7 +168,7 @@ def __init__(
# Create study index:
study_index = self.get_gwas_catalog_study_index(
session,
- variant_annotation_path,
+ gnomad_variant_path,
catalog_study_files,
catalog_ancestry_files,
harmonised_study_file,
diff --git a/src/gentropy/variant_annotation.py b/src/gentropy/variant_annotation.py
deleted file mode 100644
index 355a4dfea..000000000
--- a/src/gentropy/variant_annotation.py
+++ /dev/null
@@ -1,66 +0,0 @@
-"""Step to generate variant annotation dataset."""
-
-from __future__ import annotations
-
-import hail as hl
-
-from gentropy.common.session import Session
-from gentropy.common.types import VariantPopulation
-from gentropy.common.version_engine import VersionEngine
-from gentropy.config import VariantAnnotationConfig
-from gentropy.datasource.gnomad.variants import GnomADVariants
-
-
-class VariantAnnotationStep:
- """Variant annotation step.
-
- Variant annotation step produces a dataset of the type `VariantAnnotation` derived from gnomADs `gnomad.genomes.vX.X.X.sites.ht` Hail's table.
- This dataset is used to validate variants and as a source of annotation.
- """
-
- def __init__(
- self,
- session: Session,
- variant_annotation_path: str,
- gnomad_genomes_path: str = VariantAnnotationConfig().gnomad_genomes_path,
- gnomad_variant_populations: list[
- VariantPopulation | str
- ] = VariantAnnotationConfig().gnomad_variant_populations,
- chain_38_37: str = VariantAnnotationConfig().chain_38_37,
- use_version_from_input: bool = VariantAnnotationConfig().use_version_from_input,
- ) -> None:
- """Run Variant Annotation step.
-
- Args:
- session (Session): Session object.
- variant_annotation_path (str): Variant annotation dataset path.
- gnomad_genomes_path (str): Path to gnomAD genomes hail table, e.g. `gs://gcp-public-data--gnomad/release/4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/`.
- gnomad_variant_populations (list[VariantPopulation | str]): List of populations to include.
- chain_38_37 (str): Path to GRCh38 to GRCh37 chain file for lifover.
- use_version_from_input (bool): Append version derived from input gnomad_genomes_path to the output variant_annotation_path. Defaults to False.
-
- In case use_version_from_input is set to True,
- data source version inferred from gnomad_genomes_path is appended as the last path segment to the output path.
- All defaults are stored in the VariantAnnotationConfig.
- """
- # amend data source version to output path
- if use_version_from_input:
- variant_annotation_path = VersionEngine("gnomad").amend_version(
- gnomad_genomes_path, variant_annotation_path
- )
-
- # Initialise hail session.
- hl.init(sc=session.spark.sparkContext, log="/dev/null")
- # Run variant annotation.
- variant_annotation = GnomADVariants(
- gnomad_genomes_path=gnomad_genomes_path,
- gnomad_variant_populations=gnomad_variant_populations,
- chain_38_37=chain_38_37,
- ).as_variant_annotation()
-
- # Write data partitioned by chromosome and position.
- (
- variant_annotation.df.write.mode(session.write_mode).parquet(
- variant_annotation_path
- )
- )
diff --git a/src/gentropy/variant_index.py b/src/gentropy/variant_index.py
index bdf838ce2..ba86c4602 100644
--- a/src/gentropy/variant_index.py
+++ b/src/gentropy/variant_index.py
@@ -1,44 +1,53 @@
-"""Step to generate variant index dataset."""
+"""Step to generate variant index dataset based on VEP output."""
+
from __future__ import annotations
from gentropy.common.session import Session
-from gentropy.dataset.study_locus import StudyLocus
-from gentropy.dataset.variant_annotation import VariantAnnotation
from gentropy.dataset.variant_index import VariantIndex
+from gentropy.datasource.ensembl.vep_parser import VariantEffectPredictorParser
class VariantIndexStep:
- """Run variant index step to only variants in study-locus sets.
+ """Generate variant index based on a VEP output in json format.
- Using a `VariantAnnotation` dataset as a reference, this step creates and writes a dataset of the type `VariantIndex` that includes only variants that have disease-association data with a reduced set of annotations.
+ The variant index is a dataset that contains variant annotations extracted from VEP output. It is expected that all variants in the VEP output are present in the variant index.
+ There's an option to provide extra variant annotations to be added to the variant index eg. allele frequencies from GnomAD.
"""
def __init__(
self: VariantIndexStep,
session: Session,
- variant_annotation_path: str,
- credible_set_path: str,
+ vep_output_json_path: str,
variant_index_path: str,
+ gnomad_variant_annotations_path: str | None = None,
) -> None:
"""Run VariantIndex step.
Args:
session (Session): Session object.
- variant_annotation_path (str): Variant annotation dataset path.
- credible_set_path (str): Credible set dataset path.
- variant_index_path (str): Variant index dataset path.
+ vep_output_json_path (str): Variant effect predictor output path (in json format).
+ variant_index_path (str): Variant index dataset path to save resulting data.
+ gnomad_variant_annotations_path (str | None): Path to extra variant annotation dataset.
"""
- # Extract
- va = VariantAnnotation.from_parquet(session, variant_annotation_path)
- credible_set = StudyLocus.from_parquet(
- session, credible_set_path, recursiveFileLookup=True
+ # Extract variant annotations from VEP output:
+ variant_index = VariantEffectPredictorParser.extract_variant_index_from_vep(
+ session.spark, vep_output_json_path
)
- # Transform
- vi = VariantIndex.from_variant_annotation(va, credible_set)
+ # Process variant annotations if provided:
+ if gnomad_variant_annotations_path:
+ # Read variant annotations from parquet:
+ annotations = VariantIndex.from_parquet(
+ session=session,
+ path=gnomad_variant_annotations_path,
+ recursiveFileLookup=True,
+ )
+
+ # Update file with extra annotations:
+ variant_index = variant_index.add_annotation(annotations)
(
- vi.df.write.partitionBy("chromosome")
+ variant_index.df.write.partitionBy("chromosome")
.mode(session.write_mode)
.parquet(variant_index_path)
)
diff --git a/src/gentropy/v2g.py b/src/gentropy/variant_to_gene.py
similarity index 81%
rename from src/gentropy/v2g.py
rename to src/gentropy/variant_to_gene.py
index e98348a01..cf21053d7 100644
--- a/src/gentropy/v2g.py
+++ b/src/gentropy/variant_to_gene.py
@@ -1,16 +1,16 @@
"""Step to generate variant annotation dataset."""
+
from __future__ import annotations
from functools import reduce
-import pyspark.sql.functions as f
+from pyspark.sql import functions as f
from gentropy.common.Liftover import LiftOverSpark
from gentropy.common.session import Session
from gentropy.dataset.gene_index import GeneIndex
from gentropy.dataset.intervals import Intervals
from gentropy.dataset.v2g import V2G
-from gentropy.dataset.variant_annotation import VariantAnnotation
from gentropy.dataset.variant_index import VariantIndex
@@ -26,7 +26,6 @@ class V2GStep:
Attributes:
session (Session): Session object.
variant_index_path (str): Input variant index path.
- variant_annotation_path (str): Input variant annotation path.
gene_index_path (str): Input gene index path.
vep_consequences_path (str): Input VEP consequences path.
liftover_chain_file_path (str): Path to GRCh37 to GRCh38 chain file.
@@ -41,7 +40,6 @@ def __init__(
self,
session: Session,
variant_index_path: str,
- variant_annotation_path: str,
gene_index_path: str,
vep_consequences_path: str,
liftover_chain_file_path: str,
@@ -56,7 +54,6 @@ def __init__(
Args:
session (Session): Session object.
variant_index_path (str): Input variant index path.
- variant_annotation_path (str): Input variant annotation path.
gene_index_path (str): Input gene index path.
vep_consequences_path (str): Input VEP consequences path.
liftover_chain_file_path (str): Path to GRCh37 to GRCh38 chain file.
@@ -69,16 +66,10 @@ def __init__(
# Read
gene_index = GeneIndex.from_parquet(session, gene_index_path)
vi = VariantIndex.from_parquet(session, variant_index_path).persist()
- va = VariantAnnotation.from_parquet(session, variant_annotation_path)
+ # Reading VEP consequence to score table and cast the score to the right type:
vep_consequences = session.spark.read.csv(
vep_consequences_path, sep="\t", header=True
- ).select(
- f.element_at(f.split("Accession", r"/"), -1).alias(
- "variantFunctionalConsequenceId"
- ),
- f.col("Term").alias("label"),
- f.col("v2g_score").cast("double").alias("score"),
- )
+ ).withColumn("score", f.col("score").cast("double"))
# Transform
lift = LiftOverSpark(
@@ -90,10 +81,7 @@ def __init__(
# Filter gene index by approved biotypes to define V2G gene universe
list(approved_biotypes)
)
- va_slimmed = va.filter_by_variant_df(
- # Variant annotation reduced to the variant index to define V2G variant universe
- vi.df
- ).persist()
+
intervals = Intervals(
_df=reduce(
lambda x, y: x.unionByName(y, allowMissingColumns=True),
@@ -108,9 +96,11 @@ def __init__(
_schema=Intervals.get_schema(),
)
v2g_datasets = [
- va_slimmed.get_distance_to_tss(gene_index_filtered, max_distance),
- va_slimmed.get_most_severe_vep_v2g(vep_consequences, gene_index_filtered),
- va_slimmed.get_plof_v2g(gene_index_filtered),
+ vi.get_distance_to_tss(gene_index_filtered, max_distance),
+ vi.get_most_severe_transcript_consequence(
+ vep_consequences, gene_index_filtered
+ ),
+ vi.get_plof_v2g(gene_index_filtered),
intervals.v2g(vi),
]
v2g = V2G(
diff --git a/tests/gentropy/conftest.py b/tests/gentropy/conftest.py
index 62b873355..c35188466 100644
--- a/tests/gentropy/conftest.py
+++ b/tests/gentropy/conftest.py
@@ -23,7 +23,6 @@
from gentropy.dataset.study_locus_overlap import StudyLocusOverlap
from gentropy.dataset.summary_statistics import SummaryStatistics
from gentropy.dataset.v2g import V2G
-from gentropy.dataset.variant_annotation import VariantAnnotation
from gentropy.dataset.variant_index import VariantIndex
from gentropy.datasource.eqtl_catalogue.finemapping import EqtlCatalogueFinemapping
from gentropy.datasource.eqtl_catalogue.study_index import EqtlCatalogueStudyIndex
@@ -276,45 +275,6 @@ def mock_v2g(spark: SparkSession) -> V2G:
return V2G(_df=data_spec.build(), _schema=v2g_schema)
-@pytest.fixture()
-def mock_variant_annotation(spark: SparkSession) -> VariantAnnotation:
- """Mock variant annotation."""
- va_schema = VariantAnnotation.get_schema()
-
- data_spec = (
- dg.DataGenerator(
- spark,
- rows=400,
- partitions=4,
- randomSeedMethod="hash_fieldname",
- )
- .withSchema(va_schema)
- .withColumnSpec("alleleType", percentNulls=0.1)
- .withColumnSpec("chromosomeB37", percentNulls=0.1)
- .withColumnSpec("positionB37", percentNulls=0.1)
- # Nested column handling workaround
- # https://github.com/databrickslabs/dbldatagen/issues/135
- # It's a workaround for nested column handling in dbldatagen.
- .withColumnSpec(
- "alleleFrequencies",
- expr='array(named_struct("alleleFrequency", rand(), "populationName", cast(rand() as string)))',
- percentNulls=0.1,
- )
- .withColumnSpec("rsIds", expr="array(cast(rand() AS string))", percentNulls=0.1)
- .withColumnSpec(
- "vep",
- expr='named_struct("mostSevereConsequence", cast(rand() as string), "transcriptConsequences", array(named_struct("aminoAcids", cast(rand() as string), "consequenceTerms", array(cast(rand() as string)), "geneId", cast(rand() as string), "lof", cast(rand() as string))))',
- percentNulls=0.1,
- )
- .withColumnSpec(
- "inSilicoPredictors",
- expr='named_struct("cadd", named_struct("phred", cast(rand() as float), "raw_score", cast(rand() as float)), "revelMax", cast(rand() as double), "spliceaiDsMax", cast(rand() as float), "pangolinLargestDs", cast(rand() as double), "phylop", cast(rand() as double), "polyphenMax", cast(rand() as double), "siftMax", cast(rand() as double))',
- percentNulls=0.1,
- )
- )
- return VariantAnnotation(_df=data_spec.build(), _schema=va_schema)
-
-
@pytest.fixture()
def mock_variant_index(spark: SparkSession) -> VariantIndex:
"""Mock variant index."""
@@ -328,23 +288,65 @@ def mock_variant_index(spark: SparkSession) -> VariantIndex:
randomSeedMethod="hash_fieldname",
)
.withSchema(vi_schema)
- .withColumnSpec("chromosomeB37", percentNulls=0.1)
- .withColumnSpec("positionB37", percentNulls=0.1)
- .withColumnSpec("mostSevereConsequence", percentNulls=0.1)
+ .withColumnSpec("mostSevereConsequenceId", percentNulls=0.1)
# Nested column handling workaround
# https://github.com/databrickslabs/dbldatagen/issues/135
# It's a workaround for nested column handling in dbldatagen.
+ .withColumnSpec(
+ "inSilicoPredictors",
+ expr="""
+ array(
+ named_struct(
+ "method", cast(rand() as string),
+ "assessment", cast(rand() as string),
+ "score", rand(),
+ "assessmentFlag", cast(rand() as string),
+ "targetId", cast(rand() as string)
+ )
+ )
+ """,
+ percentNulls=0.1,
+ )
.withColumnSpec(
"alleleFrequencies",
expr='array(named_struct("alleleFrequency", rand(), "populationName", cast(rand() as string)))',
percentNulls=0.1,
)
+ .withColumnSpec("rsIds", expr="array(cast(rand() AS string))", percentNulls=0.1)
.withColumnSpec(
- "inSilicoPredictors",
- expr='named_struct("cadd", named_struct("phred", cast(rand() as float), "raw_score", cast(rand() as float)), "revelMax", cast(rand() as double), "spliceaiDsMax", cast(rand() as float), "pangolinLargestDs", cast(rand() as double), "phylop", cast(rand() as double), "polyphenMax", cast(rand() as double), "siftMax", cast(rand() as double))',
+ "transcriptConsequences",
+ expr="""
+ array(
+ named_struct(
+ "variantFunctionalConsequenceIds", array(cast(rand() as string)),
+ "aminoAcidChange", cast(rand() as string),
+ "uniprotAccessions", array(cast(rand() as string)),
+ "isEnsemblCanonical", cast(rand() as boolean),
+ "codons", cast(rand() as string),
+ "distance", cast(rand() as long),
+ "targetId", cast(rand() as string),
+ "impact", cast(rand() as string),
+ "lofteePrediction", cast(rand() as string),
+ "siftPrediction", rand(),
+ "polyphenPrediction", rand(),
+ "transcriptId", cast(rand() as string)
+ )
+ )
+ """,
+ percentNulls=0.1,
+ )
+ .withColumnSpec(
+ "dbXrefs",
+ expr="""
+ array(
+ named_struct(
+ "id", cast(rand() as string),
+ "source", cast(rand() as string)
+ )
+ )
+ """,
percentNulls=0.1,
)
- .withColumnSpec("rsIds", expr="array(cast(rand() AS string))", percentNulls=0.1)
)
return VariantIndex(_df=data_spec.build(), _schema=vi_schema)
diff --git a/tests/gentropy/data_samples/finucane_PIPs.npy b/tests/gentropy/data_samples/finucane_PIPs.npy
deleted file mode 100644
index 2a93a5c71..000000000
Binary files a/tests/gentropy/data_samples/finucane_PIPs.npy and /dev/null differ
diff --git a/tests/gentropy/data_samples/imported/GTEx_V8/ge/Adipose_Subcutaneous.tsv.gz b/tests/gentropy/data_samples/imported/GTEx_V8/ge/Adipose_Subcutaneous.tsv.gz
deleted file mode 100644
index 74c0844a4..000000000
Binary files a/tests/gentropy/data_samples/imported/GTEx_V8/ge/Adipose_Subcutaneous.tsv.gz and /dev/null differ
diff --git a/tests/gentropy/data_samples/vep_consequences_sample.tsv b/tests/gentropy/data_samples/vep_consequences_sample.tsv
deleted file mode 100644
index 7b5de68b2..000000000
--- a/tests/gentropy/data_samples/vep_consequences_sample.tsv
+++ /dev/null
@@ -1,45 +0,0 @@
-Accession Term Description Display term IMPACT v2g_score eco_score
-http://purl.obolibrary.org/obo/SO_0001893 transcript_ablation A feature ablation whereby the deleted region includes a transcript feature Transcript ablation HIGH 1 1
-http://identifiers.org/eco/cttv_mapping_pipeline cttv_mapping_pipeline 1
-http://purl.obolibrary.org/obo/ECO_0000205 curator_inference 1
-http://purl.obolibrary.org/obo/SO_0002165 trinucleotide_repeat_expansion 1
-http://purl.obolibrary.org/obo/SO_0001574 splice_acceptor_variant A splice variant that changes the 2 base region at the 3' end of an intron Splice acceptor variant HIGH 1 0.95
-http://purl.obolibrary.org/obo/SO_0001575 splice_donor_variant A splice variant that changes the 2 base region at the 5' end of an intron Splice donor variant HIGH 1 0.95
-http://purl.obolibrary.org/obo/SO_0001587 stop_gained A sequence variant whereby at least one base of a codon is changed, resulting in a premature stop codon, leading to a shortened transcript Stop gained HIGH 1 0.95
-http://purl.obolibrary.org/obo/SO_0001589 frameshift_variant A sequence variant which causes a disruption of the translational reading frame, because the number of nucleotides inserted or deleted is not a multiple of three Frameshift variant HIGH 1 0.95
-http://purl.obolibrary.org/obo/SO_0002012 start_lost A codon variant that changes at least one base of the canonical start codon Start lost HIGH 1 0.95
-http://purl.obolibrary.org/obo/SO_0001578 stop_lost A sequence variant where at least one base of the terminator codon (stop) is changed, resulting in an elongated transcript Stop lost HIGH 1 0.9
-http://purl.obolibrary.org/obo/SO_0001889 transcript_amplification A feature amplification of a region containing a transcript Transcript amplification HIGH 1 0.9
-http://purl.obolibrary.org/obo/SO_0001894 regulatory_region_ablation A feature ablation whereby the deleted region includes a regulatory region Regulatory region ablation MODERATE 0.66 0.9
-http://purl.obolibrary.org/obo/SO_0001583 missense_variant A sequence variant, that changes one or more bases, resulting in a different amino acid sequence but where the length is preserved Missense variant MODERATE 0.66 0.7
-http://purl.obolibrary.org/obo/SO_0001818 protein_altering_variant A sequence_variant which is predicted to change the protein encoded in the coding sequence Protein altering variant MODERATE 0.66 0.7
-http://purl.obolibrary.org/obo/SO_0001821 inframe_insertion An inframe non synonymous variant that inserts bases into in the coding sequence Inframe insertion MODERATE 0.66 0.7
-http://purl.obolibrary.org/obo/SO_0001822 inframe_deletion An inframe non synonymous variant that deletes bases from the coding sequence Inframe deletion MODERATE 0.66 0.7
-http://purl.obolibrary.org/obo/SO_0001582 initiator_codon_variant 0.7
-http://purl.obolibrary.org/obo/SO_0001630 splice_region_variant A sequence variant in which a change has occurred within the region of the splice site, either within 1-3 bases of the exon or 3-8 bases of the intron Splice region variant LOW 0.33 0.65
-http://purl.obolibrary.org/obo/SO_0001626 incomplete_terminal_codon_variant A sequence variant where at least one base of the final codon of an incompletely annotated transcript is changed Incomplete terminal codon variant LOW 0.33 0.65
-http://purl.obolibrary.org/obo/SO_0001567 stop_retained_variant A sequence variant where at least one base in the terminator codon is changed, but the terminator remains Stop retained variant LOW 0.33 0.65
-http://purl.obolibrary.org/obo/SO_0001819 synonymous_variant A sequence variant where there is no resulting change to the encoded amino acid Synonymous variant LOW 0.33 0.65
-http://purl.obolibrary.org/obo/SO_0002019 start_retained_variant A sequence variant where at least one base in the start codon is changed, but the start remains Start retained variant LOW 0.33 0.65
-http://purl.obolibrary.org/obo/SO_0001619 non_coding_transcript_variant A transcript variant of a non coding RNA gene Non coding transcript variant MODIFIER 0 0.65
-http://purl.obolibrary.org/obo/SO_0001620 mature_miRNA_variant A transcript variant located with the sequence of the mature miRNA Mature miRNA variant MODIFIER 0 0.65
-http://purl.obolibrary.org/obo/SO_0001621 NMD_transcript_variant A variant in a transcript that is the target of NMD NMD transcript variant MODIFIER 0.1 0.65
-http://purl.obolibrary.org/obo/SO_0001623 5_prime_UTR_variant A UTR variant of the 5' UTR 5 prime UTR variant MODIFIER 0.1 0.65
-http://purl.obolibrary.org/obo/SO_0001624 3_prime_UTR_variant A UTR variant of the 3' UTR 3 prime UTR variant MODIFIER 0.1 0.65
-http://purl.obolibrary.org/obo/SO_0001627 intron_variant A transcript variant occurring within an intron Intron variant MODIFIER 0.1 0.65
-http://purl.obolibrary.org/obo/SO_0001792 non_coding_transcript_exon_variant A sequence variant that changes non-coding exon sequence in a non-coding transcript Non coding transcript exon variant MODIFIER 0 0.65
-http://purl.obolibrary.org/obo/SO_0001580 coding_sequence_variant A sequence variant that changes the coding sequence Coding sequence variant MODIFIER 0 0.6
-http://purl.obolibrary.org/obo/SO_0001566 regulatory_region_variant A sequence variant located within a regulatory region Regulatory region variant MODIFIER 0 0.6
-http://purl.obolibrary.org/obo/SO_0001631 upstream_gene_variant A sequence variant located 5' of a gene Upstream gene variant MODIFIER 0 0.6
-http://purl.obolibrary.org/obo/SO_0001632 downstream_gene_variant A sequence variant located 3' of a gene Downstream gene variant MODIFIER 0 0.6
-http://purl.obolibrary.org/obo/SO_0001782 TF_binding_site_variant A sequence variant located within a transcription factor binding site TF binding site variant MODIFIER 0 0.6
-http://purl.obolibrary.org/obo/SO_0001891 regulatory_region_amplification A feature amplification of a region containing a regulatory region Regulatory region amplification MODIFIER 0 0.6
-http://purl.obolibrary.org/obo/SO_0001892 TFBS_amplification A feature amplification of a region containing a transcription factor binding site TFBS amplification MODIFIER 0 0.6
-http://purl.obolibrary.org/obo/SO_0001895 TFBS_ablation A feature ablation whereby the deleted region includes a transcription factor binding site TFBS ablation MODIFIER 0 0.6
-http://purl.obolibrary.org/obo/SO_0001906 feature_truncation A sequence variant that causes the reduction of a genomic feature, with regard to the reference sequence Feature truncation MODIFIER 0 0.6
-http://purl.obolibrary.org/obo/SO_0001907 feature_elongation A sequence variant that causes the extension of a genomic feature, with regard to the reference sequence Feature elongation MODIFIER 0 0.6
-http://targetvalidation.org/sequence/regulatory_nearest_gene_five_prime_end Regulatory nearest gene 5' end 0.6
-http://purl.obolibrary.org/obo/SO_0001628 intergenic_variant A sequence variant located in the intergenic region, between genes Intergenic variant MODIFIER 0 0.5
-http://purl.obolibrary.org/obo/SO_0001060 sequence_variant 0.5
-http://purl.obolibrary.org/obo/SO_0001825 conservative_inframe_deletion 0.5
-http://targetvalidation.org/sequence/nearest_gene_five_prime_end Nearest gene counting from 5' end 0.5
diff --git a/tests/gentropy/data_samples/vep_sample.jsonl b/tests/gentropy/data_samples/vep_sample.jsonl
new file mode 100644
index 000000000..78a76f150
--- /dev/null
+++ b/tests/gentropy/data_samples/vep_sample.jsonl
@@ -0,0 +1,2 @@
+{"transcript_consequences":[{"gene_id":"ENSG00000168702","swissprot":["Q9NZR2.181"],"uniparc":["UPI00001B045B"],"transcript_id":"ENST00000389484","impact":"MODIFIER","canonical":1,"consequence_terms":["intron_variant"],"strand":-1,"variant_allele":"T"}],"assembly_name":"GRCh38","most_severe_consequence":"intron_variant","strand":1,"seq_region_name":"2","start":140699626,"end":140699625,"input":"2\t140699625\t2_140699625_G_GT\tG\tGT","allele_string":"-/T","id":"2_140699625_G_GT"}
+{"transcript_consequences":[{"impact":"MODIFIER","consequence_terms":["upstream_gene_variant"],"distance":682,"trembl":["A0A2U3TZJ1.14"],"cadd_raw":4.846757,"cadd_phred":27.1,"transcript_id":"ENST00000336451","strand":-1,"variant_allele":"T","gene_id":"ENSG00000155906","uniparc":["UPI000D18E792"]},{"lof_info":"INTRON_SIZE:8753","consequence_terms":["splice_donor_variant"],"impact":"HIGH","transcript_id":"ENST00000444024","cadd_phred":27.1,"cadd_raw":4.846757,"strand":-1,"variant_allele":"T","lof":"HC","canonical":1,"uniprot_isoform":["Q9NWS8-1"],"uniparc":["UPI00001AEAE1"],"swissprot":["Q9NWS8.145"],"gene_id":"ENSG00000155906"},{"gene_id":"ENSG00000155906","uniparc":["UPI000006DC2F"],"swissprot":["Q9NWS8.145"],"uniprot_isoform":["Q9NWS8-3"],"lof":"HC","variant_allele":"T","strand":-1,"cadd_phred":27.1,"transcript_id":"ENST00000491268","cadd_raw":4.846757,"consequence_terms":["splice_donor_variant"],"impact":"HIGH","lof_info":"INTRON_SIZE:8753"},{"variant_allele":"T","strand":-1,"uniparc":["UPI0002A12044"],"gene_id":"ENSG00000155906","consequence_terms":["intron_variant"],"impact":"MODIFIER","transcript_id":"ENST00000622845","cadd_phred":27.1,"cadd_raw":4.846757,"trembl":["A0A087WXU0.51"]},{"transcript_id":"ENST00000643564","cadd_phred":27.1,"cadd_raw":4.846757,"consequence_terms":["non_coding_transcript_exon_variant"],"impact":"MODIFIER","gene_id":"ENSG00000155906","cdna_start":578,"cdna_end":578,"strand":-1,"variant_allele":"T"},{"transcript_id":"ENST00000644054","cadd_phred":27.1,"trembl":["A0A2R8YFC3.14"],"cadd_raw":4.846757,"consequence_terms":["splice_donor_variant","NMD_transcript_variant"],"impact":"HIGH","uniparc":["UPI001B8998C5"],"gene_id":"ENSG00000155906","strand":-1,"variant_allele":"T"},{"transcript_id":"ENST00000644711","cadd_phred":27.1,"cadd_raw":4.846757,"trembl":["A0A2R8Y4J4.16"],"consequence_terms":["splice_donor_variant","NMD_transcript_variant"],"impact":"HIGH","uniparc":["UPI000D1907AF"],"gene_id":"ENSG00000155906","variant_allele":"T","stran
d":-1},{"gene_id":"ENSG00000155906","transcript_id":"ENST00000645367","cadd_phred":27.1,"cadd_raw":4.846757,"consequence_terms":["splice_donor_variant","non_coding_transcript_variant"],"impact":"HIGH","strand":-1,"variant_allele":"T"},{"cadd_phred":27.1,"transcript_id":"ENST00000645895","cadd_raw":4.846757,"gene_id":"ENSG00000155906","variant_allele":"T","strand":-1,"consequence_terms":["splice_donor_variant","non_coding_transcript_variant"],"impact":"HIGH"},{"gene_id":"ENSG00000155906","cdna_end":724,"cdna_start":724,"variant_allele":"T","strand":-1,"cadd_phred":27.1,"transcript_id":"ENST00000645917","cadd_raw":4.846757,"consequence_terms":["non_coding_transcript_exon_variant"],"impact":"MODIFIER"},{"impact":"HIGH","consequence_terms":["splice_donor_variant","NMD_transcript_variant"],"cadd_raw":4.846757,"trembl":["A0A2R8Y4P5.16"],"cadd_phred":27.1,"transcript_id":"ENST00000646926","variant_allele":"T","strand":-1,"uniparc":["UPI001B89CA49"],"gene_id":"ENSG00000155906"},{"transcript_id":"ENST00000682004","cadd_phred":27.1,"cadd_raw":4.846757,"gene_id":"ENSG00000155906","variant_allele":"T","strand":-1,"consequence_terms":["splice_donor_variant","non_coding_transcript_variant"],"impact":"HIGH"},{"gene_id":"ENSG00000155906","uniparc":["UPI001B88E93C"],"lof":"HC","strand":-1,"variant_allele":"T","trembl":["A0A804HHY2.7"],"cadd_raw":4.846757,"transcript_id":"ENST00000682299","cadd_phred":27.1,"impact":"HIGH","consequence_terms":["splice_donor_variant"],"lof_info":"INTRON_SIZE:8753"},{"uniparc":["UPI001B8920B1"],"gene_id":"ENSG00000155906","strand":-1,"variant_allele":"T","trembl":["A0A804HLE1.7"],"cadd_raw":4.846757,"cadd_phred":27.1,"transcript_id":"ENST00000682392","impact":"HIGH","consequence_terms":["splice_donor_variant","NMD_transcript_variant"]},{"cadd_raw":4.846757,"trembl":["A0A804HHW6.7"],"cadd_phred":27.1,"transcript_id":"ENST00000682641","impact":"HIGH","consequence_terms":["splice_donor_variant"],"lof_info":"INTRON_SIZE:8753","gene_id":"ENSG00000155906","un
iparc":["UPI001B893F2B"],"lof":"HC","variant_allele":"T","strand":-1},{"gene_id":"ENSG00000155906","transcript_id":"ENST00000682760","cadd_phred":27.1,"cadd_raw":4.846757,"consequence_terms":["splice_donor_variant","non_coding_transcript_variant"],"impact":"HIGH","strand":-1,"variant_allele":"T"},{"variant_allele":"T","strand":-1,"impact":"HIGH","consequence_terms":["splice_donor_variant","non_coding_transcript_variant"],"cadd_raw":4.846757,"cadd_phred":27.1,"transcript_id":"ENST00000683439","gene_id":"ENSG00000155906"},{"gene_id":"ENSG00000155906","swissprot":["Q9NWS8.145"],"uniparc":["UPI00001AEAE1"],"lof":"HC","uniprot_isoform":["Q9NWS8-1"],"strand":-1,"variant_allele":"T","cadd_raw":4.846757,"transcript_id":"ENST00000683724","cadd_phred":27.1,"impact":"HIGH","consequence_terms":["splice_donor_variant"],"lof_info":"INTRON_SIZE:8753"},{"gene_id":"ENSG00000155906","transcript_id":"ENST00000683740","cadd_phred":27.1,"cadd_raw":4.846757,"consequence_terms":["splice_donor_variant","non_coding_transcript_variant"],"impact":"HIGH","variant_allele":"T","strand":-1},{"cadd_phred":27.1,"transcript_id":"ENST00000684301","cadd_raw":4.846757,"consequence_terms":["splice_donor_variant","NMD_transcript_variant"],"impact":"HIGH","gene_id":"ENSG00000155906","uniparc":["UPI000006DC2F"],"swissprot":["Q9NWS8.145"],"uniprot_isoform":["Q9NWS8-3"],"variant_allele":"T","strand":-1},{"gene_id":"ENSG00000155906","cadd_raw":4.846757,"cadd_phred":27.1,"transcript_id":"ENST00000684658","impact":"HIGH","consequence_terms":["splice_donor_variant","non_coding_transcript_variant"],"strand":-1,"variant_allele":"T"},{"consequence_terms":["splice_donor_variant","non_coding_transcript_variant"],"impact":"HIGH","variant_allele":"T","strand":-1,"gene_id":"ENSG00000155906","cadd_phred":27.1,"transcript_id":"ENST00000684715","cadd_raw":4.846757},{"impact":"HIGH","consequence_terms":["splice_donor_variant","NMD_transcript_variant"],"trembl":["A0A804HKF8.7"],"cadd_raw":4.846757,"cadd_phred":27.1,"transcri
pt_id":"ENST00000684765","strand":-1,"variant_allele":"T","uniparc":["UPI001B89CAE3"],"gene_id":"ENSG00000155906"}],"seq_region_name":"6","assembly_name":"GRCh38","end":151445307,"strand":1,"most_severe_consequence":"splice_donor_variant","start":151445307,"id":"6_151445307_C_T","colocated_variants":[{"clin_sig_allele":"T:pathogenic;T:likely_pathogenic","end":151445307,"var_synonyms":{"ClinVar":["RCV000032983","VCV000039764","RCV001814019"],"OMIM":[614917.0001]},"strand":1,"start":151445307,"allele_string":"C/T","clin_sig":["likely_pathogenic","pathogenic"],"seq_region_name":"6","pubmed":[23022099,18835491],"phenotype_or_disease":1,"id":"rs1562800908"}],"input":"6\t151445307\t6_151445307_C_T\tC\tT","allele_string":"C/T"}
diff --git a/tests/gentropy/dataset/test_variant_annotation.py b/tests/gentropy/dataset/test_variant_annotation.py
deleted file mode 100644
index 05fa0cd60..000000000
--- a/tests/gentropy/dataset/test_variant_annotation.py
+++ /dev/null
@@ -1,30 +0,0 @@
-"""Tests variant annotation dataset."""
-
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
- from gentropy.dataset.gene_index import GeneIndex
-
-from gentropy.dataset.v2g import V2G
-from gentropy.dataset.variant_annotation import VariantAnnotation
-
-
-def test_variant_index_creation(mock_variant_annotation: VariantAnnotation) -> None:
- """Test gene index creation with mock gene index."""
- assert isinstance(mock_variant_annotation, VariantAnnotation)
-
-
-def test_get_plof_v2g(
- mock_variant_annotation: VariantAnnotation, mock_gene_index: GeneIndex
-) -> None:
- """Test get_plof_v2g with mock variant annotation."""
- assert isinstance(mock_variant_annotation.get_plof_v2g(mock_gene_index), V2G)
-
-
-def test_get_distance_to_tss(
- mock_variant_annotation: VariantAnnotation, mock_gene_index: GeneIndex
-) -> None:
- """Test get_distance_to_tss with mock variant annotation."""
- assert isinstance(mock_variant_annotation.get_distance_to_tss(mock_gene_index), V2G)
diff --git a/tests/gentropy/dataset/test_variant_index.py b/tests/gentropy/dataset/test_variant_index.py
index a41db3031..4d97ef536 100644
--- a/tests/gentropy/dataset/test_variant_index.py
+++ b/tests/gentropy/dataset/test_variant_index.py
@@ -4,11 +4,12 @@
from typing import TYPE_CHECKING
+from gentropy.dataset.gene_index import GeneIndex
+from gentropy.dataset.v2g import V2G
from gentropy.dataset.variant_index import VariantIndex
if TYPE_CHECKING:
- from gentropy.dataset.study_locus import StudyLocus
- from gentropy.dataset.variant_annotation import VariantAnnotation
+ pass
def test_variant_index_creation(mock_variant_index: VariantIndex) -> None:
@@ -16,11 +17,15 @@ def test_variant_index_creation(mock_variant_index: VariantIndex) -> None:
assert isinstance(mock_variant_index, VariantIndex)
-def test_from_variant_annotation(
- mock_variant_annotation: VariantAnnotation, mock_study_locus: StudyLocus
+def test_get_plof_v2g(
+ mock_variant_index: VariantIndex, mock_gene_index: GeneIndex
) -> None:
- """Test variant index creation from variant annotation."""
- variant_index = VariantIndex.from_variant_annotation(
- mock_variant_annotation, mock_study_locus
- )
- assert isinstance(variant_index, VariantIndex)
+ """Test get_plof_v2g with mock variant annotation."""
+ assert isinstance(mock_variant_index.get_plof_v2g(mock_gene_index), V2G)
+
+
+def test_get_distance_to_tss(
+ mock_variant_index: VariantIndex, mock_gene_index: GeneIndex
+) -> None:
+ """Test get_distance_to_tss with mock variant annotation."""
+ assert isinstance(mock_variant_index.get_distance_to_tss(mock_gene_index), V2G)
diff --git a/tests/gentropy/datasource/ensembl/test_vep_variants.py b/tests/gentropy/datasource/ensembl/test_vep_variants.py
new file mode 100644
index 000000000..1401f1b6c
--- /dev/null
+++ b/tests/gentropy/datasource/ensembl/test_vep_variants.py
@@ -0,0 +1,145 @@
+"""Testing VEP parsing and variant index extraction."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+from gentropy.dataset.variant_index import VariantIndex
+from gentropy.datasource.ensembl.vep_parser import VariantEffectPredictorParser
+from pyspark.sql import DataFrame
+from pyspark.sql import functions as f
+
+if TYPE_CHECKING:
+ from pyspark.sql import SparkSession
+
+
+class TestVEPParserInSilicoExtractor:
+ """Testing the _vep_in_silico_prediction_extractor method of the VEP parser class.
+
+ These tests assume that the _get_most_severe_transcript() method works correctly, as it's not tested.
+
+ The test cases try to cover the following scenarios:
+ - Transcripts with no assessments.
+ - Transcripts without assessments flag.
+ - Transcripts with no score.
+ - Testing cases, where no score column is provided.
+ """
+
+ # Data prototype:
+ SAMPLE_DATA = [
+ # Complete dataset:
+ ("v1", "deleterious", 0.1, "gene1", "flag"),
+ # No assessment:
+ ("v2", None, 0.1, "gene1", "flag"),
+ # No flag:
+ ("v3", "deleterious", 0.1, "gene1", None),
+ # No score:
+ ("v4", "deleterious", None, "gene1", "flag"),
+ ]
+
+ SAMPLE_COLUMNS = ["variantId", "assessment", "score", "gene_id", "flag"]
+
+ @pytest.fixture(autouse=True)
+ def _setup(self: TestVEPParserInSilicoExtractor, spark: SparkSession) -> None:
+ """Setup fixture."""
+ parsed_df = (
+ spark.createDataFrame(self.SAMPLE_DATA, self.SAMPLE_COLUMNS)
+ .groupBy("variantId")
+ .agg(
+ f.collect_list(
+ f.struct(
+ f.col("assessment").alias("assessment"),
+ f.col("score").alias("score"),
+ f.col("flag").alias("flag"),
+ f.col("gene_id").alias("gene_id"),
+ )
+ ).alias("transcripts")
+ )
+ .select(
+ "variantId",
+ VariantEffectPredictorParser._vep_in_silico_prediction_extractor(
+ "transcripts", "method_name", "score", "assessment", "flag"
+ ).alias("in_silico_predictions"),
+ )
+ ).persist()
+
+ self.df = parsed_df
+
+ def test_in_silico_output_missing_value(
+ self: TestVEPParserInSilicoExtractor,
+ ) -> None:
+ """Test if the in silico output count is correct."""
+ variant_with_missing_score = [
+ x[0] for x in filter(lambda x: x[2] is None, self.SAMPLE_DATA)
+ ]
+ # Assert that the correct variants return null:
+ assert (
+ [
+ x["variantId"]
+ for x in self.df.filter(
+ f.col("in_silico_predictions").isNull()
+ ).collect()
+ ]
+ == variant_with_missing_score
+ ), "Not the right variants got nullified in-silico predictor object."
+
+
+class TestVEPParser:
+ """Testing VEP parser class.
+
+ Some of the input data:
+ - 6_151445307_C_T - complicated variant with numerous annotations.
+ - 2_140699625_G_GT - simple variant with no annotations whatsoever.
+ """
+
+ SAMPLE_VEP_DATA_PATH = "tests/gentropy/data_samples/vep_sample.jsonl"
+
+ @pytest.fixture(autouse=True)
+ def _setup(self: TestVEPParser, spark: SparkSession) -> None:
+ """Setup fixture."""
+ self.raw_vep_output = spark.read.json(
+ self.SAMPLE_VEP_DATA_PATH,
+ schema=VariantEffectPredictorParser.get_schema(),
+ )
+ self.processed_vep_output = VariantEffectPredictorParser.process_vep_output(
+ self.raw_vep_output
+ )
+
+ def test_extract_variant_index_from_vep(
+ self: TestVEPParser, spark: SparkSession
+ ) -> None:
+ """Test if the variant index can be extracted from the VEP output."""
+ variant_index = VariantEffectPredictorParser.extract_variant_index_from_vep(
+ spark, self.SAMPLE_VEP_DATA_PATH
+ )
+
+ assert isinstance(
+ variant_index, VariantIndex
+ ), "VariantIndex object not created."
+
+ def test_process(self: TestVEPParser) -> None:
+ """Test process method."""
+ df = VariantEffectPredictorParser.process_vep_output(self.raw_vep_output)
+ assert isinstance(df, DataFrame), "Processed VEP output is not a DataFrame."
+ assert df.count() > 0, "No variant data in processed VEP dataframe."
+
+ def test_conversion(self: TestVEPParser) -> None:
+ """Test if processed data can be converted into a VariantIndex object."""
+ variant_index = VariantIndex(
+ _df=self.processed_vep_output,
+ _schema=VariantIndex.get_schema(),
+ )
+
+ assert isinstance(
+ variant_index, VariantIndex
+ ), "VariantIndex object not created."
+
+ def test_variant_count(self: TestVEPParser) -> None:
+ """Test if the number of variants is correct.
+
+ It is expected that all rows from the parsed VEP output are present in the processed VEP output.
+ """
+ assert (
+ self.raw_vep_output.count() == self.processed_vep_output.count()
+ ), f"Incorrect number of variants in processed VEP output: expected {self.raw_vep_output.count()}, got {self.processed_vep_output.count()}."
diff --git a/tests/gentropy/datasource/gwas_catalog/test_gwas_catalog_associations.py b/tests/gentropy/datasource/gwas_catalog/test_gwas_catalog_associations.py
index de70e8b63..e7067e3d9 100644
--- a/tests/gentropy/datasource/gwas_catalog/test_gwas_catalog_associations.py
+++ b/tests/gentropy/datasource/gwas_catalog/test_gwas_catalog_associations.py
@@ -2,7 +2,7 @@
from __future__ import annotations
-from gentropy.dataset.variant_annotation import VariantAnnotation
+from gentropy.dataset.variant_index import VariantIndex
from gentropy.datasource.gwas_catalog.associations import (
GWASCatalogCuratedAssociationsParser,
StudyLocusGWASCatalog,
@@ -50,29 +50,29 @@ def test_qc_ambiguous_study(
def test_study_locus_gwas_catalog_from_source(
- mock_variant_annotation: VariantAnnotation,
+ mock_variant_index: VariantIndex,
sample_gwas_catalog_associations: DataFrame,
) -> None:
"""Test study locus from gwas catalog mock data."""
assert isinstance(
GWASCatalogCuratedAssociationsParser.from_source(
- sample_gwas_catalog_associations, mock_variant_annotation
+ sample_gwas_catalog_associations, mock_variant_index
),
StudyLocusGWASCatalog,
)
-def test__map_to_variant_annotation_variants(
+def test_map_variants_to_variant_index(
sample_gwas_catalog_associations: DataFrame,
- mock_variant_annotation: VariantAnnotation,
+ mock_variant_index: VariantIndex,
) -> None:
"""Test mapping to variant annotation variants."""
assert isinstance(
- GWASCatalogCuratedAssociationsParser._map_to_variant_annotation_variants(
+ GWASCatalogCuratedAssociationsParser._map_variants_to_variant_index(
sample_gwas_catalog_associations.withColumn(
"studyLocusId", f.monotonically_increasing_id().cast(LongType())
),
- mock_variant_annotation,
+ mock_variant_index,
),
DataFrame,
)
diff --git a/tests/gentropy/test_schemas.py b/tests/gentropy/test_schemas.py
index 417eb4657..630abd0ab 100644
--- a/tests/gentropy/test_schemas.py
+++ b/tests/gentropy/test_schemas.py
@@ -57,6 +57,9 @@ def test_schema_columns_camelcase(schema_json: str) -> None:
Args:
schema_json (str): schema filename
"""
+ if schema_json == "vep_json_output.json":
+ pytest.skip("VEP schema is exempt from camelCase check.")
+
core_schema = json.loads(Path(SCHEMA_DIR, schema_json).read_text(encoding="utf-8"))
schema = StructType.fromJson(core_schema)
# Use a regular expression to check if the identifier is in camelCase