diff --git a/candle-examples/examples/quantized-phi/main.rs b/candle-examples/examples/quantized-phi/main.rs
index f567ce2d36..a776e989e5 100644
--- a/candle-examples/examples/quantized-phi/main.rs
+++ b/candle-examples/examples/quantized-phi/main.rs
@@ -28,6 +28,8 @@ enum Which {
     /// Alternative implementation of phi-3, based on llama.
     #[value(name = "phi-3b")]
     Phi3b,
+    #[value(name = "phi-4")]
+    Phi4,
 }
 
 #[derive(Parser, Debug)]
@@ -104,6 +106,7 @@ impl Args {
                 let repo = match self.which {
                     Which::Phi2 => "microsoft/phi-2",
                     Which::Phi3 | Which::Phi3b => "microsoft/Phi-3-mini-4k-instruct",
+                    Which::Phi4 => "microsoft/phi-4",
                 };
                 let api = api.model(repo.to_string());
                 api.get("tokenizer.json")?
@@ -128,6 +131,7 @@ impl Args {
                         "Phi-3-mini-4k-instruct-q4.gguf",
                         "5eef2ce24766d31909c0b269fe90c817a8f263fb",
                     ),
+                    Which::Phi4 => ("microsoft/phi-4-gguf", "phi-4-q4.gguf", "main"),
                 };
                 let api = hf_hub::api::sync::Api::new()?;
                 api.repo(hf_hub::Repo::with_revision(
@@ -216,7 +220,7 @@ fn main() -> anyhow::Result<()> {
         );
         match args.which {
             Which::Phi2 => Model::Phi2(Phi2::from_gguf(model, &mut file, &device)?),
-            Which::Phi3 => Model::Phi3(Phi3::from_gguf(
+            Which::Phi3 | Which::Phi4 => Model::Phi3(Phi3::from_gguf(
                 args.use_flash_attn,
                 model,
                 &mut file,
diff --git a/candle-transformers/src/models/quantized_phi3.rs b/candle-transformers/src/models/quantized_phi3.rs
index 51a75f3895..1ceb48d13a 100644
--- a/candle-transformers/src/models/quantized_phi3.rs
+++ b/candle-transformers/src/models/quantized_phi3.rs
@@ -127,7 +127,7 @@ impl LayerWeights {
             .reshape((b_sz, seq_len, self.n_head, self.head_dim))?
             .transpose(1, 2)?;
         let k = k
-            .reshape((b_sz, seq_len, self.n_head, self.head_dim))?
+            .reshape((b_sz, seq_len, self.n_kv_head, self.head_dim))?
             .transpose(1, 2)?;
         let v = v
             .reshape((b_sz, seq_len, self.n_kv_head, self.head_dim))?