From 6fae532cb566900e2150c0873308ac1f6bbc5455 Mon Sep 17 00:00:00 2001
From: inkcherry
Date: Thu, 30 May 2024 10:09:51 +0000
Subject: [PATCH 1/3] ds-sp(ulysses) for rope

---
 megatron/model/language_model.py | 4 ++++
 megatron/model/transformer.py    | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/megatron/model/language_model.py b/megatron/model/language_model.py
index ec2ae1877a..38e33b4abe 100644
--- a/megatron/model/language_model.py
+++ b/megatron/model/language_model.py
@@ -543,6 +543,10 @@ def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask,
             else:
                 if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning:
                     rotary_pos_emb = self.rotary_pos_emb(args.curriculum_seqlen)
+                elif args.ds_sequence_parallel_size > 1:
+                  parallel_seq_len = self.seq_length / args.ds_sequence_parallel_size
+                  ds_sp_offset = mpu.get_sequence_parallel_rank() * parallel_seq_len
+                  rotary_pos_emb = self.rotary_pos_emb(parallel_seq_len, ds_sp_offset)
                 else:
                     rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
 
diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py
index e75f13a24f..53e2093683 100644
--- a/megatron/model/transformer.py
+++ b/megatron/model/transformer.py
@@ -401,6 +401,10 @@ def forward(self, q, k, v):
         ---------
             q, k, v: The tensor containing the query, key, and value. (B, S, H, D)
         """
+        # dump q, k, v tensor after by heads
+        heads_per_gpus = q.shape[2]
+        for hid in range(heads_per_gpus):
+            print(f"heads_{hid+mpu.get_sequence_parallel_rank()*heads_per_gpus} : {q[:,:,hid,:],k[:,:,hid,:],v[:,:,hid,:]}")
 
         assert all((i.dtype in [torch.float16, torch.bfloat16] for i in (q,k,v)))
         assert all((get_accelerator().on_accelerator(i) for i in (q, k, v)))

From eedfe9c7d374327bf21f29c1c12b5cbf625941e1 Mon Sep 17 00:00:00 2001
From: inkcherry
Date: Thu, 30 May 2024 10:23:14 +0000
Subject: [PATCH 2/3] remove print

---
 megatron/model/transformer.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py
index 53e2093683..e75f13a24f 100644
--- a/megatron/model/transformer.py
+++ b/megatron/model/transformer.py
@@ -401,10 +401,6 @@ def forward(self, q, k, v):
         ---------
             q, k, v: The tensor containing the query, key, and value. (B, S, H, D)
         """
-        # dump q, k, v tensor after by heads
-        heads_per_gpus = q.shape[2]
-        for hid in range(heads_per_gpus):
-            print(f"heads_{hid+mpu.get_sequence_parallel_rank()*heads_per_gpus} : {q[:,:,hid,:],k[:,:,hid,:],v[:,:,hid,:]}")
 
         assert all((i.dtype in [torch.float16, torch.bfloat16] for i in (q,k,v)))
         assert all((get_accelerator().on_accelerator(i) for i in (q, k, v)))

From d1ca6bb283b8247776c76c7a118f6847f656681e Mon Sep 17 00:00:00 2001
From: inkcherry
Date: Mon, 3 Jun 2024 05:46:12 +0000
Subject: [PATCH 3/3] fix format

---
 megatron/model/language_model.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/megatron/model/language_model.py b/megatron/model/language_model.py
index 38e33b4abe..c93f478e8e 100644
--- a/megatron/model/language_model.py
+++ b/megatron/model/language_model.py
@@ -544,9 +544,9 @@ def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask,
                 if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning:
                     rotary_pos_emb = self.rotary_pos_emb(args.curriculum_seqlen)
                 elif args.ds_sequence_parallel_size > 1:
-                  parallel_seq_len = self.seq_length / args.ds_sequence_parallel_size
-                  ds_sp_offset = mpu.get_sequence_parallel_rank() * parallel_seq_len
-                  rotary_pos_emb = self.rotary_pos_emb(parallel_seq_len, ds_sp_offset)
+                    parallel_seq_len = self.seq_length / args.ds_sequence_parallel_size
+                    ds_sp_offset = mpu.get_sequence_parallel_rank() * parallel_seq_len
+                    rotary_pos_emb = self.rotary_pos_emb(parallel_seq_len, ds_sp_offset)
                 else:
                     rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
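
The idea behind the language_model.py hunk: under DeepSpeed-Ulysses sequence parallelism each rank only holds seq_length // ds_sequence_parallel_size tokens, so its rotary table has to start at the rank's global token offset rather than at position 0. Below is a minimal, self-contained sketch of that per-rank offset arithmetic. The class, sp_size, and rank are illustrative stand-ins, not Megatron's actual RotaryEmbedding or mpu.get_sequence_parallel_rank(), and the sketch assumes the sequence length divides evenly, so it uses floor division (//) where the patch writes /.

# Minimal sketch: per-rank rotary tables under Ulysses-style sequence parallelism.
# RotaryEmbeddingSketch, sp_size, and rank are hypothetical stand-ins for the
# real Megatron/DeepSpeed objects.
import torch


class RotaryEmbeddingSketch(torch.nn.Module):
    def __init__(self, dim, base=10000.0):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)

    def forward(self, seq_len, offset=0):
        # Global positions covered by this chunk: [offset, offset + seq_len).
        positions = torch.arange(seq_len, dtype=torch.float32) + offset
        freqs = torch.outer(positions, self.inv_freq)  # [seq_len, dim // 2]
        return torch.cat((freqs, freqs), dim=-1)       # [seq_len, dim]


seq_length, sp_size, dim = 4096, 4, 128
rope = RotaryEmbeddingSketch(dim)

# Floor division assumed; seq_length is taken to be divisible by sp_size.
parallel_seq_len = seq_length // sp_size

full = rope(seq_length)  # the table a single rank would build without sequence parallelism
for rank in range(sp_size):
    ds_sp_offset = rank * parallel_seq_len
    local = rope(parallel_seq_len, ds_sp_offset)
    # Each rank's offset table must equal its slice of the full-sequence table;
    # otherwise ranks > 0 would rotate their tokens as if they started at position 0.
    assert torch.equal(local, full[ds_sp_offset:ds_sp_offset + parallel_seq_len])
print("per-rank rotary tables match the corresponding slices of the full table")

Passing the offset into the embedding also means each rank only materializes 1/sp_size of the rotary table instead of building the full table and slicing it.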