-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathmodel_load_gpt2.cpp
196 lines (190 loc) · 13.1 KB
/
model_load_gpt2.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
// This file was generated by scripts/model_load.py
// DO NOT EDIT
#include <fcntl.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include "model.h"
#include "tensor.h"
// Maps "model/gpt2-weights.bin" read-only into memory and points every tensor
// of `m` at its fixed byte offset inside that mapping.
//
// On success, fills in m.mmap_data / m.mmap_siz, the model dimensions
// (embedding_dim = 768, context_len = 1024, ntokens = 50257), allocates
// m.h as an array of 12 TransformerBlock with new[], and returns true.
// Nothing in this function frees m.h or unmaps the file; that is left to
// whoever owns `m` (NOTE(review): confirm Model's teardown does both).
//
// On any failure (open, fstat, file too small, mmap) a diagnostic is written
// to stderr, `m` is left untouched, and false is returned.
bool load_gpt2_model(Model &m) {
  const char *fname = "model/gpt2-weights.bin";

  // One past the last byte referenced below: the final tensor
  // (h[11].mlp.c_proj_weight) starts at 0x1d1b3400 and holds
  // 768 * 3072 four-byte floats (0x900000 bytes).
  const off_t min_size = 0x1dab3400;

  int fd = open(fname, O_RDONLY);
  if (fd < 0) {
    perror(fname);
    return false;
  }

  struct stat sb;
  if (fstat(fd, &sb) != 0) {
    perror(fname);
    close(fd);
    return false;
  }

  // A short file would map fine but fault (SIGBUS) on first access to a
  // tensor past its end, so reject it up front.
  if (sb.st_size < min_size) {
    fprintf(stderr, "%s: file too small (%lld bytes, need at least %lld)\n",
            fname, (long long)sb.st_size, (long long)min_size);
    close(fd);
    return false;
  }

  void *map = mmap(NULL, (size_t)sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
  if (map == MAP_FAILED) {
    perror(fname);  // report before close() can disturb errno
    close(fd);
    return false;
  }
  // The mapping stays valid after the descriptor is closed.
  close(fd);

  char *data = (char *)map;

  m.mmap_data = data;
  m.mmap_siz = sb.st_size;
  m.embedding_dim = 768;
  m.context_len = 1024;
  m.ntokens = 50257;
  m.h = new TransformerBlock[12];

  // Embeddings and final layer norm.
  m.wte_weight = Tensorf<2>((float*)(data + 0x00000000), 50257, 768);
  m.wpe_weight = Tensorf<2>((float*)(data + 0x0933cc00), 1024, 768);
  m.ln_f.bias = Tensorf<1>((float*)(data + 0x0963cc00), 768);
  m.ln_f.weight = Tensorf<1>((float*)(data + 0x0963d800), 768);

  // Transformer block 0.
  m.h[0].ln_1.bias = Tensorf<1>((float*)(data + 0x0963e400), 768);
  m.h[0].ln_1.weight = Tensorf<1>((float*)(data + 0x0963f000), 768);
  m.h[0].attn.num_heads = 12;
  m.h[0].attn.c_attn_bias = Tensorf<1>((float*)(data + 0x0963fc00), 2304);
  m.h[0].attn.c_attn_weight = Tensorf<2>((float*)(data + 0x09642000), 2304, 768);
  m.h[0].attn.c_proj_bias = Tensorf<1>((float*)(data + 0x09d02000), 768);
  m.h[0].attn.c_proj_weight = Tensorf<2>((float*)(data + 0x09d02c00), 768, 768);
  m.h[0].ln_2.bias = Tensorf<1>((float*)(data + 0x09f42c00), 768);
  m.h[0].ln_2.weight = Tensorf<1>((float*)(data + 0x09f43800), 768);
  m.h[0].mlp.c_fc_bias = Tensorf<1>((float*)(data + 0x09f44400), 3072);
  m.h[0].mlp.c_fc_weight = Tensorf<2>((float*)(data + 0x09f47400), 3072, 768);
  m.h[0].mlp.c_proj_bias = Tensorf<1>((float*)(data + 0x0a847400), 768);
  m.h[0].mlp.c_proj_weight = Tensorf<2>((float*)(data + 0x0a848000), 768, 3072);

  // Transformer block 1.
  m.h[1].ln_1.bias = Tensorf<1>((float*)(data + 0x0b148000), 768);
  m.h[1].ln_1.weight = Tensorf<1>((float*)(data + 0x0b148c00), 768);
  m.h[1].attn.num_heads = 12;
  m.h[1].attn.c_attn_bias = Tensorf<1>((float*)(data + 0x0b149800), 2304);
  m.h[1].attn.c_attn_weight = Tensorf<2>((float*)(data + 0x0b14bc00), 2304, 768);
  m.h[1].attn.c_proj_bias = Tensorf<1>((float*)(data + 0x0b80bc00), 768);
  m.h[1].attn.c_proj_weight = Tensorf<2>((float*)(data + 0x0b80c800), 768, 768);
  m.h[1].ln_2.bias = Tensorf<1>((float*)(data + 0x0ba4c800), 768);
  m.h[1].ln_2.weight = Tensorf<1>((float*)(data + 0x0ba4d400), 768);
  m.h[1].mlp.c_fc_bias = Tensorf<1>((float*)(data + 0x0ba4e000), 3072);
  m.h[1].mlp.c_fc_weight = Tensorf<2>((float*)(data + 0x0ba51000), 3072, 768);
  m.h[1].mlp.c_proj_bias = Tensorf<1>((float*)(data + 0x0c351000), 768);
  m.h[1].mlp.c_proj_weight = Tensorf<2>((float*)(data + 0x0c351c00), 768, 3072);

  // Transformer block 2.
  m.h[2].ln_1.bias = Tensorf<1>((float*)(data + 0x0cc51c00), 768);
  m.h[2].ln_1.weight = Tensorf<1>((float*)(data + 0x0cc52800), 768);
  m.h[2].attn.num_heads = 12;
  m.h[2].attn.c_attn_bias = Tensorf<1>((float*)(data + 0x0cc53400), 2304);
  m.h[2].attn.c_attn_weight = Tensorf<2>((float*)(data + 0x0cc55800), 2304, 768);
  m.h[2].attn.c_proj_bias = Tensorf<1>((float*)(data + 0x0d315800), 768);
  m.h[2].attn.c_proj_weight = Tensorf<2>((float*)(data + 0x0d316400), 768, 768);
  m.h[2].ln_2.bias = Tensorf<1>((float*)(data + 0x0d556400), 768);
  m.h[2].ln_2.weight = Tensorf<1>((float*)(data + 0x0d557000), 768);
  m.h[2].mlp.c_fc_bias = Tensorf<1>((float*)(data + 0x0d557c00), 3072);
  m.h[2].mlp.c_fc_weight = Tensorf<2>((float*)(data + 0x0d55ac00), 3072, 768);
  m.h[2].mlp.c_proj_bias = Tensorf<1>((float*)(data + 0x0de5ac00), 768);
  m.h[2].mlp.c_proj_weight = Tensorf<2>((float*)(data + 0x0de5b800), 768, 3072);

  // Transformer block 3.
  m.h[3].ln_1.bias = Tensorf<1>((float*)(data + 0x0e75b800), 768);
  m.h[3].ln_1.weight = Tensorf<1>((float*)(data + 0x0e75c400), 768);
  m.h[3].attn.num_heads = 12;
  m.h[3].attn.c_attn_bias = Tensorf<1>((float*)(data + 0x0e75d000), 2304);
  m.h[3].attn.c_attn_weight = Tensorf<2>((float*)(data + 0x0e75f400), 2304, 768);
  m.h[3].attn.c_proj_bias = Tensorf<1>((float*)(data + 0x0ee1f400), 768);
  m.h[3].attn.c_proj_weight = Tensorf<2>((float*)(data + 0x0ee20000), 768, 768);
  m.h[3].ln_2.bias = Tensorf<1>((float*)(data + 0x0f060000), 768);
  m.h[3].ln_2.weight = Tensorf<1>((float*)(data + 0x0f060c00), 768);
  m.h[3].mlp.c_fc_bias = Tensorf<1>((float*)(data + 0x0f061800), 3072);
  m.h[3].mlp.c_fc_weight = Tensorf<2>((float*)(data + 0x0f064800), 3072, 768);
  m.h[3].mlp.c_proj_bias = Tensorf<1>((float*)(data + 0x0f964800), 768);
  m.h[3].mlp.c_proj_weight = Tensorf<2>((float*)(data + 0x0f965400), 768, 3072);

  // Transformer block 4.
  m.h[4].ln_1.bias = Tensorf<1>((float*)(data + 0x10265400), 768);
  m.h[4].ln_1.weight = Tensorf<1>((float*)(data + 0x10266000), 768);
  m.h[4].attn.num_heads = 12;
  m.h[4].attn.c_attn_bias = Tensorf<1>((float*)(data + 0x10266c00), 2304);
  m.h[4].attn.c_attn_weight = Tensorf<2>((float*)(data + 0x10269000), 2304, 768);
  m.h[4].attn.c_proj_bias = Tensorf<1>((float*)(data + 0x10929000), 768);
  m.h[4].attn.c_proj_weight = Tensorf<2>((float*)(data + 0x10929c00), 768, 768);
  m.h[4].ln_2.bias = Tensorf<1>((float*)(data + 0x10b69c00), 768);
  m.h[4].ln_2.weight = Tensorf<1>((float*)(data + 0x10b6a800), 768);
  m.h[4].mlp.c_fc_bias = Tensorf<1>((float*)(data + 0x10b6b400), 3072);
  m.h[4].mlp.c_fc_weight = Tensorf<2>((float*)(data + 0x10b6e400), 3072, 768);
  m.h[4].mlp.c_proj_bias = Tensorf<1>((float*)(data + 0x1146e400), 768);
  m.h[4].mlp.c_proj_weight = Tensorf<2>((float*)(data + 0x1146f000), 768, 3072);

  // Transformer block 5.
  m.h[5].ln_1.bias = Tensorf<1>((float*)(data + 0x11d6f000), 768);
  m.h[5].ln_1.weight = Tensorf<1>((float*)(data + 0x11d6fc00), 768);
  m.h[5].attn.num_heads = 12;
  m.h[5].attn.c_attn_bias = Tensorf<1>((float*)(data + 0x11d70800), 2304);
  m.h[5].attn.c_attn_weight = Tensorf<2>((float*)(data + 0x11d72c00), 2304, 768);
  m.h[5].attn.c_proj_bias = Tensorf<1>((float*)(data + 0x12432c00), 768);
  m.h[5].attn.c_proj_weight = Tensorf<2>((float*)(data + 0x12433800), 768, 768);
  m.h[5].ln_2.bias = Tensorf<1>((float*)(data + 0x12673800), 768);
  m.h[5].ln_2.weight = Tensorf<1>((float*)(data + 0x12674400), 768);
  m.h[5].mlp.c_fc_bias = Tensorf<1>((float*)(data + 0x12675000), 3072);
  m.h[5].mlp.c_fc_weight = Tensorf<2>((float*)(data + 0x12678000), 3072, 768);
  m.h[5].mlp.c_proj_bias = Tensorf<1>((float*)(data + 0x12f78000), 768);
  m.h[5].mlp.c_proj_weight = Tensorf<2>((float*)(data + 0x12f78c00), 768, 3072);

  // Transformer block 6.
  m.h[6].ln_1.bias = Tensorf<1>((float*)(data + 0x13878c00), 768);
  m.h[6].ln_1.weight = Tensorf<1>((float*)(data + 0x13879800), 768);
  m.h[6].attn.num_heads = 12;
  m.h[6].attn.c_attn_bias = Tensorf<1>((float*)(data + 0x1387a400), 2304);
  m.h[6].attn.c_attn_weight = Tensorf<2>((float*)(data + 0x1387c800), 2304, 768);
  m.h[6].attn.c_proj_bias = Tensorf<1>((float*)(data + 0x13f3c800), 768);
  m.h[6].attn.c_proj_weight = Tensorf<2>((float*)(data + 0x13f3d400), 768, 768);
  m.h[6].ln_2.bias = Tensorf<1>((float*)(data + 0x1417d400), 768);
  m.h[6].ln_2.weight = Tensorf<1>((float*)(data + 0x1417e000), 768);
  m.h[6].mlp.c_fc_bias = Tensorf<1>((float*)(data + 0x1417ec00), 3072);
  m.h[6].mlp.c_fc_weight = Tensorf<2>((float*)(data + 0x14181c00), 3072, 768);
  m.h[6].mlp.c_proj_bias = Tensorf<1>((float*)(data + 0x14a81c00), 768);
  m.h[6].mlp.c_proj_weight = Tensorf<2>((float*)(data + 0x14a82800), 768, 3072);

  // Transformer block 7.
  m.h[7].ln_1.bias = Tensorf<1>((float*)(data + 0x15382800), 768);
  m.h[7].ln_1.weight = Tensorf<1>((float*)(data + 0x15383400), 768);
  m.h[7].attn.num_heads = 12;
  m.h[7].attn.c_attn_bias = Tensorf<1>((float*)(data + 0x15384000), 2304);
  m.h[7].attn.c_attn_weight = Tensorf<2>((float*)(data + 0x15386400), 2304, 768);
  m.h[7].attn.c_proj_bias = Tensorf<1>((float*)(data + 0x15a46400), 768);
  m.h[7].attn.c_proj_weight = Tensorf<2>((float*)(data + 0x15a47000), 768, 768);
  m.h[7].ln_2.bias = Tensorf<1>((float*)(data + 0x15c87000), 768);
  m.h[7].ln_2.weight = Tensorf<1>((float*)(data + 0x15c87c00), 768);
  m.h[7].mlp.c_fc_bias = Tensorf<1>((float*)(data + 0x15c88800), 3072);
  m.h[7].mlp.c_fc_weight = Tensorf<2>((float*)(data + 0x15c8b800), 3072, 768);
  m.h[7].mlp.c_proj_bias = Tensorf<1>((float*)(data + 0x1658b800), 768);
  m.h[7].mlp.c_proj_weight = Tensorf<2>((float*)(data + 0x1658c400), 768, 3072);

  // Transformer block 8.
  m.h[8].ln_1.bias = Tensorf<1>((float*)(data + 0x16e8c400), 768);
  m.h[8].ln_1.weight = Tensorf<1>((float*)(data + 0x16e8d000), 768);
  m.h[8].attn.num_heads = 12;
  m.h[8].attn.c_attn_bias = Tensorf<1>((float*)(data + 0x16e8dc00), 2304);
  m.h[8].attn.c_attn_weight = Tensorf<2>((float*)(data + 0x16e90000), 2304, 768);
  m.h[8].attn.c_proj_bias = Tensorf<1>((float*)(data + 0x17550000), 768);
  m.h[8].attn.c_proj_weight = Tensorf<2>((float*)(data + 0x17550c00), 768, 768);
  m.h[8].ln_2.bias = Tensorf<1>((float*)(data + 0x17790c00), 768);
  m.h[8].ln_2.weight = Tensorf<1>((float*)(data + 0x17791800), 768);
  m.h[8].mlp.c_fc_bias = Tensorf<1>((float*)(data + 0x17792400), 3072);
  m.h[8].mlp.c_fc_weight = Tensorf<2>((float*)(data + 0x17795400), 3072, 768);
  m.h[8].mlp.c_proj_bias = Tensorf<1>((float*)(data + 0x18095400), 768);
  m.h[8].mlp.c_proj_weight = Tensorf<2>((float*)(data + 0x18096000), 768, 3072);

  // Transformer block 9.
  m.h[9].ln_1.bias = Tensorf<1>((float*)(data + 0x18996000), 768);
  m.h[9].ln_1.weight = Tensorf<1>((float*)(data + 0x18996c00), 768);
  m.h[9].attn.num_heads = 12;
  m.h[9].attn.c_attn_bias = Tensorf<1>((float*)(data + 0x18997800), 2304);
  m.h[9].attn.c_attn_weight = Tensorf<2>((float*)(data + 0x18999c00), 2304, 768);
  m.h[9].attn.c_proj_bias = Tensorf<1>((float*)(data + 0x19059c00), 768);
  m.h[9].attn.c_proj_weight = Tensorf<2>((float*)(data + 0x1905a800), 768, 768);
  m.h[9].ln_2.bias = Tensorf<1>((float*)(data + 0x1929a800), 768);
  m.h[9].ln_2.weight = Tensorf<1>((float*)(data + 0x1929b400), 768);
  m.h[9].mlp.c_fc_bias = Tensorf<1>((float*)(data + 0x1929c000), 3072);
  m.h[9].mlp.c_fc_weight = Tensorf<2>((float*)(data + 0x1929f000), 3072, 768);
  m.h[9].mlp.c_proj_bias = Tensorf<1>((float*)(data + 0x19b9f000), 768);
  m.h[9].mlp.c_proj_weight = Tensorf<2>((float*)(data + 0x19b9fc00), 768, 3072);

  // Transformer block 10.
  m.h[10].ln_1.bias = Tensorf<1>((float*)(data + 0x1a49fc00), 768);
  m.h[10].ln_1.weight = Tensorf<1>((float*)(data + 0x1a4a0800), 768);
  m.h[10].attn.num_heads = 12;
  m.h[10].attn.c_attn_bias = Tensorf<1>((float*)(data + 0x1a4a1400), 2304);
  m.h[10].attn.c_attn_weight = Tensorf<2>((float*)(data + 0x1a4a3800), 2304, 768);
  m.h[10].attn.c_proj_bias = Tensorf<1>((float*)(data + 0x1ab63800), 768);
  m.h[10].attn.c_proj_weight = Tensorf<2>((float*)(data + 0x1ab64400), 768, 768);
  m.h[10].ln_2.bias = Tensorf<1>((float*)(data + 0x1ada4400), 768);
  m.h[10].ln_2.weight = Tensorf<1>((float*)(data + 0x1ada5000), 768);
  m.h[10].mlp.c_fc_bias = Tensorf<1>((float*)(data + 0x1ada5c00), 3072);
  m.h[10].mlp.c_fc_weight = Tensorf<2>((float*)(data + 0x1ada8c00), 3072, 768);
  m.h[10].mlp.c_proj_bias = Tensorf<1>((float*)(data + 0x1b6a8c00), 768);
  m.h[10].mlp.c_proj_weight = Tensorf<2>((float*)(data + 0x1b6a9800), 768, 3072);

  // Transformer block 11.
  m.h[11].ln_1.bias = Tensorf<1>((float*)(data + 0x1bfa9800), 768);
  m.h[11].ln_1.weight = Tensorf<1>((float*)(data + 0x1bfaa400), 768);
  m.h[11].attn.num_heads = 12;
  m.h[11].attn.c_attn_bias = Tensorf<1>((float*)(data + 0x1bfab000), 2304);
  m.h[11].attn.c_attn_weight = Tensorf<2>((float*)(data + 0x1bfad400), 2304, 768);
  m.h[11].attn.c_proj_bias = Tensorf<1>((float*)(data + 0x1c66d400), 768);
  m.h[11].attn.c_proj_weight = Tensorf<2>((float*)(data + 0x1c66e000), 768, 768);
  m.h[11].ln_2.bias = Tensorf<1>((float*)(data + 0x1c8ae000), 768);
  m.h[11].ln_2.weight = Tensorf<1>((float*)(data + 0x1c8aec00), 768);
  m.h[11].mlp.c_fc_bias = Tensorf<1>((float*)(data + 0x1c8af800), 3072);
  m.h[11].mlp.c_fc_weight = Tensorf<2>((float*)(data + 0x1c8b2800), 3072, 768);
  m.h[11].mlp.c_proj_bias = Tensorf<1>((float*)(data + 0x1d1b2800), 768);
  m.h[11].mlp.c_proj_weight = Tensorf<2>((float*)(data + 0x1d1b3400), 768, 3072);

  return true;
}