# mixing_law_fitting.py
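"""
Fit data-mixing laws to per-task losses from data-mixture training runs
(Ye et al., 2024, "Data Mixing Laws"), and compute the Skill-It skills graph
from the single-domain runs.
"""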
import glob
import pickle
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit, minimize
from scipy.special import softmax
import fire
import logging
def load_weights_from_folder(folder_path, seed, t):
"""
Load weights array from a .pkl file in a folder for a specific iteration.
args:
- folder_path (str): Path to the directory containing .pkl files.
- seed (int): Seed value used in the file naming.
- t (int): Specific iteration number to load.
returns:
- Tuple: Contains iteration number and corresponding weights array.
"""
print(folder_path)
if seed is None:
# match the first file in the directory
loss_file_name = glob.glob(folder_path + f"test_seed_*_checkpoint-{t}.pkl")[0]
else:
loss_file_name = f"test_seed_{seed}_checkpoint-{t}.pkl"
with open(loss_file_name, "rb") as file:
loss = pickle.load(file)["task_loss"].to_numpy()
return loss
def load_weight_diff_from_folder(folder_path, seed, start_t, end_t):
"""
Load weights array from a .pkl file in a folder for a specific iteration.
Relevant for fitting dynamic mixing laws.
args:
- folder_path (str): Path to the directory containing .pkl files.
- seed (int): Seed value used in the file naming.
- t (int): Specific iteration number to load.
returns:
- Tuple: Contains iteration number and corresponding weights array.
"""
if seed is None:
# match the first file in the directory
start_loss_file_name = glob.glob(
folder_path + f"val_seed_*_checkpoint-{start_t}.pkl"
)[0]
end_loss_file_name = glob.glob(
folder_path + f"val_seed_*_checkpoint-{end_t}.pkl"
)[0]
else:
start_loss_file_name = folder_path + f"val_seed_{seed}_checkpoint-{start_t}.pkl"
end_loss_file_name = folder_path + f"val_seed_{seed}_checkpoint-{end_t}.pkl"
with open(start_loss_file_name, "rb") as file:
start_loss = pickle.load(file)["task_loss"].to_numpy()
with open(end_loss_file_name, "rb") as file:
end_loss = pickle.load(file)["task_loss"].to_numpy()
return end_loss - start_loss
def collate(dir, seed, max_t, step, save=False):
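    """
    Load the per-task losses at every `step`-th checkpoint up to `max_t` and collate
    them into a DataFrame with columns [t, seed, l0, l1, ...]; optionally save to CSV.
    """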
# ts, losses = load_weights_from_folder(dir, seed, max_t, step)
ts = []
losses = []
for t in range(0, max_t, step):
loss = load_weights_from_folder(dir, seed, t)
ts.append([t, seed])
losses.append(loss)
ts = np.array(ts)
losses = np.array(losses)
data = np.hstack((ts, losses))
df = pd.DataFrame(
data, columns=["t", "seed"] + [f"l{i}" for i in range(losses.shape[1])]
)
if save:
df.to_csv(dir + "collated.csv")
return df
def diagonal_model(x, *params):
# Assuming a, b, m are vectors
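    # Diagonal variant: each proportion only affects its own exponential term,
    # i.e. no cross-domain interaction terms (m is a vector rather than a matrix).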
n_params = len(params) // 3 # Number of elements in each vector (a, b, m)
a = np.array(params[:n_params])
b = np.array(params[n_params : 2 * n_params])
m = np.array(params[2 * n_params :])
result = a * np.exp(x * m) + b
# Ensure the result is a 1D array of floats
return result.flatten()
def model(x, *params):
"""
The mixing law assumed by Ye et al. (2024) "Data Mixing Laws."
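    Each per-task loss is modeled as L_k(r) = b_k + a_k * exp(sum_j r_j * m_{jk}),
    where r is the vector of domain mixture proportions.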
"""
n_params = x.shape[-1]
a = np.array(params[:n_params])
b = np.array(params[n_params : 2 * n_params])
m = np.array(params[2 * n_params :]).reshape(n_params, -1)
result = a * np.exp(x @ m) + b
return result.flatten()
def linear_model(x, *params):
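    # Baseline linear mixing law: each per-task loss is a linear combination of the proportions.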
n_params = x.shape[-1]
a = np.array(params).reshape(n_params, -1)
result = x @ a
return result.flatten()
def fit_model(x, y, model, initial_guess):
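    """
    Fit `model` to (x, y) with non-linear least squares.

    returns:
    - popt: fitted parameters
    - pcov: parameter covariance estimate from curve_fit
    - mse: mean squared error of the fit
    - y_pred, y: predicted and actual losses
    """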
# Use curve_fit to find the best-fitting parameters
popt, pcov = curve_fit(model, x, y.flatten(), p0=initial_guess, maxfev=100000)
# Calculate the predicted values using the optimized parameters
y_pred = model(x, *popt)
# Calculate the mean squared error
mse = np.mean((y.flatten() - y_pred) ** 2)
return popt, pcov, mse, y_pred, y
def linreg(run_dir, proportions_file, min_t=0, max_t=5000, loss_diff=True):
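    """
    Fit a mixing law to the per-task losses of the runs listed in proportions_file,
    report the fit quality (MSE, R^2), and solve for the mixture proportions that
    minimize the predicted average loss.
    """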
np.set_printoptions(precision=3, suppress=True)
logging.getLogger().setLevel(logging.INFO)
# read lines in proportions file
with open(proportions_file, "r") as f:
lines = f.readlines()
# remove commas
lines = [line.strip().split(",") for line in lines]
xs = []
ys = []
for proportions in lines:
proportion_str = "".join(proportions)
proportions = np.array([float(p) for p in proportions])
dir_regex = (
run_dir
+ "slimpj_pythia-160m_from_scratch_40000_mixture_arxiv_book_c4_cc_github_stackexchange_wikipedia_weights_"
+ proportion_str[:6]
+ "*/"
)
files = glob.glob(dir_regex)
if len(files) == 0:
logging.info(f"Directory {dir_regex} not found.")
continue
current_run_dir = files[0]
        if loss_diff:
            loss = load_weight_diff_from_folder(
                current_run_dir, seed=None, start_t=min_t, end_t=max_t
            )
        else:
            loss = load_weights_from_folder(current_run_dir, None, max_t)
xs.append(proportions)
ys.append(loss)
xs = np.vstack(xs)
ys = np.vstack(ys)
# get min index of ys
min_index = np.argmin(ys.mean(axis=1))
lowest_played_sample = xs[min_index]
print(
f"Lowest played sample: {lowest_played_sample}. Average loss: {ys.mean(axis=1)[min_index]}"
)
n_params = xs.shape[1]
# log_linear
popt, pcov, mse, y_pred, y = fit_model(
xs, ys, model, ([1] * n_params + [1] * n_params + [-1] * n_params**2)
)
# uncomment below to try other models
# diagonal
# popt, pcov, mse, y_pred, y = fit_model(xs, ys, diagonal_model, [1] * n_params + [1] * n_params + [-1] * n_params)
# linear
# popt, pcov, mse, y_pred, y = fit_model(xs, ys, linear_model, [1] * n_params ** 2)
print("MSE: ", mse)
# r^2
ss_res = np.sum((y.flatten() - y_pred) ** 2)
ss_tot = np.sum((y.flatten() - np.mean(y.flatten())) ** 2)
r2 = 1 - (ss_res / ss_tot)
print("R^2: ", r2)
# After fitting the model
print(f"Actual minimum loss from samples: {np.min(ys.mean(axis=1))}")
print(
f"Predicted loss for sample with minimum actual loss: {np.mean(model(lowest_played_sample, *popt))}"
)
def objective(x):
return np.average(model(x, *popt))
def transform_params(y):
# Transform unconstrained parameters to sum to 1
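        # (softmax keeps the proportions nonnegative and summing to 1, so an
        # unconstrained optimizer like Nelder-Mead can be used directly)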
return softmax(y)
def objective_transformed(y):
# Apply the transformation before calculating the objective
x = transform_params(y)
return objective(x)
# Initial guess (unconstrained)
y0 = np.array([0.0] * xs.shape[1])
# Use Nelder-Mead method for optimization
res = minimize(
objective_transformed, y0, method="Nelder-Mead", options={"maxiter": 10000}
)
if res.success:
logging.info("Optimization successful.")
params = transform_params(res.x)
print(params)
# print predicted loss
print(np.mean(model(params, *popt)))
else:
logging.info("Optimization failed.")
logging.info("Message: %s", res.message)
ITOMETHOD = {
0: "1000000",
1: "0100000",
2: "0010000",
3: "0001000",
4: "0000100",
5: "0000010",
6: "0000001",
}
SLICE_LIST = ["arxiv", "book", "cc", "c4", "github", "stackexchange", "wikipedia"]
def skills_graph(run_dir, seed, start=0, end=2857):
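    """
    Compute the Skill-It skills graph from the single-domain runs: A[i, j] is the
    relative reduction in loss on eval slice j between checkpoints `start` and `end`
    when training only on domain i.
    """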
dirstring = "slimpj_pythia-160m_from_scratch_{}_mixture_arxiv_book_c4_cc_github_stackexchange_wikipedia_weights_{}_static_lr_0.0005_linear_warmup_cosine"
A = np.zeros((len(SLICE_LIST), len(SLICE_LIST)))
for i, skill_i in enumerate(SLICE_LIST):
        # run directories must end with "/" so the checkpoint filename can be appended
        thedir = run_dir + dirstring.format(end, ITOMETHOD[i]) + "/"
start_loss = load_weights_from_folder(thedir, seed, start)
end_loss = load_weights_from_folder(thedir, seed, end)
A[i] = (start_loss - end_loss) / start_loss
print(A)
np.save(run_dir + f"skills_graph_{seed}.npy", A)
if __name__ == "__main__":
"""
linreg fits a linear regression to the data mixes, according to Ye et al. (2024) "Data Mixing Laws: Optimizing Data Mixtures by Predicting Language Modeling Performance."
skills_graph computes the skill-it skills graph after performing all the independent runs.
"""
fire.Fire({"linreg": linreg, "skills_graph": skills_graph})