diff --git a/src/lc0ctl/leela2onnx.cc b/src/lc0ctl/leela2onnx.cc
index 107131ae06..5336aa08bf 100644
--- a/src/lc0ctl/leela2onnx.cc
+++ b/src/lc0ctl/leela2onnx.cc
@@ -53,6 +53,8 @@ const OptionId kOutputValue{
     "ONNX name to use for value head output node."};
 const OptionId kOutputMlh{"mlh-head-name", "MlhHeadName",
                           "ONNX name to use for the MLH head output node."};
+const OptionId kOnnxToPytorch{"onnx2pytorch", "Onnx2Pytorch",
+                              "Only use layer definitions supported by onnx2pytorch."};
 
 bool ProcessParameters(OptionsParser* options) {
   options->Add<StringOption>(kInputFilenameId);
@@ -63,6 +65,7 @@ bool ProcessParameters(OptionsParser* options) {
   options->Add<StringOption>(kOutputWdl) = "/output/wdl";
   options->Add<StringOption>(kOutputValue) = "/output/value";
   options->Add<StringOption>(kOutputMlh) = "/output/mlh";
+  options->Add<BoolOption>(kOnnxToPytorch) = false;
 
   if (!options->ProcessAllFlags()) return false;
   const OptionsDict& dict = options->GetOptionsDict();
@@ -94,6 +97,9 @@ void ConvertLeelaToOnnx() {
   onnx_options.output_wdl = dict.Get<std::string>(kOutputWdl);
   onnx_options.output_value = dict.Get<std::string>(kOutputValue);
   onnx_options.output_mlh = dict.Get<std::string>(kOutputMlh);
+  // onnx2pytorch only needs an alternative layer-norm implementation, so this
+  // currently only enables that. Might need to be extended in the future.
+  onnx_options.alternative_layer_normalization = dict.Get<bool>(kOnnxToPytorch);
 
   weights_file = ConvertWeightsToOnnx(weights_file, onnx_options);
 }
diff --git a/src/neural/onnx/builder.cc b/src/neural/onnx/builder.cc
index 7dcb04c19a..fb54f0f698 100644
--- a/src/neural/onnx/builder.cc
+++ b/src/neural/onnx/builder.cc
@@ -137,10 +137,12 @@ std::string OnnxBuilder::Conv(const std::string& name,
                               const std::string& input_name,
                               const OnnxConst& kernel_weights,
                               const OnnxConst& bias_weights, int pads) {
   auto* node = model_.mutable_graph()->add_node();
+  auto shape = kernel_weights.GetDimensions().back();
   auto out = PopulateStdNodeFields(node, name, input_name, "Conv");
   node->add_input(AddInitializer(name + "/w/kernel", kernel_weights));
   node->add_input(AddInitializer(name + "/w/bias", bias_weights));
   AddIntsAttribute(node, "pads", {pads, pads, pads, pads});
+  AddIntsAttribute(node, "kernel_shape", {shape, shape});
   return out;
 }
@@ -438,4 +440,40 @@ std::string OnnxBuilder::Mish(const std::string& name,
   return PopulateStdNodeFields(node, name, input, "Mish");
 }
 
+std::string OnnxBuilder::Sqrt(const std::string& name,
+                              const std::string& input) {
+  auto* node = model_.mutable_graph()->add_node();
+  return PopulateStdNodeFields(node, name, input, "Sqrt");
+}
+
+std::string OnnxBuilder::Reciprocal(const std::string& name,
+                                    const std::string& input) {
+  auto* node = model_.mutable_graph()->add_node();
+  return PopulateStdNodeFields(node, name, input, "Reciprocal");
+}
+
+std::string OnnxBuilder::Cast(const std::string& name, const std::string& input,
+                              pblczero::TensorProto::DataType type) {
+  auto* node = model_.mutable_graph()->add_node();
+  auto out = PopulateStdNodeFields(node, name, input, "Cast");
+  AddIntAttribute(node, "to", type);
+  return out;
+}
+
+std::string OnnxBuilder::ReduceMean(const std::string& name,
+                                    const std::string& input,
+                                    std::initializer_list<int> axes) {
+  auto* node = model_.mutable_graph()->add_node();
+  auto out = PopulateStdNodeFields(node, name, input, "ReduceMean");
+  if (opset_ < 18) {
+    AddIntsAttribute(node, "axes", axes);
+  } else {
+    node->add_input(AddInitializer(
+        name + "/axes",
+        Int64OnnxConst(std::vector<int64_t>(begin(axes), end(axes)),
+                       {static_cast<int>(axes.size())})));
+  }
+  return out;
+}
+
 }  // namespace lczero
diff --git a/src/neural/onnx/builder.h b/src/neural/onnx/builder.h
index c1f6c5e957..ba0d3705a5 100644
--- a/src/neural/onnx/builder.h
+++ b/src/neural/onnx/builder.h
@@ -65,7 +65,7 @@ class OnnxBuilder {
   std::string Add(const std::string& name, const std::string& input1,
                   const std::string& input2);
   std::string Add(const std::string& name, const std::string& input1,
-                  const OnnxConst&);
+                  const OnnxConst& input2);
   std::string GlobalAveragePool(const std::string& name,
                                 const std::string& input);
   std::string Squeeze(const std::string& name, const std::string& input,
@@ -120,6 +120,12 @@ class OnnxBuilder {
   std::string Where(const std::string& name, const std::string& input1,
                     const std::string& input2, const std::string& input3);
   std::string Mish(const std::string& name, const std::string& input);
+  std::string Sqrt(const std::string& name, const std::string& input);
+  std::string Reciprocal(const std::string& name, const std::string& input);
+  std::string Cast(const std::string& name, const std::string& input,
+                   pblczero::TensorProto::DataType type);
+  std::string ReduceMean(const std::string& name, const std::string& input,
+                         std::initializer_list<int> axes);
   // Returns ONNX model as protobuf.
   const pblczero::ModelProto& as_proto() const { return model_; }
   // Returns serialized model.
diff --git a/src/neural/onnx/converter.cc b/src/neural/onnx/converter.cc
index 59297b3ead..6c2869a81c 100644
--- a/src/neural/onnx/converter.cc
+++ b/src/neural/onnx/converter.cc
@@ -111,6 +111,11 @@ class Converter {
                           const std::string& encoder_in,
                           const std::string& name);
 
+  std::string MakeLayerNorm(OnnxBuilder* builder, const std::string& input,
+                            const std::string& name,
+                            const lczero::OnnxConst& gammas,
+                            const lczero::OnnxConst& betas, float eps = 1e-6);
+
   std::string MakeEncoderLayer(OnnxBuilder* builder,
                                const LegacyWeights::EncoderLayer& layer,
                                int embedding_size, int heads,
@@ -320,10 +325,10 @@ std::string Converter::MakeSmolgen(OnnxBuilder* builder,
       name + "/smolgen/dense1/b", flow,
       *GetWeghtsConverter(layer.mha.smolgen.dense1_b, {smolgen_hidden_sz}));
   flow = MakeActivation(builder, flow, name + "/smolgen/dense1", activation);
-  flow = builder->LayerNormalization(
-      name + "/smolgen/ln1", flow,
+  flow = MakeLayerNorm(
+      builder, flow, name + "/smolgen/ln1",
       *GetWeghtsConverter(layer.mha.smolgen.ln1_gammas, {smolgen_hidden_sz}),
-      *GetWeghtsConverter(layer.mha.smolgen.ln1_betas, {smolgen_hidden_sz}), 1,
+      *GetWeghtsConverter(layer.mha.smolgen.ln1_betas, {smolgen_hidden_sz}),
       1e-3);
   flow = builder->MatMul(
       name + "/smolgen/dense2/w", flow,
@@ -333,13 +338,12 @@ std::string Converter::MakeSmolgen(OnnxBuilder* builder,
       *GetWeghtsConverter(layer.mha.smolgen.dense2_b,
                           {smolgen_gen_sz * heads}));
   flow = MakeActivation(builder, flow, name + "/smolgen/dense2", activation);
-  flow = builder->LayerNormalization(
-      name + "/smolgen/ln2", flow,
-      *GetWeghtsConverter(layer.mha.smolgen.ln2_gammas,
-                          {smolgen_gen_sz * heads}),
-      *GetWeghtsConverter(layer.mha.smolgen.ln2_betas,
-                          {smolgen_gen_sz * heads}),
-      1, 1e-3);
+  flow = MakeLayerNorm(builder, flow, name + "/smolgen/ln2",
+                       *GetWeghtsConverter(layer.mha.smolgen.ln2_gammas,
+                                           {smolgen_gen_sz * heads}),
+                       *GetWeghtsConverter(layer.mha.smolgen.ln2_betas,
+                                           {smolgen_gen_sz * heads}),
+                       1e-3);
 
   flow = builder->Reshape(name + "/smolgen/gen_from/reshape", flow,
                           builder->AddInitializer(
@@ -354,6 +358,33 @@ std::string Converter::MakeSmolgen(OnnxBuilder* builder,
   return flow;
 }
 
+std::string Converter::MakeLayerNorm(OnnxBuilder* builder,
+                                     const std::string& input,
+                                     const std::string& name,
+                                     const lczero::OnnxConst& gammas,
+                                     const lczero::OnnxConst& betas,
+                                     float eps) {
+  if (!options_.alternative_layer_normalization) {
+    return builder->LayerNormalization(name, input, gammas, betas, 1, eps);
+  }
+  auto in =
+      builder->Cast(name + "/to_float", input, pblczero::TensorProto::FLOAT);
+  auto flow = builder->ReduceMean(name + "/mean", in, {1});
+  in = builder->Sub(name + "/centered", in, flow);
+  flow = builder->Mul(name + "/squared", in, in);
+  flow = builder->ReduceMean(name + "/var", flow, {1});
+  flow =
+      builder->Add(name + "/var_eps", flow,
+                   static_cast<const OnnxConst&>(FloatOnnxConst({eps}, {1})));
+  flow = builder->Sqrt(name + "/std", flow);
+  flow = builder->Reciprocal(name + "/inv_std", flow);
+  flow = builder->Mul(name + "/normalized", in, flow);
+  flow = builder->Cast(name + "/to_data_type", flow, GetDataType());
+  flow = builder->Mul(name + "/gammas", flow, gammas);
+  flow = builder->Add(name + "/betas", flow, betas);
+  return flow;
+}
+
 std::string Converter::MakeEncoderLayer(
     OnnxBuilder* builder, const LegacyWeights::EncoderLayer& layer,
     int embedding_size, int heads, const std::string& encoder_in,
@@ -430,11 +461,10 @@ std::string Converter::MakeEncoderLayer(
     alpha_in = encoder_in;
   }
   flow = builder->Add(name + "/mha/out/skip", flow, alpha_in);
-
-  auto ffn_in = builder->LayerNormalization(
-      name + "/ln1", flow,
-      *GetWeghtsConverter(layer.ln1_gammas, {embedding_size}),
-      *GetWeghtsConverter(layer.ln1_betas, {embedding_size}), 1);
+  auto ffn_in =
+      MakeLayerNorm(builder, flow, name + "/ln1",
+                    *GetWeghtsConverter(layer.ln1_gammas, {embedding_size}),
+                    *GetWeghtsConverter(layer.ln1_betas, {embedding_size}));
 
   const int dff_size = layer.ffn.dense1_b.size();
   flow = builder->MatMul(name + "/ffn/dense1/w", ffn_in,
@@ -462,10 +492,9 @@ std::string Converter::MakeEncoderLayer(
     alpha_ffn_in = ffn_in;
   }
   flow = builder->Add(name + "/ffn/skip", flow, alpha_ffn_in);
-  flow = builder->LayerNormalization(
-      name + "/ln2", flow,
-      *GetWeghtsConverter(layer.ln2_gammas, {embedding_size}),
-      *GetWeghtsConverter(layer.ln2_betas, {embedding_size}), 1);
+  flow = MakeLayerNorm(builder, flow, name + "/ln2",
+                       *GetWeghtsConverter(layer.ln2_gammas, {embedding_size}),
+                       *GetWeghtsConverter(layer.ln2_betas, {embedding_size}));
   return flow;
 }
 
diff --git a/src/neural/onnx/converter.h b/src/neural/onnx/converter.h
index b906aebfce..f397014324 100644
--- a/src/neural/onnx/converter.h
+++ b/src/neural/onnx/converter.h
@@ -44,6 +44,7 @@ struct WeightsToOnnxConverterOptions {
   int batch_size = -1;
   int opset = 17;
   bool alt_mish = false;
+  bool alternative_layer_normalization = false;
 };
 
 // Converts "classical" weights file to weights file with embedded ONNX model.
diff --git a/src/neural/onnx/network_onnx.cc b/src/neural/onnx/network_onnx.cc
index e5eb8e0443..ef3b0b33af 100644
--- a/src/neural/onnx/network_onnx.cc
+++ b/src/neural/onnx/network_onnx.cc
@@ -446,6 +446,8 @@ std::unique_ptr<Network> MakeOnnxNetwork(const std::optional<WeightsFile>& w,
     converter_options.opset = opts.GetOrDefault<int>("opset", 17);
     converter_options.alt_mish = opts.GetOrDefault<bool>(
        "alt_mish", kProvider == OnnxProvider::CPU ? true : false);
+    converter_options.alternative_layer_normalization =
+        opts.GetOrDefault<bool>("alternative_layer_normalization", true);
     converter_options.data_type_ =
        fp16 ? WeightsToOnnxConverterOptions::DataType::kFloat16
             : WeightsToOnnxConverterOptions::DataType::kFloat32;
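
Note (illustrative, not part of the patch): the alternative path in
Converter::MakeLayerNorm above lowers LayerNormalization into primitive ONNX
ops (Cast, ReduceMean, Sub, Mul, Add, Sqrt, Reciprocal), i.e. it computes the
standard y = gamma * (x - mean) / sqrt(var + eps) + beta, with the biased
(population) variance taken over the embedding axis. A minimal standalone C++
sketch of the same arithmetic, with hypothetical names and a toy check, may
help when verifying the decomposition numerically:

// Sketch only: mirrors the node sequence emitted by Converter::MakeLayerNorm
// when alternative_layer_normalization is set; names here are hypothetical.
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// gamma * (x - mean) / sqrt(var + eps) + beta over one row.
std::vector<float> AltLayerNorm(const std::vector<float>& x,
                                const std::vector<float>& gamma,
                                const std::vector<float>& beta,
                                float eps = 1e-6f) {
  // ReduceMean over the normalized axis.
  float mean = 0.0f;
  for (float v : x) mean += v;
  mean /= x.size();
  // Sub centers the input; Mul + ReduceMean give the biased variance.
  std::vector<float> centered(x.size());
  float var = 0.0f;
  for (std::size_t i = 0; i < x.size(); ++i) {
    centered[i] = x[i] - mean;
    var += centered[i] * centered[i];
  }
  var /= x.size();
  // Add eps, then Sqrt and Reciprocal: 1 / std.
  const float inv_std = 1.0f / std::sqrt(var + eps);
  // Mul by inv_std, then per-channel scale (Mul gamma) and shift (Add beta).
  std::vector<float> out(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    out[i] = centered[i] * inv_std * gamma[i] + beta[i];
  }
  return out;
}

int main() {
  const std::vector<float> x = {1.0f, 2.0f, 3.0f, 4.0f};
  const std::vector<float> ones = {1.0f, 1.0f, 1.0f, 1.0f};
  const std::vector<float> zeros = {0.0f, 0.0f, 0.0f, 0.0f};
  for (float v : AltLayerNorm(x, ones, zeros)) std::printf("%g ", v);
  std::printf("\n");  // expected: approx -1.342 -0.447 0.447 1.342
}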